1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm_fault.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *
62  *	Page fault handling module.
63  */
64 
65 #include <libkern/OSAtomic.h>
66 
67 #include <mach/mach_types.h>
68 #include <mach/kern_return.h>
69 #include <mach/message.h>       /* for error codes */
70 #include <mach/vm_param.h>
71 #include <mach/vm_behavior.h>
72 #include <mach/memory_object.h>
73 /* For memory_object_data_{request,unlock} */
74 #include <mach/sdt.h>
75 
76 #include <kern/kern_types.h>
77 #include <kern/host_statistics.h>
78 #include <kern/counter.h>
79 #include <kern/task.h>
80 #include <kern/thread.h>
81 #include <kern/sched_prim.h>
82 #include <kern/host.h>
83 #include <kern/mach_param.h>
84 #include <kern/macro_help.h>
85 #include <kern/zalloc_internal.h>
86 #include <kern/misc_protos.h>
87 #include <kern/policy_internal.h>
88 
89 #include <vm/vm_compressor.h>
90 #include <vm/vm_compressor_pager.h>
91 #include <vm/vm_fault.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_kern.h>
96 #include <vm/pmap.h>
97 #include <vm/vm_pageout.h>
98 #include <vm/vm_protos.h>
99 #include <vm/vm_external.h>
100 #include <vm/memory_object.h>
101 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
102 #include <vm/vm_shared_region.h>
103 
104 #include <sys/codesign.h>
105 #include <sys/reason.h>
106 #include <sys/signalvar.h>
107 
108 #include <sys/kdebug_triage.h>
109 
110 #include <san/kasan.h>
111 #include <libkern/coreanalytics/coreanalytics.h>
112 
113 #define VM_FAULT_CLASSIFY       0
114 
115 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
116 
117 int vm_protect_privileged_from_untrusted = 1;
118 
119 unsigned int    vm_object_pagein_throttle = 16;
120 
121 /*
122  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control; it
123  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
124  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
125  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
126  * keep the UI active so that the user has a chance to kill the offending task before the system
127  * completely hangs.
128  *
129  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
130  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
131  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
132  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
133  */
134 
135 extern void throttle_lowpri_io(int);
136 
137 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
138 
139 uint64_t vm_hard_throttle_threshold;
140 
141 #if DEBUG || DEVELOPMENT
142 static bool vmtc_panic_instead = false;
143 int panic_object_not_alive = 1;
144 #endif /* DEBUG || DEVELOPMENT */
145 
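/*
 * Returns TRUE when the faulting thread should be hard throttled: either the
 * VM system wants the current task throttled outright, or free pages have
 * dropped below vm_page_throttle_limit (or the hard throttle limit has been
 * reached) while the thread's effective I/O policy is already throttled.
 */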
146 OS_ALWAYS_INLINE
147 boolean_t
148 NEED_TO_HARD_THROTTLE_THIS_TASK(void)
149 {
150 	return vm_wants_task_throttled(current_task()) ||
151 	       ((vm_page_free_count < vm_page_throttle_limit ||
152 	       HARD_THROTTLE_LIMIT_REACHED()) &&
153 	       proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
154 }
155 
156 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
157 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
158 
159 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
160 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
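/*
 * With these defaults a thread has to create more than 6 * 20000 == 120000
 * new pages before vm_page_throttled() will even consider delaying it, and
 * only then while free memory is tight enough to matter.
 */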
161 
162 
163 #define VM_STAT_DECOMPRESSIONS()        \
164 MACRO_BEGIN                             \
165 	counter_inc(&vm_statistics_decompressions); \
166 	current_thread()->decompressions++; \
167 MACRO_END
168 
169 boolean_t current_thread_aborted(void);
170 
171 /* Forward declarations of internal routines. */
172 static kern_return_t vm_fault_wire_fast(
173 	vm_map_t        map,
174 	vm_map_offset_t va,
175 	vm_prot_t       prot,
176 	vm_tag_t        wire_tag,
177 	vm_map_entry_t  entry,
178 	pmap_t          pmap,
179 	vm_map_offset_t pmap_addr,
180 	ppnum_t         *physpage_p);
181 
182 static kern_return_t vm_fault_internal(
183 	vm_map_t        map,
184 	vm_map_offset_t vaddr,
185 	vm_prot_t       caller_prot,
186 	boolean_t       change_wiring,
187 	vm_tag_t        wire_tag,
188 	int             interruptible,
189 	pmap_t          pmap,
190 	vm_map_offset_t pmap_addr,
191 	ppnum_t         *physpage_p);
192 
193 static void vm_fault_copy_cleanup(
194 	vm_page_t       page,
195 	vm_page_t       top_page);
196 
197 static void vm_fault_copy_dst_cleanup(
198 	vm_page_t       page);
199 
200 #if     VM_FAULT_CLASSIFY
201 extern void vm_fault_classify(vm_object_t       object,
202     vm_object_offset_t    offset,
203     vm_prot_t             fault_type);
204 
205 extern void vm_fault_classify_init(void);
206 #endif
207 
208 unsigned long vm_pmap_enter_blocked = 0;
209 unsigned long vm_pmap_enter_retried = 0;
210 
211 unsigned long vm_cs_validates = 0;
212 unsigned long vm_cs_revalidates = 0;
213 unsigned long vm_cs_query_modified = 0;
214 unsigned long vm_cs_validated_dirtied = 0;
215 unsigned long vm_cs_bitmap_validated = 0;
216 
217 void vm_pre_fault(vm_map_offset_t, vm_prot_t);
218 
219 extern char *kdp_compressor_decompressed_page;
220 extern addr64_t kdp_compressor_decompressed_page_paddr;
221 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
222 
223 struct vmrtfr {
224 	int vmrtfr_maxi;
225 	int vmrtfr_curi;
226 	int64_t vmrtf_total;
227 	vm_rtfault_record_t *vm_rtf_records;
228 } vmrtfrs;
229 #define VMRTF_DEFAULT_BUFSIZE (4096)
230 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
231 TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);
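/*
 * The retrospective-fault ring defaults to one 4KB buffer's worth of
 * vm_rtfault_record_t entries; the "vm_rtfault_records" boot-arg overrides
 * the record count.
 */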
232 
233 static void vm_rtfrecord_lock(void);
234 static void vm_rtfrecord_unlock(void);
235 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
236 
237 extern lck_grp_t vm_page_lck_grp_bucket;
238 extern lck_attr_t vm_page_lck_attr;
239 LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
240 
241 #if DEVELOPMENT || DEBUG
242 extern int madvise_free_debug;
243 #endif /* DEVELOPMENT || DEBUG */
244 
245 #if CONFIG_FREEZE
246 #endif /* CONFIG_FREEZE */
247 
248 /*
249  *	Routine:	vm_fault_init
250  *	Purpose:
251  *		Initialize our private data structures.
252  */
253 __startup_func
254 void
255 vm_fault_init(void)
256 {
257 	int i, vm_compressor_temp;
258 	boolean_t need_default_val = TRUE;
259 	/*
260 	 * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
261 	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
262 	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
263 	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
264 	 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
265 	 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
266 	 */
267 
268 	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
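	/*
	 * For example (illustrative numbers only): with 16GB of ram the MIN()
	 * term is 16, so the threshold is (35 - 16) == 19% of sane_size, roughly
	 * 3GB; at 25GB of ram or more the MIN() clamps at 25 and the threshold
	 * bottoms out at 10% of sane_size.
	 */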
269 
270 	/*
271 	 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
272 	 */
273 
274 	if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
275 		for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
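			/*
			 * accept the boot-arg only if no bits other than this single
			 * supported mode bit are set
			 */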
276 			if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
277 				need_default_val = FALSE;
278 				vm_compressor_mode = vm_compressor_temp;
279 				break;
280 			}
281 		}
282 		if (need_default_val) {
283 			printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
284 		}
285 	}
286 #if CONFIG_FREEZE
287 	if (need_default_val) {
288 		if (osenvironment_is_diagnostics()) {
289 			printf("osenvironment == \"diagnostics\". Setting \"vm_compressor_mode\" to in-core compressor only\n");
290 			vm_compressor_mode = VM_PAGER_COMPRESSOR_NO_SWAP;
291 			need_default_val = false;
292 		}
293 	}
294 #endif /* CONFIG_FREEZE */
295 	if (need_default_val) {
296 		/* If no boot arg or incorrect boot arg, try device tree. */
297 		PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
298 	}
299 	printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
300 	vm_config_init();
301 
302 	PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
303 	    &vm_protect_privileged_from_untrusted,
304 	    sizeof(vm_protect_privileged_from_untrusted));
305 
306 #if DEBUG || DEVELOPMENT
307 	(void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));
308 
309 	if (kern_feature_override(KF_MADVISE_FREE_DEBUG_OVRD)) {
310 		madvise_free_debug = 0;
311 	}
312 
313 	PE_parse_boot_argn("panic_object_not_alive", &panic_object_not_alive, sizeof(panic_object_not_alive));
314 #endif /* DEBUG || DEVELOPMENT */
315 }
316 
317 __startup_func
318 static void
319 vm_rtfault_record_init(void)
320 {
321 	size_t size;
322 
323 	vmrtf_num_records = MAX(vmrtf_num_records, 1);
324 	size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
325 	vmrtfrs.vm_rtf_records = zalloc_permanent_tag(size,
326 	    ZALIGN(vm_rtfault_record_t), VM_KERN_MEMORY_DIAG);
327 	vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
328 }
329 STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
330 
331 /*
332  *	Routine:	vm_fault_cleanup
333  *	Purpose:
334  *		Clean up the result of vm_fault_page.
335  *	Results:
336  *		The paging reference for "object" is released.
337  *		"object" is unlocked.
338  *		If "top_page" is not null,  "top_page" is
339  *		freed and the paging reference for the object
340  *		containing it is released.
341  *
342  *	In/out conditions:
343  *		"object" must be locked.
344  */
345 void
346 vm_fault_cleanup(
347 	vm_object_t     object,
348 	vm_page_t       top_page)
349 {
350 	vm_object_paging_end(object);
351 	vm_object_unlock(object);
352 
353 	if (top_page != VM_PAGE_NULL) {
354 		object = VM_PAGE_OBJECT(top_page);
355 
356 		vm_object_lock(object);
357 		VM_PAGE_FREE(top_page);
358 		vm_object_paging_end(object);
359 		vm_object_unlock(object);
360 	}
361 }
362 
363 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
364 
365 
366 boolean_t       vm_page_deactivate_behind = TRUE;
367 /*
368  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
369  */
370 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
371 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
372                                                                 /* we use it to size an array on the stack */
373 
374 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
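/*
 * With the defaults above, once an object with default behavior shows a
 * sequential run of at least 128 pages (the window), vm_fault_deactivate_behind()
 * deactivates a trailing cluster of 16 pages on every 16th page faulted in.
 */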
375 
376 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
377 
378 /*
379  * vm_fault_is_sequential
380  *
381  * Determine if sequential access is in progress
382  * in accordance with the behavior specified.
383  * Update state to indicate current access pattern.
384  *
385  * object must have at least the shared lock held
386  */
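/*
 * object->sequential encodes the length of the current run in bytes: it grows
 * by PAGE_SIZE for each forward sequential fault, shrinks by PAGE_SIZE for
 * each reverse sequential fault, is clamped to +/-MAX_SEQUENTIAL_RUN and is
 * reset to 0 whenever the pattern is broken.
 */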
387 static
388 void
389 vm_fault_is_sequential(
390 	vm_object_t             object,
391 	vm_object_offset_t      offset,
392 	vm_behavior_t           behavior)
393 {
394 	vm_object_offset_t      last_alloc;
395 	int                     sequential;
396 	int                     orig_sequential;
397 
398 	last_alloc = object->last_alloc;
399 	sequential = object->sequential;
400 	orig_sequential = sequential;
401 
402 	offset = vm_object_trunc_page(offset);
403 	if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
404 		/* re-faulting in the same page: no change in behavior */
405 		return;
406 	}
407 
408 	switch (behavior) {
409 	case VM_BEHAVIOR_RANDOM:
410 		/*
411 		 * reset indicator of sequential behavior
412 		 */
413 		sequential = 0;
414 		break;
415 
416 	case VM_BEHAVIOR_SEQUENTIAL:
417 		if (offset && last_alloc == offset - PAGE_SIZE_64) {
418 			/*
419 			 * advance indicator of sequential behavior
420 			 */
421 			if (sequential < MAX_SEQUENTIAL_RUN) {
422 				sequential += PAGE_SIZE;
423 			}
424 		} else {
425 			/*
426 			 * reset indicator of sequential behavior
427 			 */
428 			sequential = 0;
429 		}
430 		break;
431 
432 	case VM_BEHAVIOR_RSEQNTL:
433 		if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
434 			/*
435 			 * advance indicator of sequential behavior
436 			 */
437 			if (sequential > -MAX_SEQUENTIAL_RUN) {
438 				sequential -= PAGE_SIZE;
439 			}
440 		} else {
441 			/*
442 			 * reset indicator of sequential behavior
443 			 */
444 			sequential = 0;
445 		}
446 		break;
447 
448 	case VM_BEHAVIOR_DEFAULT:
449 	default:
450 		if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
451 			/*
452 			 * advance indicator of sequential behavior
453 			 */
454 			if (sequential < 0) {
455 				sequential = 0;
456 			}
457 			if (sequential < MAX_SEQUENTIAL_RUN) {
458 				sequential += PAGE_SIZE;
459 			}
460 		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
461 			/*
462 			 * advance indicator of sequential behavior
463 			 */
464 			if (sequential > 0) {
465 				sequential = 0;
466 			}
467 			if (sequential > -MAX_SEQUENTIAL_RUN) {
468 				sequential -= PAGE_SIZE;
469 			}
470 		} else {
471 			/*
472 			 * reset indicator of sequential behavior
473 			 */
474 			sequential = 0;
475 		}
476 		break;
477 	}
478 	if (sequential != orig_sequential) {
479 		if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
480 			/*
481 			 * if someone else has already updated object->sequential
482 			 * don't bother trying to update it or object->last_alloc
483 			 */
484 			return;
485 		}
486 	}
487 	/*
488 	 * I'd like to do this with an OSCompareAndSwap64, but that
489 	 * doesn't exist for PPC...  however, it shouldn't matter
490 	 * that much... last_alloc is maintained so that we can determine
491 	 * if a sequential access pattern is taking place... if only
492 	 * one thread is banging on this object, no problem with the unprotected
493 	 * update... if 2 or more threads are banging away, we run the risk of
494 	 * someone seeing a mangled update... however, in the face of multiple
495 	 * accesses, no sequential access pattern can develop anyway, so we
496 	 * haven't lost any real info.
497 	 */
498 	object->last_alloc = offset;
499 }
500 
501 #if DEVELOPMENT || DEBUG
502 uint64_t vm_page_deactivate_behind_count = 0;
503 #endif /* DEVELOPMENT || DEBUG */
504 
505 /*
506  * vm_fault_deactivate_behind
507  *
508  * Determine if sequential access is in progress
509  * in accordance with the behavior specified.  If
510  * so, compute a potential page to deactivate and
511  * deactivate it.
512  *
513  * object must be locked.
514  *
515  * return TRUE if we actually deactivate a page
516  */
517 static
518 boolean_t
519 vm_fault_deactivate_behind(
520 	vm_object_t             object,
521 	vm_object_offset_t      offset,
522 	vm_behavior_t           behavior)
523 {
524 	int             n;
525 	int             pages_in_run = 0;
526 	int             max_pages_in_run = 0;
527 	int             sequential_run;
528 	int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
529 	vm_object_offset_t      run_offset = 0;
530 	vm_object_offset_t      pg_offset = 0;
531 	vm_page_t       m;
532 	vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
533 
534 	pages_in_run = 0;
535 #if TRACEFAULTPAGE
536 	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
537 #endif
538 	if (object == kernel_object || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) {
539 		/*
540 		 * Do not deactivate pages from the kernel object: they
541 		 * are not intended to become pageable.
542 		 * or we've disabled the deactivate behind mechanism
543 		 * or we are dealing with an offset that is not aligned to
544 		 * the system's PAGE_SIZE because in that case we will
545 		 * handle the deactivation on the aligned offset and, thus,
546 		 * the full PAGE_SIZE page once. This helps us avoid the redundant
547 		 * deactivates and the extra faults.
548 		 */
549 		return FALSE;
550 	}
551 	if ((sequential_run = object->sequential)) {
552 		if (sequential_run < 0) {
553 			sequential_behavior = VM_BEHAVIOR_RSEQNTL;
554 			sequential_run = 0 - sequential_run;
555 		} else {
556 			sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
557 		}
558 	}
559 	switch (behavior) {
560 	case VM_BEHAVIOR_RANDOM:
561 		break;
562 	case VM_BEHAVIOR_SEQUENTIAL:
563 		if (sequential_run >= (int)PAGE_SIZE) {
564 			run_offset = 0 - PAGE_SIZE_64;
565 			max_pages_in_run = 1;
566 		}
567 		break;
568 	case VM_BEHAVIOR_RSEQNTL:
569 		if (sequential_run >= (int)PAGE_SIZE) {
570 			run_offset = PAGE_SIZE_64;
571 			max_pages_in_run = 1;
572 		}
573 		break;
574 	case VM_BEHAVIOR_DEFAULT:
575 	default:
576 	{       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
577 
578 		/*
579 		 * determine if the run of sequential access has been
580 		 * long enough on an object with default access behavior
581 		 * to consider it for deactivation
582 		 */
583 		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
584 			/*
585 			 * the comparisons between offset and behind are done
586 			 * in this kind of odd fashion in order to prevent wrap around
587 			 * at the end points
588 			 */
589 			if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
590 				if (offset >= behind) {
591 					run_offset = 0 - behind;
592 					pg_offset = PAGE_SIZE_64;
593 					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
594 				}
595 			} else {
596 				if (offset < -behind) {
597 					run_offset = behind;
598 					pg_offset = 0 - PAGE_SIZE_64;
599 					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
600 				}
601 			}
602 		}
603 		break;}
604 	}
605 	for (n = 0; n < max_pages_in_run; n++) {
606 		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
607 
608 		if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
609 			page_run[pages_in_run++] = m;
610 
611 			/*
612 			 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
613 			 *
614 			 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
615 			 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
616 			 * new reference happens. If no further references happen on the page after that remote TLB flushes
617 			 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
618 			 * by pageout_scan, which is just fine since the last reference would have happened quite far
619 			 * in the past (TLB caches don't hang around for very long), and of course could just as easily
620 			 * have happened before we did the deactivate_behind.
621 			 */
622 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
623 		}
624 	}
625 	if (pages_in_run) {
626 		vm_page_lockspin_queues();
627 
628 		for (n = 0; n < pages_in_run; n++) {
629 			m = page_run[n];
630 
631 			vm_page_deactivate_internal(m, FALSE);
632 
633 #if DEVELOPMENT || DEBUG
634 			vm_page_deactivate_behind_count++;
635 #endif /* DEVELOPMENT || DEBUG */
636 
637 #if TRACEFAULTPAGE
638 			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
639 #endif
640 		}
641 		vm_page_unlock_queues();
642 
643 		return TRUE;
644 	}
645 	return FALSE;
646 }
647 
648 
649 #if (DEVELOPMENT || DEBUG)
650 uint32_t        vm_page_creation_throttled_hard = 0;
651 uint32_t        vm_page_creation_throttled_soft = 0;
652 uint64_t        vm_page_creation_throttle_avoided = 0;
653 #endif /* DEVELOPMENT || DEBUG */
654 
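/*
 * Decide whether the current thread should be delayed for creating new pages
 * (e.g. zero-fill pages) too quickly.  Returns 0 if no throttling is needed,
 * otherwise the number of microseconds the caller should delay
 * (HARD_THROTTLE_DELAY or SOFT_THROTTLE_DELAY).
 */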
655 static int
656 vm_page_throttled(boolean_t page_kept)
657 {
658 	clock_sec_t     elapsed_sec;
659 	clock_sec_t     tv_sec;
660 	clock_usec_t    tv_usec;
661 	task_t          curtask = current_task_early();
662 
663 	thread_t thread = current_thread();
664 
665 	if (thread->options & TH_OPT_VMPRIV) {
666 		return 0;
667 	}
668 
669 	if (curtask && !curtask->active) {
670 		return 0;
671 	}
672 
673 	if (thread->t_page_creation_throttled) {
674 		thread->t_page_creation_throttled = 0;
675 
676 		if (page_kept == FALSE) {
677 			goto no_throttle;
678 		}
679 	}
680 	if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
681 #if (DEVELOPMENT || DEBUG)
682 		thread->t_page_creation_throttled_hard++;
683 		OSAddAtomic(1, &vm_page_creation_throttled_hard);
684 #endif /* DEVELOPMENT || DEBUG */
685 		return HARD_THROTTLE_DELAY;
686 	}
687 
688 	if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
689 	    thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
690 		if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
691 #if (DEVELOPMENT || DEBUG)
692 			OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
693 #endif
694 			goto no_throttle;
695 		}
696 		clock_get_system_microtime(&tv_sec, &tv_usec);
697 
698 		elapsed_sec = tv_sec - thread->t_page_creation_time;
699 
700 		if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
701 		    (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
702 			if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
703 				/*
704 				 * we'll reset our stats to give a well behaved app
705 				 * that was unlucky enough to accumulate a bunch of pages
706 				 * over a long period of time a chance to get out of
707 				 * the throttled state... we reset the counter and timestamp
708 				 * so that if it stays under the rate limit for the next second
709 				 * it will be back in our good graces... if it exceeds it, it
710 				 * will remain in the throttled state
711 				 */
712 				thread->t_page_creation_time = tv_sec;
713 				thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
714 			}
715 			VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
716 
717 			thread->t_page_creation_throttled = 1;
718 
719 			if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
720 #if (DEVELOPMENT || DEBUG)
721 				thread->t_page_creation_throttled_hard++;
722 				OSAddAtomic(1, &vm_page_creation_throttled_hard);
723 #endif /* DEVELOPMENT || DEBUG */
724 				return HARD_THROTTLE_DELAY;
725 			} else {
726 #if (DEVELOPMENT || DEBUG)
727 				thread->t_page_creation_throttled_soft++;
728 				OSAddAtomic(1, &vm_page_creation_throttled_soft);
729 #endif /* DEVELOPMENT || DEBUG */
730 				return SOFT_THROTTLE_DELAY;
731 			}
732 		}
733 		thread->t_page_creation_time = tv_sec;
734 		thread->t_page_creation_count = 0;
735 	}
736 no_throttle:
737 	thread->t_page_creation_count++;
738 
739 	return 0;
740 }
741 
742 extern boolean_t vm_pageout_running;
743 static __attribute__((noinline, not_tail_called)) void
744 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(
745 	int throttle_delay)
746 {
747 	/* make sure vm_pageout_scan() gets to work while we're throttled */
748 	if (!vm_pageout_running) {
749 		thread_wakeup((event_t)&vm_page_free_wanted);
750 	}
751 	delay(throttle_delay);
752 }
753 
754 
755 /*
756  * check for various conditions that would
757  * prevent us from creating a ZF page...
758  * cleanup is based on being called from vm_fault_page
759  *
760  * object must be locked
761  * object == m->vmp_object
762  */
763 static vm_fault_return_t
764 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
765 {
766 	int throttle_delay;
767 
768 	if (object->shadow_severed ||
769 	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
770 		/*
771 		 * Either:
772 		 * 1. the shadow chain was severed,
773 		 * 2. the purgeable object is volatile or empty and is marked
774 		 *    to fault on access while volatile.
775 		 * Just have to return an error at this point
776 		 */
777 		if (m != VM_PAGE_NULL) {
778 			VM_PAGE_FREE(m);
779 		}
780 		vm_fault_cleanup(object, first_m);
781 
782 		thread_interrupt_level(interruptible_state);
783 
784 		if (VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
785 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
786 		}
787 
788 		if (object->shadow_severed) {
789 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
790 		}
791 		return VM_FAULT_MEMORY_ERROR;
792 	}
793 	if (page_throttle == TRUE) {
794 		if ((throttle_delay = vm_page_throttled(FALSE))) {
795 			/*
796 			 * we're throttling zero-fills...
797 			 * treat this as if we couldn't grab a page
798 			 */
799 			if (m != VM_PAGE_NULL) {
800 				VM_PAGE_FREE(m);
801 			}
802 			vm_fault_cleanup(object, first_m);
803 
804 			VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
805 
806 			__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
807 
808 			if (current_thread_aborted()) {
809 				thread_interrupt_level(interruptible_state);
810 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
811 				return VM_FAULT_INTERRUPTED;
812 			}
813 			thread_interrupt_level(interruptible_state);
814 
815 			return VM_FAULT_MEMORY_SHORTAGE;
816 		}
817 	}
818 	return VM_FAULT_SUCCESS;
819 }
820 
821 /*
822  * Clear the code signing bits on the given page_t
823  */
824 static void
825 vm_fault_cs_clear(vm_page_t m)
826 {
827 	m->vmp_cs_validated = VMP_CS_ALL_FALSE;
828 	m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
829 	m->vmp_cs_nx = VMP_CS_ALL_FALSE;
830 }
831 
832 /*
833  * Enqueues the given page on the throttled queue.
834  * The caller must hold the vm_page_queue_lock and it will be held on return.
835  */
836 static void
837 vm_fault_enqueue_throttled_locked(vm_page_t m)
838 {
839 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
840 	assert(!VM_PAGE_WIRED(m));
841 
842 	/*
843 	 * can't be on the pageout queue since we don't
844 	 * have a pager to try and clean to
845 	 */
846 	vm_page_queues_remove(m, TRUE);
847 	vm_page_check_pageable_safe(m);
848 	vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
849 	m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
850 	vm_page_throttled_count++;
851 }
852 
853 /*
854  * do the work to zero fill a page and
855  * inject it into the correct paging queue
856  *
857  * m->vmp_object must be locked
858  * page queue lock must NOT be held
859  */
860 static int
861 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
862 {
863 	int my_fault = DBG_ZERO_FILL_FAULT;
864 	vm_object_t     object;
865 
866 	object = VM_PAGE_OBJECT(m);
867 
868 	/*
869 	 * This is a zero-fill page fault...
870 	 *
871 	 * Checking the page lock is a waste of
872 	 * time;  this page was absent, so
873 	 * it can't be page locked by a pager.
874 	 *
875 	 * we also consider it undefined
876 	 * with respect to instruction
877 	 * execution.  i.e. it is the responsibility
878 	 * of higher layers to call for an instruction
879 	 * sync after changing the contents and before
880 	 * sending a program into this area.  We
881 	 * choose this approach for performance
882 	 */
883 	vm_fault_cs_clear(m);
884 	m->vmp_pmapped = TRUE;
885 
886 	if (no_zero_fill == TRUE) {
887 		my_fault = DBG_NZF_PAGE_FAULT;
888 
889 		if (m->vmp_absent && m->vmp_busy) {
890 			return my_fault;
891 		}
892 	} else {
893 		vm_page_zero_fill(m);
894 
895 		counter_inc(&vm_statistics_zero_fill_count);
896 		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
897 	}
898 	assert(!m->vmp_laundry);
899 	assert(object != kernel_object);
900 	//assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
901 	if (!VM_DYNAMIC_PAGING_ENABLED() &&
902 	    (object->purgable == VM_PURGABLE_DENY ||
903 	    object->purgable == VM_PURGABLE_NONVOLATILE ||
904 	    object->purgable == VM_PURGABLE_VOLATILE)) {
905 		vm_page_lockspin_queues();
906 		if (!VM_DYNAMIC_PAGING_ENABLED()) {
907 			vm_fault_enqueue_throttled_locked(m);
908 		}
909 		vm_page_unlock_queues();
910 	}
911 	return my_fault;
912 }
913 
914 
915 /*
916  *	Routine:	vm_fault_page
917  *	Purpose:
918  *		Find the resident page for the virtual memory
919  *		specified by the given virtual memory object
920  *		and offset.
921  *	Additional arguments:
922  *		The required permissions for the page is given
923  *		in "fault_type".  Desired permissions are included
924  *		in "protection".
925  *		fault_info is passed along to determine pagein cluster
926  *		limits... it contains the expected reference pattern,
927  *		cluster size if available, etc...
928  *
929  *		If the desired page is known to be resident (for
930  *		example, because it was previously wired down), asserting
931  *		the "unwiring" parameter will speed the search.
932  *
933  *		If the operation can be interrupted (by thread_abort
934  *		or thread_terminate), then the "interruptible"
935  *		parameter should be asserted.
936  *
937  *	Results:
938  *		The page containing the proper data is returned
939  *		in "result_page".
940  *
941  *	In/out conditions:
942  *		The source object must be locked and referenced,
943  *		and must donate one paging reference.  The reference
944  *		is not affected.  The paging reference and lock are
945  *		consumed.
946  *
947  *		If the call succeeds, the object in which "result_page"
948  *		resides is left locked and holding a paging reference.
949  *		If this is not the original object, a busy page in the
950  *		original object is returned in "top_page", to prevent other
951  *		callers from pursuing this same data, along with a paging
952  *		reference for the original object.  The "top_page" should
953  *		be destroyed when this guarantee is no longer required.
954  *		The "result_page" is also left busy.  It is not removed
955  *		from the pageout queues.
956  *	Special Case:
957  *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
958  *		fault succeeded but there's no VM page (i.e. the VM object
959  *              does not actually hold VM pages, but device memory or
960  *		large pages).  The object is still locked and we still hold a
961  *		paging_in_progress reference.
962  */
963 unsigned int vm_fault_page_blocked_access = 0;
964 unsigned int vm_fault_page_forced_retry = 0;
965 
966 vm_fault_return_t
967 vm_fault_page(
968 	/* Arguments: */
969 	vm_object_t     first_object,   /* Object to begin search */
970 	vm_object_offset_t first_offset,        /* Offset into object */
971 	vm_prot_t       fault_type,     /* What access is requested */
972 	boolean_t       must_be_resident,/* Must page be resident? */
973 	boolean_t       caller_lookup,  /* caller looked up page */
974 	/* Modifies in place: */
975 	vm_prot_t       *protection,    /* Protection for mapping */
976 	vm_page_t       *result_page,   /* Page found, if successful */
977 	/* Returns: */
978 	vm_page_t       *top_page,      /* Page in top object, if
979                                          * not result_page.  */
980 	int             *type_of_fault, /* if non-null, fill in with type of fault
981                                          * COW, zero-fill, etc... returned in trace point */
982 	/* More arguments: */
983 	kern_return_t   *error_code,    /* code if page is in error */
984 	boolean_t       no_zero_fill,   /* don't zero fill absent pages */
985 	vm_object_fault_info_t fault_info)
986 {
987 	vm_page_t               m;
988 	vm_object_t             object;
989 	vm_object_offset_t      offset;
990 	vm_page_t               first_m;
991 	vm_object_t             next_object;
992 	vm_object_t             copy_object;
993 	boolean_t               look_for_page;
994 	boolean_t               force_fault_retry = FALSE;
995 	vm_prot_t               access_required = fault_type;
996 	vm_prot_t               wants_copy_flag;
997 	kern_return_t           wait_result;
998 	wait_interrupt_t        interruptible_state;
999 	boolean_t               data_already_requested = FALSE;
1000 	vm_behavior_t           orig_behavior;
1001 	vm_size_t               orig_cluster_size;
1002 	vm_fault_return_t       error;
1003 	int                     my_fault;
1004 	uint32_t                try_failed_count;
1005 	int                     interruptible; /* how may fault be interrupted? */
1006 	int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
1007 	memory_object_t         pager;
1008 	vm_fault_return_t       retval;
1009 	int                     grab_options;
1010 	bool                    clear_absent_on_error = false;
1011 
1012 /*
1013  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
1014  * marked as paged out in the compressor pager or the pager doesn't exist.
1015  * Note also that if the pager for an internal object
1016  * has not been created, the pager is not invoked regardless of the value
1017  * of MUST_ASK_PAGER().
1018  *
1019  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
1020  * is marked as paged out in the compressor pager.
1021  * PAGED_OUT() is used to determine if a page has already been pushed
1022  * into a copy object in order to avoid a redundant page out operation.
1023  */
1024 #define MUST_ASK_PAGER(o, f, s)                                 \
1025 	((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
1026 
1027 #define PAGED_OUT(o, f) \
1028 	(VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
1029 
1030 /*
1031  *	Recovery actions
1032  */
1033 #define RELEASE_PAGE(m)                                 \
1034 	MACRO_BEGIN                                     \
1035 	PAGE_WAKEUP_DONE(m);                            \
1036 	if ( !VM_PAGE_PAGEABLE(m)) {                    \
1037 	        vm_page_lockspin_queues();              \
1038 	        if (clear_absent_on_error && m->vmp_absent) {\
1039 	                vm_page_zero_fill(m);           \
1040 	                counter_inc(&vm_statistics_zero_fill_count);\
1041 	                DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);\
1042 	                m->vmp_absent = false;          \
1043 	        }                                       \
1044 	        if ( !VM_PAGE_PAGEABLE(m)) {            \
1045 	                if (VM_CONFIG_COMPRESSOR_IS_ACTIVE)     \
1046 	                        vm_page_deactivate(m);          \
1047 	                else                                    \
1048 	                        vm_page_activate(m);            \
1049 	        }                                               \
1050 	        vm_page_unlock_queues();                        \
1051 	}                                                       \
1052 	clear_absent_on_error = false;                  \
1053 	MACRO_END
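/*
 * RELEASE_PAGE() is the common recovery step: wake up anyone waiting on the
 * busy page, zero-fill it and clear vmp_absent if it had been left marked
 * absent (clear_absent_on_error), and put it on an appropriate paging queue
 * if it isn't already pageable.
 */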
1054 
1055 #if TRACEFAULTPAGE
1056 	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
1057 #endif
1058 
1059 	interruptible = fault_info->interruptible;
1060 	interruptible_state = thread_interrupt_level(interruptible);
1061 
1062 	/*
1063 	 *	INVARIANTS (through entire routine):
1064 	 *
1065 	 *	1)	At all times, we must either have the object
1066 	 *		lock or a busy page in some object to prevent
1067 	 *		some other thread from trying to bring in
1068 	 *		the same page.
1069 	 *
1070 	 *		Note that we cannot hold any locks during the
1071 	 *		pager access or when waiting for memory, so
1072 	 *		we use a busy page then.
1073 	 *
1074 	 *	2)	To prevent another thread from racing us down the
1075 	 *		shadow chain and entering a new page in the top
1076 	 *		object before we do, we must keep a busy page in
1077 	 *		the top object while following the shadow chain.
1078 	 *
1079 	 *	3)	We must increment paging_in_progress on any object
1080 	 *		for which we have a busy page before dropping
1081 	 *		the object lock
1082 	 *
1083 	 *	4)	We leave busy pages on the pageout queues.
1084 	 *		If the pageout daemon comes across a busy page,
1085 	 *		it will remove the page from the pageout queues.
1086 	 */
1087 
1088 	object = first_object;
1089 	offset = first_offset;
1090 	first_m = VM_PAGE_NULL;
1091 	access_required = fault_type;
1092 
1093 	/*
1094 	 * default type of fault
1095 	 */
1096 	my_fault = DBG_CACHE_HIT_FAULT;
1097 	thread_pri_floor_t token;
1098 	bool    drop_floor = false;
1099 
1100 	while (TRUE) {
1101 #if TRACEFAULTPAGE
1102 		dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1103 #endif
1104 
1105 		grab_options = 0;
1106 #if CONFIG_SECLUDED_MEMORY
1107 		if (object->can_grab_secluded) {
1108 			grab_options |= VM_PAGE_GRAB_SECLUDED;
1109 		}
1110 #endif /* CONFIG_SECLUDED_MEMORY */
1111 
1112 		if (!object->alive) {
1113 			/*
1114 			 * object is no longer valid
1115 			 * clean up and return error
1116 			 */
1117 #if DEVELOPMENT || DEBUG
1118 			printf("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1119 			if (panic_object_not_alive) {
1120 				panic("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1121 			}
1122 #endif /* DEVELOPMENT || DEBUG */
1123 			vm_fault_cleanup(object, first_m);
1124 			thread_interrupt_level(interruptible_state);
1125 
1126 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NOT_ALIVE), 0 /* arg */);
1127 			return VM_FAULT_MEMORY_ERROR;
1128 		}
1129 
1130 		if (!object->pager_created && object->phys_contiguous) {
1131 			/*
1132 			 * A physically-contiguous object without a pager:
1133 			 * must be a "large page" object.  We do not deal
1134 			 * with VM pages for this object.
1135 			 */
1136 			caller_lookup = FALSE;
1137 			m = VM_PAGE_NULL;
1138 			goto phys_contig_object;
1139 		}
1140 
1141 		if (object->blocked_access) {
1142 			/*
1143 			 * Access to this VM object has been blocked.
1144 			 * Replace our "paging_in_progress" reference with
1145 			 * a "activity_in_progress" reference and wait for
1146 			 * access to be unblocked.
1147 			 */
1148 			caller_lookup = FALSE; /* no longer valid after sleep */
1149 			vm_object_activity_begin(object);
1150 			vm_object_paging_end(object);
1151 			while (object->blocked_access) {
1152 				vm_object_sleep(object,
1153 				    VM_OBJECT_EVENT_UNBLOCKED,
1154 				    THREAD_UNINT);
1155 			}
1156 			vm_fault_page_blocked_access++;
1157 			vm_object_paging_begin(object);
1158 			vm_object_activity_end(object);
1159 		}
1160 
1161 		/*
1162 		 * See whether the page at 'offset' is resident
1163 		 */
1164 		if (caller_lookup == TRUE) {
1165 			/*
1166 			 * The caller has already looked up the page
1167 			 * and gave us the result in "result_page".
1168 			 * We can use this for the first lookup but
1169 			 * it loses its validity as soon as we unlock
1170 			 * the object.
1171 			 */
1172 			m = *result_page;
1173 			caller_lookup = FALSE; /* no longer valid after that */
1174 		} else {
1175 			m = vm_page_lookup(object, vm_object_trunc_page(offset));
1176 		}
1177 #if TRACEFAULTPAGE
1178 		dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1179 #endif
1180 		if (m != VM_PAGE_NULL) {
1181 			if (m->vmp_busy) {
1182 				/*
1183 				 * The page is being brought in,
1184 				 * wait for it and then retry.
1185 				 */
1186 #if TRACEFAULTPAGE
1187 				dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1188 #endif
1189 				wait_result = PAGE_SLEEP(object, m, interruptible);
1190 
1191 				if (wait_result != THREAD_AWAKENED) {
1192 					vm_fault_cleanup(object, first_m);
1193 					thread_interrupt_level(interruptible_state);
1194 
1195 					if (wait_result == THREAD_RESTART) {
1196 						return VM_FAULT_RETRY;
1197 					} else {
1198 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
1199 						return VM_FAULT_INTERRUPTED;
1200 					}
1201 				}
1202 				continue;
1203 			}
1204 			if (m->vmp_laundry) {
1205 				m->vmp_free_when_done = FALSE;
1206 
1207 				if (!m->vmp_cleaning) {
1208 					vm_pageout_steal_laundry(m, FALSE);
1209 				}
1210 			}
1211 			vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
1212 			if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1213 				/*
1214 				 * Guard page: off limits !
1215 				 */
1216 				if (fault_type == VM_PROT_NONE) {
1217 					/*
1218 					 * The fault is not requesting any
1219 					 * access to the guard page, so it must
1220 					 * be just to wire or unwire it.
1221 					 * Let's pretend it succeeded...
1222 					 */
1223 					m->vmp_busy = TRUE;
1224 					*result_page = m;
1225 					assert(first_m == VM_PAGE_NULL);
1226 					*top_page = first_m;
1227 					if (type_of_fault) {
1228 						*type_of_fault = DBG_GUARD_FAULT;
1229 					}
1230 					thread_interrupt_level(interruptible_state);
1231 					return VM_FAULT_SUCCESS;
1232 				} else {
1233 					/*
1234 					 * The fault requests access to the
1235 					 * guard page: let's deny that !
1236 					 */
1237 					vm_fault_cleanup(object, first_m);
1238 					thread_interrupt_level(interruptible_state);
1239 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_GUARDPAGE_FAULT), 0 /* arg */);
1240 					return VM_FAULT_MEMORY_ERROR;
1241 				}
1242 			}
1243 
1244 
1245 			if (VMP_ERROR_GET(m)) {
1246 				/*
1247 				 * The page is in error, give up now.
1248 				 */
1249 #if TRACEFAULTPAGE
1250 				dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1251 #endif
1252 				if (error_code) {
1253 					*error_code = KERN_MEMORY_ERROR;
1254 				}
1255 				VM_PAGE_FREE(m);
1256 
1257 				vm_fault_cleanup(object, first_m);
1258 				thread_interrupt_level(interruptible_state);
1259 
1260 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_ERROR), 0 /* arg */);
1261 				return VM_FAULT_MEMORY_ERROR;
1262 			}
1263 			if (m->vmp_restart) {
1264 				/*
1265 				 * The pager wants us to restart
1266 				 * at the top of the chain,
1267 				 * typically because it has moved the
1268 				 * page to another pager, then do so.
1269 				 */
1270 #if TRACEFAULTPAGE
1271 				dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1272 #endif
1273 				VM_PAGE_FREE(m);
1274 
1275 				vm_fault_cleanup(object, first_m);
1276 				thread_interrupt_level(interruptible_state);
1277 
1278 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_RESTART), 0 /* arg */);
1279 				return VM_FAULT_RETRY;
1280 			}
1281 			if (m->vmp_absent) {
1282 				/*
1283 				 * The page isn't busy, but is absent,
1284 				 * therefore it's deemed "unavailable".
1285 				 *
1286 				 * Remove the non-existent page (unless it's
1287 				 * in the top object) and move on down to the
1288 				 * next object (if there is one).
1289 				 */
1290 #if TRACEFAULTPAGE
1291 				dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1292 #endif
1293 				next_object = object->shadow;
1294 
1295 				if (next_object == VM_OBJECT_NULL) {
1296 					/*
1297 					 * Absent page at bottom of shadow
1298 					 * chain; zero fill the page we left
1299 					 * busy in the first object, and free
1300 					 * the absent page.
1301 					 */
1302 					assert(!must_be_resident);
1303 
1304 					/*
1305 					 * check for any conditions that prevent
1306 					 * us from creating a new zero-fill page
1307 					 * vm_fault_check will do all of the
1308 					 * fault cleanup in the case of an error condition
1309 					 * including resetting the thread_interrupt_level
1310 					 */
1311 					error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1312 
1313 					if (error != VM_FAULT_SUCCESS) {
1314 						return error;
1315 					}
1316 
1317 					if (object != first_object) {
1318 						/*
1319 						 * free the absent page we just found
1320 						 */
1321 						VM_PAGE_FREE(m);
1322 
1323 						/*
1324 						 * drop reference and lock on current object
1325 						 */
1326 						vm_object_paging_end(object);
1327 						vm_object_unlock(object);
1328 
1329 						/*
1330 						 * grab the original page we
1331 						 * 'soldered' in place and
1332 						 * retake lock on 'first_object'
1333 						 */
1334 						m = first_m;
1335 						first_m = VM_PAGE_NULL;
1336 
1337 						object = first_object;
1338 						offset = first_offset;
1339 
1340 						vm_object_lock(object);
1341 					} else {
1342 						/*
1343 						 * we're going to use the absent page we just found
1344 						 * so convert it to a 'busy' page
1345 						 */
1346 						m->vmp_absent = FALSE;
1347 						m->vmp_busy = TRUE;
1348 					}
1349 					if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1350 						m->vmp_absent = TRUE;
1351 						clear_absent_on_error = true;
1352 					}
1353 					/*
1354 					 * zero-fill the page and put it on
1355 					 * the correct paging queue
1356 					 */
1357 					my_fault = vm_fault_zero_page(m, no_zero_fill);
1358 
1359 					break;
1360 				} else {
1361 					if (must_be_resident) {
1362 						vm_object_paging_end(object);
1363 					} else if (object != first_object) {
1364 						vm_object_paging_end(object);
1365 						VM_PAGE_FREE(m);
1366 					} else {
1367 						first_m = m;
1368 						m->vmp_absent = FALSE;
1369 						m->vmp_busy = TRUE;
1370 
1371 						vm_page_lockspin_queues();
1372 						vm_page_queues_remove(m, FALSE);
1373 						vm_page_unlock_queues();
1374 					}
1375 
1376 					offset += object->vo_shadow_offset;
1377 					fault_info->lo_offset += object->vo_shadow_offset;
1378 					fault_info->hi_offset += object->vo_shadow_offset;
1379 					access_required = VM_PROT_READ;
1380 
1381 					vm_object_lock(next_object);
1382 					vm_object_unlock(object);
1383 					object = next_object;
1384 					vm_object_paging_begin(object);
1385 
1386 					/*
1387 					 * reset to default type of fault
1388 					 */
1389 					my_fault = DBG_CACHE_HIT_FAULT;
1390 
1391 					continue;
1392 				}
1393 			}
1394 			if ((m->vmp_cleaning)
1395 			    && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1396 			    && (fault_type & VM_PROT_WRITE)) {
1397 				/*
1398 				 * This is a copy-on-write fault that will
1399 				 * cause us to revoke access to this page, but
1400 				 * this page is in the process of being cleaned
1401 				 * in a clustered pageout. We must wait until
1402 				 * the cleaning operation completes before
1403 				 * revoking access to the original page,
1404 				 * otherwise we might attempt to remove a
1405 				 * wired mapping.
1406 				 */
1407 #if TRACEFAULTPAGE
1408 				dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1409 #endif
1410 				/*
1411 				 * take an extra ref so that object won't die
1412 				 */
1413 				vm_object_reference_locked(object);
1414 
1415 				vm_fault_cleanup(object, first_m);
1416 
1417 				vm_object_lock(object);
1418 				assert(object->ref_count > 0);
1419 
1420 				m = vm_page_lookup(object, vm_object_trunc_page(offset));
1421 
1422 				if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1423 					PAGE_ASSERT_WAIT(m, interruptible);
1424 
1425 					vm_object_unlock(object);
1426 					wait_result = thread_block(THREAD_CONTINUE_NULL);
1427 					vm_object_deallocate(object);
1428 
1429 					goto backoff;
1430 				} else {
1431 					vm_object_unlock(object);
1432 
1433 					vm_object_deallocate(object);
1434 					thread_interrupt_level(interruptible_state);
1435 
1436 					return VM_FAULT_RETRY;
1437 				}
1438 			}
1439 			if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1440 			    !(fault_info != NULL && fault_info->stealth)) {
1441 				/*
1442 				 * If we were passed a non-NULL pointer for
1443 				 * "type_of_fault", then we came from
1444 				 * vm_fault... we'll let it deal with
1445 				 * this condition, since it
1446 				 * needs to see m->vmp_speculative to correctly
1447 				 * account the pageins, otherwise...
1448 				 * take it off the speculative queue, we'll
1449 				 * let the caller of vm_fault_page deal
1450 				 * with getting it onto the correct queue
1451 				 *
1452 				 * If the caller specified in fault_info that
1453 				 * it wants a "stealth" fault, we also leave
1454 				 * the page in the speculative queue.
1455 				 */
1456 				vm_page_lockspin_queues();
1457 				if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1458 					vm_page_queues_remove(m, FALSE);
1459 				}
1460 				vm_page_unlock_queues();
1461 			}
1462 			assert(object == VM_PAGE_OBJECT(m));
1463 
1464 			if (object->code_signed) {
1465 				/*
1466 				 * CODE SIGNING:
1467 				 * We just paged in a page from a signed
1468 				 * memory object but we don't need to
1469 				 * validate it now.  We'll validate it if and
1470 				 * when it gets mapped into a user address
1471 				 * space for the first time or when the page
1472 				 * gets copied to another object as a result
1473 				 * of a copy-on-write.
1474 				 */
1475 			}
1476 
1477 			/*
1478 			 * We mark the page busy and leave it on
1479 			 * the pageout queues.  If the pageout
1480 			 * daemon comes across it, then it will
1481 			 * remove the page from the queue, but not the object
1482 			 */
1483 #if TRACEFAULTPAGE
1484 			dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1485 #endif
1486 			assert(!m->vmp_busy);
1487 			assert(!m->vmp_absent);
1488 
1489 			m->vmp_busy = TRUE;
1490 			break;
1491 		}
1492 
1493 		/*
1494 		 * we get here when there is no page present in the object at
1495 		 * the offset we're interested in... we'll allocate a page
1496 		 * at this point if the pager associated with
1497 		 * this object can provide the data or we're the top object...
1498 		 * object is locked;  m == NULL
1499 		 */
1500 
1501 		if (must_be_resident) {
1502 			if (fault_type == VM_PROT_NONE &&
1503 			    object == kernel_object) {
1504 				/*
1505 				 * We've been called from vm_fault_unwire()
1506 				 * while removing a map entry that was allocated
1507 				 * with KMA_KOBJECT and KMA_VAONLY.  This page
1508 				 * is not present and there's nothing more to
1509 				 * do here (nothing to unwire).
1510 				 */
1511 				vm_fault_cleanup(object, first_m);
1512 				thread_interrupt_level(interruptible_state);
1513 
1514 				return VM_FAULT_MEMORY_ERROR;
1515 			}
1516 
1517 			goto dont_look_for_page;
1518 		}
1519 
1520 		/* Don't expect to fault pages into the kernel object. */
1521 		assert(object != kernel_object);
1522 
1523 		look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE));
1524 
1525 #if TRACEFAULTPAGE
1526 		dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1527 #endif
1528 		if (!look_for_page && object == first_object && !object->phys_contiguous) {
1529 			/*
1530 			 * Allocate a new page for this object/offset pair as a placeholder
1531 			 */
1532 			m = vm_page_grab_options(grab_options);
1533 #if TRACEFAULTPAGE
1534 			dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1535 #endif
1536 			if (m == VM_PAGE_NULL) {
1537 				vm_fault_cleanup(object, first_m);
1538 				thread_interrupt_level(interruptible_state);
1539 
1540 				return VM_FAULT_MEMORY_SHORTAGE;
1541 			}
1542 
1543 			if (fault_info && fault_info->batch_pmap_op == TRUE) {
1544 				vm_page_insert_internal(m, object,
1545 				    vm_object_trunc_page(offset),
1546 				    VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1547 			} else {
1548 				vm_page_insert(m, object, vm_object_trunc_page(offset));
1549 			}
1550 		}
1551 		if (look_for_page) {
1552 			kern_return_t   rc;
1553 			int             my_fault_type;
1554 
1555 			/*
1556 			 *	If the memory manager is not ready, we
1557 			 *	cannot make requests.
1558 			 */
1559 			if (!object->pager_ready) {
1560 #if TRACEFAULTPAGE
1561 				dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1562 #endif
1563 				if (m != VM_PAGE_NULL) {
1564 					VM_PAGE_FREE(m);
1565 				}
1566 
1567 				/*
1568 				 * take an extra ref so object won't die
1569 				 */
1570 				vm_object_reference_locked(object);
1571 				vm_fault_cleanup(object, first_m);
1572 
1573 				vm_object_lock(object);
1574 				assert(object->ref_count > 0);
1575 
1576 				if (!object->pager_ready) {
1577 					wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1578 
1579 					vm_object_unlock(object);
1580 					if (wait_result == THREAD_WAITING) {
1581 						wait_result = thread_block(THREAD_CONTINUE_NULL);
1582 					}
1583 					vm_object_deallocate(object);
1584 
1585 					goto backoff;
1586 				} else {
1587 					vm_object_unlock(object);
1588 					vm_object_deallocate(object);
1589 					thread_interrupt_level(interruptible_state);
1590 
1591 					return VM_FAULT_RETRY;
1592 				}
1593 			}
1594 			if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1595 				/*
1596 				 * If there are too many outstanding page
1597 				 * requests pending on this external object, we
1598 				 * wait for them to be resolved now.
1599 				 */
1600 #if TRACEFAULTPAGE
1601 				dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1602 #endif
1603 				if (m != VM_PAGE_NULL) {
1604 					VM_PAGE_FREE(m);
1605 				}
1606 				/*
1607 				 * take an extra ref so object won't die
1608 				 */
1609 				vm_object_reference_locked(object);
1610 
1611 				vm_fault_cleanup(object, first_m);
1612 
1613 				vm_object_lock(object);
1614 				assert(object->ref_count > 0);
1615 
1616 				if (object->paging_in_progress >= vm_object_pagein_throttle) {
1617 					vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1618 
1619 					vm_object_unlock(object);
1620 					wait_result = thread_block(THREAD_CONTINUE_NULL);
1621 					vm_object_deallocate(object);
1622 
1623 					goto backoff;
1624 				} else {
1625 					vm_object_unlock(object);
1626 					vm_object_deallocate(object);
1627 					thread_interrupt_level(interruptible_state);
1628 
1629 					return VM_FAULT_RETRY;
1630 				}
1631 			}
1632 			if (object->internal) {
1633 				int compressed_count_delta;
1634 
1635 				assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1636 
1637 				if (m == VM_PAGE_NULL) {
1638 					/*
1639 					 * Allocate a new page for this object/offset pair as a placeholder
1640 					 */
1641 					m = vm_page_grab_options(grab_options);
1642 #if TRACEFAULTPAGE
1643 					dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1644 #endif
1645 					if (m == VM_PAGE_NULL) {
1646 						vm_fault_cleanup(object, first_m);
1647 						thread_interrupt_level(interruptible_state);
1648 
1649 						return VM_FAULT_MEMORY_SHORTAGE;
1650 					}
1651 
1652 					m->vmp_absent = TRUE;
1653 					if (fault_info && fault_info->batch_pmap_op == TRUE) {
1654 						vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1655 					} else {
1656 						vm_page_insert(m, object, vm_object_trunc_page(offset));
1657 					}
1658 				}
1659 				assert(m->vmp_busy);
1660 
1661 				m->vmp_absent = TRUE;
1662 				pager = object->pager;
1663 
1664 				assert(object->paging_in_progress > 0);
1665 				vm_object_unlock(object);
1666 
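				/*
				 * Decompress the data for this offset directly into
				 * the physical page set up above.  The object lock is
				 * dropped across the call and re-taken below.
				 */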
1667 				rc = vm_compressor_pager_get(
1668 					pager,
1669 					offset + object->paging_offset,
1670 					VM_PAGE_GET_PHYS_PAGE(m),
1671 					&my_fault_type,
1672 					0,
1673 					&compressed_count_delta);
1674 
1675 				if (type_of_fault == NULL) {
1676 					int     throttle_delay;
1677 
1678 					/*
1679 					 * we weren't called from vm_fault, so we
1680 					 * need to apply page creation throttling...
1681 					 * do it before we re-acquire any locks
1682 					 */
1683 					if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1684 						if ((throttle_delay = vm_page_throttled(TRUE))) {
1685 							VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1686 							__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
1687 						}
1688 					}
1689 				}
1690 				vm_object_lock(object);
1691 				assert(object->paging_in_progress > 0);
1692 
1693 				vm_compressor_pager_count(
1694 					pager,
1695 					compressed_count_delta,
1696 					FALSE, /* shared_lock */
1697 					object);
1698 
1699 				switch (rc) {
1700 				case KERN_SUCCESS:
1701 					m->vmp_absent = FALSE;
1702 					m->vmp_dirty = TRUE;
1703 					if ((object->wimg_bits &
1704 					    VM_WIMG_MASK) !=
1705 					    VM_WIMG_USE_DEFAULT) {
1706 						/*
1707 						 * If the page is not cacheable,
1708 						 * we can't let its contents
1709 						 * linger in the data cache
1710 						 * after the decompression.
1711 						 */
1712 						pmap_sync_page_attributes_phys(
1713 							VM_PAGE_GET_PHYS_PAGE(m));
1714 					} else {
1715 						m->vmp_written_by_kernel = TRUE;
1716 					}
1717 
1718 					/*
1719 					 * If the object is purgeable, its
1720 					 * owner's purgeable ledgers have been
1721 					 * updated in vm_page_insert() but the
1722 					 * page was also accounted for in a
1723 					 * "compressed purgeable" ledger, so
1724 					 * update that now.
1725 					 */
1726 					if (((object->purgable !=
1727 					    VM_PURGABLE_DENY) ||
1728 					    object->vo_ledger_tag) &&
1729 					    (object->vo_owner !=
1730 					    NULL)) {
1731 						/*
1732 						 * One less compressed
1733 						 * purgeable/tagged page.
1734 						 */
1735 						vm_object_owner_compressed_update(
1736 							object,
1737 							-1);
1738 					}
1739 
1740 					break;
1741 				case KERN_MEMORY_FAILURE:
1742 					m->vmp_unusual = TRUE;
1743 					m->vmp_error = TRUE;
1744 					m->vmp_absent = FALSE;
1745 					break;
1746 				case KERN_MEMORY_ERROR:
1747 					assert(m->vmp_absent);
1748 					break;
1749 				default:
1750 					panic("vm_fault_page(): unexpected "
1751 					    "error %d from "
1752 					    "vm_compressor_pager_get()\n",
1753 					    rc);
1754 				}
1755 				PAGE_WAKEUP_DONE(m);
1756 
1757 				rc = KERN_SUCCESS;
1758 				goto data_requested;
1759 			}
1760 			my_fault_type = DBG_PAGEIN_FAULT;
1761 
1762 			if (m != VM_PAGE_NULL) {
1763 				VM_PAGE_FREE(m);
1764 				m = VM_PAGE_NULL;
1765 			}
1766 
1767 #if TRACEFAULTPAGE
1768 			dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1769 #endif
1770 
1771 			/*
1772 			 * It's possible someone called vm_object_destroy while we weren't
1773 			 * holding the object lock.  If that has happened, then bail out
1774 			 * here.
1775 			 */
1776 
1777 			pager = object->pager;
1778 
1779 			if (pager == MEMORY_OBJECT_NULL) {
1780 				vm_fault_cleanup(object, first_m);
1781 				thread_interrupt_level(interruptible_state);
1782 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NO_PAGER), 0 /* arg */);
1783 				return VM_FAULT_MEMORY_ERROR;
1784 			}
1785 
1786 			/*
1787 			 * We have an absent page in place for the faulting offset,
1788 			 * so we can release the object lock.
1789 			 */
1790 
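			/*
			 * For shared cache objects, take a scheduler priority
			 * floor for the duration of the data request; it is
			 * dropped again after the object lock is re-taken below.
			 */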
1791 			if (object->object_is_shared_cache) {
1792 				token = thread_priority_floor_start();
1793 				/*
1794 				 * A non-native shared cache object might
1795 				 * be getting set up in parallel with this
1796 				 * fault and so we can't assume that this
1797 				 * check will be valid after we drop the
1798 				 * object lock below.
1799 				 */
1800 				drop_floor = true;
1801 			}
1802 
1803 			vm_object_unlock(object);
1804 
1805 			/*
1806 			 * If this object uses a copy_call strategy,
1807 			 * and we are interested in a copy of this object
1808 			 * (having gotten here only by following a
1809 			 * shadow chain), then tell the memory manager
1810 			 * via a flag added to the desired_access
1811 			 * parameter, so that it can detect a race
1812 			 * between our walking down the shadow chain
1813 			 * and its pushing pages up into a copy of
1814 			 * the object that it manages.
1815 			 */
1816 			if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1817 				wants_copy_flag = VM_PROT_WANTS_COPY;
1818 			} else {
1819 				wants_copy_flag = VM_PROT_NONE;
1820 			}
1821 
1822 			if (object->copy == first_object) {
1823 				/*
1824 				 * if we issue the memory_object_data_request in
1825 				 * this state, we are subject to a deadlock with
1826 				 * the underlying filesystem if it is trying to
1827 				 * shrink the file resulting in a push of pages
1828 				 * into the copy object...  that push will stall
1829 				 * on the placeholder page, and if the pushing thread
1830 				 * is holding a lock that is required on the pagein
1831 				 * path (such as a truncate lock), we'll deadlock...
1832 				 * to avoid this potential deadlock, we throw away
1833 				 * our placeholder page before calling memory_object_data_request
1834 				 * and force this thread to retry the vm_fault_page after
1835 				 * we have issued the I/O.  the second time through this path
1836 				 * we will find the page already in the cache (presumably still
1837 				 * busy waiting for the I/O to complete) and then complete
1838 				 * the fault w/o having to go through memory_object_data_request again
1839 				 */
1840 				assert(first_m != VM_PAGE_NULL);
1841 				assert(VM_PAGE_OBJECT(first_m) == first_object);
1842 
1843 				vm_object_lock(first_object);
1844 				VM_PAGE_FREE(first_m);
1845 				vm_object_paging_end(first_object);
1846 				vm_object_unlock(first_object);
1847 
1848 				first_m = VM_PAGE_NULL;
1849 				force_fault_retry = TRUE;
1850 
1851 				vm_fault_page_forced_retry++;
1852 			}
1853 
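			/*
			 * On a repeat trip through this path for the same fault,
			 * downgrade to a single-page, random-access request; any
			 * clustered read-ahead was presumably issued by the first
			 * request.
			 */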
1854 			if (data_already_requested == TRUE) {
1855 				orig_behavior = fault_info->behavior;
1856 				orig_cluster_size = fault_info->cluster_size;
1857 
1858 				fault_info->behavior = VM_BEHAVIOR_RANDOM;
1859 				fault_info->cluster_size = PAGE_SIZE;
1860 			}
1861 			/*
1862 			 * Call the memory manager to retrieve the data.
1863 			 */
1864 			rc = memory_object_data_request(
1865 				pager,
1866 				vm_object_trunc_page(offset) + object->paging_offset,
1867 				PAGE_SIZE,
1868 				access_required | wants_copy_flag,
1869 				(memory_object_fault_info_t)fault_info);
1870 
1871 			if (data_already_requested == TRUE) {
1872 				fault_info->behavior = orig_behavior;
1873 				fault_info->cluster_size = orig_cluster_size;
1874 			} else {
1875 				data_already_requested = TRUE;
1876 			}
1877 
1878 			DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1879 #if TRACEFAULTPAGE
1880 			dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1881 #endif
1882 			vm_object_lock(object);
1883 
1884 			if (drop_floor && object->object_is_shared_cache) {
1885 				thread_priority_floor_end(&token);
1886 				drop_floor = false;
1887 			}
1888 
1889 data_requested:
1890 			if (rc != KERN_SUCCESS) {
1891 				vm_fault_cleanup(object, first_m);
1892 				thread_interrupt_level(interruptible_state);
1893 
1894 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NO_DATA), 0 /* arg */);
1895 
1896 				return (rc == MACH_SEND_INTERRUPTED) ?
1897 				       VM_FAULT_INTERRUPTED :
1898 				       VM_FAULT_MEMORY_ERROR;
1899 			} else {
1900 				clock_sec_t     tv_sec;
1901 				clock_usec_t    tv_usec;
1902 
1903 				if (my_fault_type == DBG_PAGEIN_FAULT) {
1904 					clock_get_system_microtime(&tv_sec, &tv_usec);
1905 					current_thread()->t_page_creation_time = tv_sec;
1906 					current_thread()->t_page_creation_count = 0;
1907 				}
1908 			}
1909 			if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1910 				vm_fault_cleanup(object, first_m);
1911 				thread_interrupt_level(interruptible_state);
1912 
1913 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
1914 				return VM_FAULT_INTERRUPTED;
1915 			}
1916 			if (force_fault_retry == TRUE) {
1917 				vm_fault_cleanup(object, first_m);
1918 				thread_interrupt_level(interruptible_state);
1919 
1920 				return VM_FAULT_RETRY;
1921 			}
1922 			if (m == VM_PAGE_NULL && object->phys_contiguous) {
1923 				/*
1924 				 * No page here means that the object we
1925 				 * initially looked up was "physically
1926 				 * contiguous" (i.e. device memory).  However,
1927 				 * with Virtual VRAM, the object might not
1928 				 * be backed by that device memory anymore,
1929 				 * so we're done here only if the object is
1930 				 * still "phys_contiguous".
1931 				 * Otherwise, if the object is no longer
1932 				 * "phys_contiguous", we need to retry the
1933 				 * page fault against the object's new backing
1934 				 * store (different memory object).
1935 				 */
1936 phys_contig_object:
1937 				goto done;
1938 			}
1939 			/*
1940 			 * potentially a pagein fault
1941 			 * if we make it through the state checks
1942 			 * above, then we'll count it as such
1943 			 */
1944 			my_fault = my_fault_type;
1945 
1946 			/*
1947 			 * Retry with same object/offset, since new data may
1948 			 * be in a different page (i.e., m is meaningless at
1949 			 * this point).
1950 			 */
1951 			continue;
1952 		}
1953 dont_look_for_page:
1954 		/*
1955 		 * We get here if the object has no pager, or an existence map
1956 		 * exists and indicates the page isn't present on the pager
1957 		 * or we're unwiring a page.  If a pager exists, but there
1958 		 * is no existence map, then the m->vmp_absent case above handles
1959 		 * the ZF case when the pager can't provide the page
1960 		 */
1961 #if TRACEFAULTPAGE
1962 		dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1963 #endif
1964 		if (object == first_object) {
1965 			first_m = m;
1966 		} else {
1967 			assert(m == VM_PAGE_NULL);
1968 		}
1969 
1970 		next_object = object->shadow;
1971 
1972 		if (next_object == VM_OBJECT_NULL) {
1973 			/*
1974 			 * we've hit the bottom of the shadow chain,
1975 			 * fill the page in the top object with zeros.
1976 			 */
1977 			assert(!must_be_resident);
1978 
1979 			if (object != first_object) {
1980 				vm_object_paging_end(object);
1981 				vm_object_unlock(object);
1982 
1983 				object = first_object;
1984 				offset = first_offset;
1985 				vm_object_lock(object);
1986 			}
1987 			m = first_m;
1988 			assert(VM_PAGE_OBJECT(m) == object);
1989 			first_m = VM_PAGE_NULL;
1990 
1991 			/*
1992 			 * check for any conditions that prevent
1993 			 * us from creating a new zero-fill page...
1994 			 * vm_fault_check will do all of the
1995 			 * fault cleanup in the case of an error condition,
1996 			 * including resetting the thread_interrupt_level
1997 			 */
1998 			error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1999 
2000 			if (error != VM_FAULT_SUCCESS) {
2001 				return error;
2002 			}
2003 
2004 			if (m == VM_PAGE_NULL) {
2005 				m = vm_page_grab_options(grab_options);
2006 
2007 				if (m == VM_PAGE_NULL) {
2008 					vm_fault_cleanup(object, VM_PAGE_NULL);
2009 					thread_interrupt_level(interruptible_state);
2010 
2011 					return VM_FAULT_MEMORY_SHORTAGE;
2012 				}
2013 				vm_page_insert(m, object, vm_object_trunc_page(offset));
2014 			}
2015 			if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
2016 				m->vmp_absent = TRUE;
2017 				clear_absent_on_error = true;
2018 			}
2019 
2020 			my_fault = vm_fault_zero_page(m, no_zero_fill);
2021 
2022 			break;
2023 		} else {
2024 			/*
2025 			 * Move on to the next object.  Lock the next
2026 			 * object before unlocking the current one.
2027 			 */
2028 			if ((object != first_object) || must_be_resident) {
2029 				vm_object_paging_end(object);
2030 			}
2031 
2032 			offset += object->vo_shadow_offset;
2033 			fault_info->lo_offset += object->vo_shadow_offset;
2034 			fault_info->hi_offset += object->vo_shadow_offset;
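			/*
			 * From here down the shadow chain, read access is all we
			 * need from the backing objects; a write fault will be
			 * satisfied by the copy-on-write path once the page has
			 * been found.
			 */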
2035 			access_required = VM_PROT_READ;
2036 
2037 			vm_object_lock(next_object);
2038 			vm_object_unlock(object);
2039 
2040 			object = next_object;
2041 			vm_object_paging_begin(object);
2042 		}
2043 	}
2044 
2045 	/*
2046 	 *	PAGE HAS BEEN FOUND.
2047 	 *
2048 	 *	This page (m) is:
2049 	 *		busy, so that we can play with it;
2050 	 *		not absent, so that nobody else will fill it;
2051 	 *		possibly eligible for pageout;
2052 	 *
2053 	 *	The top-level page (first_m) is:
2054 	 *		VM_PAGE_NULL if the page was found in the
2055 	 *		 top-level object;
2056 	 *		busy, not absent, and ineligible for pageout.
2057 	 *
2058 	 *	The current object (object) is locked.  A paging
2059 	 *	reference is held for the current and top-level
2060 	 *	objects.
2061 	 */
2062 
2063 #if TRACEFAULTPAGE
2064 	dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
2065 #endif
2066 #if     EXTRA_ASSERTIONS
2067 	assert(m->vmp_busy && !m->vmp_absent);
2068 	assert((first_m == VM_PAGE_NULL) ||
2069 	    (first_m->vmp_busy && !first_m->vmp_absent &&
2070 	    !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
2071 #endif  /* EXTRA_ASSERTIONS */
2072 
2073 	/*
2074 	 * If the page is being written, but isn't
2075 	 * already owned by the top-level object,
2076 	 * we have to copy it into a new page owned
2077 	 * by the top-level object.
2078 	 */
2079 	if (object != first_object) {
2080 #if TRACEFAULTPAGE
2081 		dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2082 #endif
2083 		if (fault_type & VM_PROT_WRITE) {
2084 			vm_page_t copy_m;
2085 
2086 			/*
2087 			 * We only really need to copy if we
2088 			 * want to write it.
2089 			 */
2090 			assert(!must_be_resident);
2091 
2092 			/*
2093 			 * If we try to collapse first_object at this
2094 			 * point, we may deadlock when we try to get
2095 			 * the lock on an intermediate object (since we
2096 			 * have the bottom object locked).  We can't
2097 			 * unlock the bottom object, because the page
2098 			 * we found may move (by collapse) if we do.
2099 			 *
2100 			 * Instead, we first copy the page.  Then, when
2101 			 * we have no more use for the bottom object,
2102 			 * we unlock it and try to collapse.
2103 			 *
2104 			 * Note that we copy the page even if we didn't
2105 			 * need to... that's the breaks.
2106 			 */
2107 
2108 			/*
2109 			 * Allocate a page for the copy
2110 			 */
2111 			copy_m = vm_page_grab_options(grab_options);
2112 
2113 			if (copy_m == VM_PAGE_NULL) {
2114 				RELEASE_PAGE(m);
2115 
2116 				vm_fault_cleanup(object, first_m);
2117 				thread_interrupt_level(interruptible_state);
2118 
2119 				return VM_FAULT_MEMORY_SHORTAGE;
2120 			}
2121 
2122 			vm_page_copy(m, copy_m);
2123 
2124 			/*
2125 			 * If another map is truly sharing this
2126 			 * page with us, we have to flush all
2127 			 * uses of the original page, since we
2128 			 * can't distinguish those which want the
2129 			 * original from those which need the
2130 			 * new copy.
2131 			 *
2132 			 * XXXO If we know that only one map has
2133 			 * access to this page, then we could
2134 			 * avoid the pmap_disconnect() call.
2135 			 */
2136 			if (m->vmp_pmapped) {
2137 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2138 			}
2139 
2140 			if (m->vmp_clustered) {
2141 				VM_PAGE_COUNT_AS_PAGEIN(m);
2142 				VM_PAGE_CONSUME_CLUSTERED(m);
2143 			}
2144 			assert(!m->vmp_cleaning);
2145 
2146 			/*
2147 			 * We no longer need the old page or object.
2148 			 */
2149 			RELEASE_PAGE(m);
2150 
2151 			/*
2152 			 * This check helps mark the object as having a sequential access pattern.
2153 			 * Normally we'd miss doing this below, because this fault is a COW into
2154 			 * first_object: we bring the page in from disk and push it to the object
2155 			 * above, but never update the file object's sequential pattern.
2156 			 */
2157 			if (object->internal == FALSE) {
2158 				vm_fault_is_sequential(object, offset, fault_info->behavior);
2159 			}
2160 
2161 			vm_object_paging_end(object);
2162 			vm_object_unlock(object);
2163 
2164 			my_fault = DBG_COW_FAULT;
2165 			counter_inc(&vm_statistics_cow_faults);
2166 			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2167 			counter_inc(&current_task()->cow_faults);
2168 
2169 			object = first_object;
2170 			offset = first_offset;
2171 
2172 			vm_object_lock(object);
2173 			/*
2174 			 * get rid of the place holder
2175 			 * page that we soldered in earlier
2176 			 */
2177 			VM_PAGE_FREE(first_m);
2178 			first_m = VM_PAGE_NULL;
2179 
2180 			/*
2181 			 * and replace it with the
2182 			 * page we just copied into
2183 			 */
2184 			assert(copy_m->vmp_busy);
2185 			vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
2186 			SET_PAGE_DIRTY(copy_m, TRUE);
2187 
2188 			m = copy_m;
2189 			/*
2190 			 * Now that we've gotten the copy out of the
2191 			 * way, let's try to collapse the top object.
2192 			 * But we have to play ugly games with
2193 			 * paging_in_progress to do that...
2194 			 */
2195 			vm_object_paging_end(object);
2196 			vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2197 			vm_object_paging_begin(object);
2198 		} else {
2199 			*protection &= (~VM_PROT_WRITE);
2200 		}
2201 	}
2202 	/*
2203 	 * Now check whether the page needs to be pushed into the
2204 	 * copy object.  The use of asymmetric copy on write for
2205 	 * shared temporary objects means that we may do two copies to
2206 	 * satisfy the fault; one above to get the page from a
2207 	 * shadowed object, and one here to push it into the copy.
2208 	 */
2209 	try_failed_count = 0;
2210 
2211 	while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2212 		vm_object_offset_t      copy_offset;
2213 		vm_page_t               copy_m;
2214 
2215 #if TRACEFAULTPAGE
2216 		dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2217 #endif
2218 		/*
2219 		 * If the page is being written, but hasn't been
2220 		 * copied to the copy-object, we have to copy it there.
2221 		 */
2222 		if ((fault_type & VM_PROT_WRITE) == 0) {
2223 			*protection &= ~VM_PROT_WRITE;
2224 			break;
2225 		}
2226 
2227 		/*
2228 		 * If the page was guaranteed to be resident,
2229 		 * we must have already performed the copy.
2230 		 */
2231 		if (must_be_resident) {
2232 			break;
2233 		}
2234 
2235 		/*
2236 		 * Try to get the lock on the copy_object.
2237 		 */
2238 		if (!vm_object_lock_try(copy_object)) {
2239 			vm_object_unlock(object);
2240 			try_failed_count++;
2241 
2242 			mutex_pause(try_failed_count);  /* wait a bit */
2243 			vm_object_lock(object);
2244 
2245 			continue;
2246 		}
2247 		try_failed_count = 0;
2248 
2249 		/*
2250 		 * Make another reference to the copy-object,
2251 		 * to keep it from disappearing during the
2252 		 * copy.
2253 		 */
2254 		vm_object_reference_locked(copy_object);
2255 
2256 		/*
2257 		 * Does the page exist in the copy?
2258 		 */
2259 		copy_offset = first_offset - copy_object->vo_shadow_offset;
2260 		copy_offset = vm_object_trunc_page(copy_offset);
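		/*
		 * first_offset is expressed in first_object's space; subtracting
		 * the copy object's vo_shadow_offset translates it into the copy
		 * object's own offset space.
		 */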
2261 
2262 		if (copy_object->vo_size <= copy_offset) {
2263 			/*
2264 			 * Copy object doesn't cover this page -- do nothing.
2265 			 */
2266 			;
2267 		} else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2268 			/*
2269 			 * Page currently exists in the copy object
2270 			 */
2271 			if (copy_m->vmp_busy) {
2272 				/*
2273 				 * If the page is being brought
2274 				 * in, wait for it and then retry.
2275 				 */
2276 				RELEASE_PAGE(m);
2277 
2278 				/*
2279 				 * take an extra ref so object won't die
2280 				 */
2281 				vm_object_reference_locked(copy_object);
2282 				vm_object_unlock(copy_object);
2283 				vm_fault_cleanup(object, first_m);
2284 
2285 				vm_object_lock(copy_object);
2286 				assert(copy_object->ref_count > 0);
2287 				vm_object_lock_assert_exclusive(copy_object);
2288 				copy_object->ref_count--;
2289 				assert(copy_object->ref_count > 0);
2290 				copy_m = vm_page_lookup(copy_object, copy_offset);
2291 
2292 				if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2293 					PAGE_ASSERT_WAIT(copy_m, interruptible);
2294 
2295 					vm_object_unlock(copy_object);
2296 					wait_result = thread_block(THREAD_CONTINUE_NULL);
2297 					vm_object_deallocate(copy_object);
2298 
2299 					goto backoff;
2300 				} else {
2301 					vm_object_unlock(copy_object);
2302 					vm_object_deallocate(copy_object);
2303 					thread_interrupt_level(interruptible_state);
2304 
2305 					return VM_FAULT_RETRY;
2306 				}
2307 			}
2308 		} else if (!PAGED_OUT(copy_object, copy_offset)) {
2309 			/*
2310 			 * If PAGED_OUT is TRUE, then the page used to exist
2311 			 * in the copy-object, and has already been paged out.
2312 			 * We don't need to repeat this. If PAGED_OUT is
2313 			 * FALSE, then either we don't know (!pager_created,
2314 			 * for example) or it hasn't been paged out.
2315 			 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2316 			 * We must copy the page to the copy object.
2317 			 *
2318 			 * Allocate a page for the copy
2319 			 */
2320 			copy_m = vm_page_alloc(copy_object, copy_offset);
2321 
2322 			if (copy_m == VM_PAGE_NULL) {
2323 				RELEASE_PAGE(m);
2324 
2325 				vm_object_lock_assert_exclusive(copy_object);
2326 				copy_object->ref_count--;
2327 				assert(copy_object->ref_count > 0);
2328 
2329 				vm_object_unlock(copy_object);
2330 				vm_fault_cleanup(object, first_m);
2331 				thread_interrupt_level(interruptible_state);
2332 
2333 				return VM_FAULT_MEMORY_SHORTAGE;
2334 			}
2335 			/*
2336 			 * Must copy page into copy-object.
2337 			 */
2338 			vm_page_copy(m, copy_m);
2339 
2340 			/*
2341 			 * If the old page was in use by any users
2342 			 * of the copy-object, it must be removed
2343 			 * from all pmaps.  (We can't know which
2344 			 * pmaps use it.)
2345 			 */
2346 			if (m->vmp_pmapped) {
2347 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2348 			}
2349 
2350 			if (m->vmp_clustered) {
2351 				VM_PAGE_COUNT_AS_PAGEIN(m);
2352 				VM_PAGE_CONSUME_CLUSTERED(m);
2353 			}
2354 			/*
2355 			 * If there's a pager, then immediately
2356 			 * page out this page, using the "initialize"
2357 			 * option.  Else, we use the copy.
2358 			 */
2359 			if ((!copy_object->pager_ready)
2360 			    || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2361 			    ) {
2362 				vm_page_lockspin_queues();
2363 				assert(!m->vmp_cleaning);
2364 				vm_page_activate(copy_m);
2365 				vm_page_unlock_queues();
2366 
2367 				SET_PAGE_DIRTY(copy_m, TRUE);
2368 				PAGE_WAKEUP_DONE(copy_m);
2369 			} else {
2370 				assert(copy_m->vmp_busy == TRUE);
2371 				assert(!m->vmp_cleaning);
2372 
2373 				/*
2374 				 * dirty is protected by the object lock
2375 				 */
2376 				SET_PAGE_DIRTY(copy_m, TRUE);
2377 
2378 				/*
2379 				 * The page is already ready for pageout:
2380 				 * not on pageout queues and busy.
2381 				 * Unlock everything except the
2382 				 * copy_object itself.
2383 				 */
2384 				vm_object_unlock(object);
2385 
2386 				/*
2387 				 * Write the page to the copy-object,
2388 				 * flushing it from the kernel.
2389 				 */
2390 				vm_pageout_initialize_page(copy_m);
2391 
2392 				/*
2393 				 * Since the pageout may have
2394 				 * temporarily dropped the
2395 				 * copy_object's lock, we
2396 				 * check whether we'll have
2397 				 * to deallocate the hard way.
2398 				 */
2399 				if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2400 					vm_object_unlock(copy_object);
2401 					vm_object_deallocate(copy_object);
2402 					vm_object_lock(object);
2403 
2404 					continue;
2405 				}
2406 				/*
2407 				 * Pick back up the old object's
2408 				 * lock.  [It is safe to do so,
2409 				 * since it must be deeper in the
2410 				 * object tree.]
2411 				 */
2412 				vm_object_lock(object);
2413 			}
2414 
2415 			/*
2416 			 * Because we're pushing a page upward
2417 			 * in the object tree, we must restart
2418 			 * any faults that are waiting here.
2419 			 * [Note that this is an expansion of
2420 			 * PAGE_WAKEUP that uses the THREAD_RESTART
2421 			 * wait result].  Can't turn off the page's
2422 			 * busy bit because we're not done with it.
2423 			 */
2424 			if (m->vmp_wanted) {
2425 				m->vmp_wanted = FALSE;
2426 				thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2427 			}
2428 		}
2429 		/*
2430 		 * The reference count on copy_object must be
2431 		 * at least 2: one for our extra reference,
2432 		 * and at least one from the outside world
2433 		 * (we checked that when we last locked
2434 		 * copy_object).
2435 		 */
2436 		vm_object_lock_assert_exclusive(copy_object);
2437 		copy_object->ref_count--;
2438 		assert(copy_object->ref_count > 0);
2439 
2440 		vm_object_unlock(copy_object);
2441 
2442 		break;
2443 	}
2444 
2445 done:
2446 	*result_page = m;
2447 	*top_page = first_m;
2448 
2449 	if (m != VM_PAGE_NULL) {
2450 		assert(VM_PAGE_OBJECT(m) == object);
2451 
2452 		retval = VM_FAULT_SUCCESS;
2453 
2454 		if (my_fault == DBG_PAGEIN_FAULT) {
2455 			VM_PAGE_COUNT_AS_PAGEIN(m);
2456 
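			/*
			 * Refine the fault type for accounting: pageins from
			 * internal (anonymous) objects and from external
			 * (file-backed) objects are reported as distinct types.
			 */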
2457 			if (object->internal) {
2458 				my_fault = DBG_PAGEIND_FAULT;
2459 			} else {
2460 				my_fault = DBG_PAGEINV_FAULT;
2461 			}
2462 
2463 			/*
2464 			 * evaluate access pattern and update state
2465 			 * vm_fault_deactivate_behind depends on the
2466 			 * state being up to date
2467 			 */
2468 			vm_fault_is_sequential(object, offset, fault_info->behavior);
2469 			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2470 		} else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2471 			/*
2472 			 * we weren't called from vm_fault, so handle the
2473 			 * accounting here for hits in the cache
2474 			 */
2475 			if (m->vmp_clustered) {
2476 				VM_PAGE_COUNT_AS_PAGEIN(m);
2477 				VM_PAGE_CONSUME_CLUSTERED(m);
2478 			}
2479 			vm_fault_is_sequential(object, offset, fault_info->behavior);
2480 			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2481 		} else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2482 			VM_STAT_DECOMPRESSIONS();
2483 		}
2484 		if (type_of_fault) {
2485 			*type_of_fault = my_fault;
2486 		}
2487 	} else {
2488 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUCCESS_NO_PAGE), 0 /* arg */);
2489 		retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2490 		assert(first_m == VM_PAGE_NULL);
2491 		assert(object == first_object);
2492 	}
2493 
2494 	thread_interrupt_level(interruptible_state);
2495 
2496 #if TRACEFAULTPAGE
2497 	dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2498 #endif
2499 	return retval;
2500 
2501 backoff:
2502 	thread_interrupt_level(interruptible_state);
2503 
2504 	if (wait_result == THREAD_INTERRUPTED) {
2505 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
2506 		return VM_FAULT_INTERRUPTED;
2507 	}
2508 	return VM_FAULT_RETRY;
2509 
2510 #undef  RELEASE_PAGE
2511 }
2512 
2513 #if MACH_ASSERT && (XNU_PLATFORM_WatchOS || __x86_64__)
2514 #define PANIC_ON_CS_KILLED_DEFAULT true
2515 #else
2516 #define PANIC_ON_CS_KILLED_DEFAULT false
2517 #endif
2518 static TUNABLE(bool, panic_on_cs_killed, "panic_on_cs_killed",
2519     PANIC_ON_CS_KILLED_DEFAULT);
2520 
2521 extern int proc_selfpid(void);
2522 extern char *proc_name_address(void *p);
2523 unsigned long cs_enter_tainted_rejected = 0;
2524 unsigned long cs_enter_tainted_accepted = 0;
2525 
2526 /*
2527  * CODE SIGNING:
2528  * When soft faulting a page, we have to validate the page if:
2529  * 1. the page is being mapped in user space
2530  * 2. the page hasn't already been found to be "tainted"
2531  * 3. the page belongs to a code-signed object
2532  * 4. the page has not been validated yet or has been mapped for write.
2533  */
2534 static bool
2535 vm_fault_cs_need_validation(
2536 	pmap_t pmap,
2537 	vm_page_t page,
2538 	vm_object_t page_obj,
2539 	vm_map_size_t fault_page_size,
2540 	vm_map_offset_t fault_phys_offset)
2541 {
2542 	if (pmap == kernel_pmap) {
2543 		/* 1 - not user space */
2544 		return false;
2545 	}
2546 	if (!page_obj->code_signed) {
2547 		/* 3 - page does not belong to a code-signed object */
2548 		return false;
2549 	}
2550 	if (fault_page_size == PAGE_SIZE) {
2551 		/* looking at the whole page */
2552 		assertf(fault_phys_offset == 0,
2553 		    "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2554 		    (uint64_t)fault_page_size,
2555 		    (uint64_t)fault_phys_offset);
2556 		if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2557 			/* 2 - page is all tainted */
2558 			return false;
2559 		}
2560 		if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2561 		    !page->vmp_wpmapped) {
2562 			/* 4 - already fully validated and never mapped writable */
2563 			return false;
2564 		}
2565 	} else {
2566 		/* looking at a specific sub-page */
2567 		if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
2568 			/* 2 - sub-page was already marked as tainted */
2569 			return false;
2570 		}
2571 		if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
2572 		    !page->vmp_wpmapped) {
2573 			/* 4 - already validated and never mapped writable */
2574 			return false;
2575 		}
2576 	}
2577 	/* page needs to be validated */
2578 	return true;
2579 }
2580 
2581 
2582 static bool
2583 vm_fault_cs_page_immutable(
2584 	vm_page_t m,
2585 	vm_map_size_t fault_page_size,
2586 	vm_map_offset_t fault_phys_offset,
2587 	vm_prot_t prot __unused)
2588 {
2589 	if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
2590 	    /*&& ((prot) & VM_PROT_EXECUTE)*/) {
2591 		return true;
2592 	}
2593 	return false;
2594 }
2595 
2596 static bool
2597 vm_fault_cs_page_nx(
2598 	vm_page_t m,
2599 	vm_map_size_t fault_page_size,
2600 	vm_map_offset_t fault_phys_offset)
2601 {
2602 	return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2603 }
2604 
2605 /*
2606  * Check if the page being entered into the pmap violates code signing.
2607  */
2608 static kern_return_t
2609 vm_fault_cs_check_violation(
2610 	bool cs_bypass,
2611 	vm_object_t object,
2612 	vm_page_t m,
2613 	pmap_t pmap,
2614 	vm_prot_t prot,
2615 	vm_prot_t caller_prot,
2616 	vm_map_size_t fault_page_size,
2617 	vm_map_offset_t fault_phys_offset,
2618 	vm_object_fault_info_t fault_info,
2619 	bool map_is_switched,
2620 	bool map_is_switch_protected,
2621 	bool *cs_violation)
2622 {
2623 #if !PMAP_CS
2624 #pragma unused(caller_prot)
2625 #pragma unused(fault_info)
2626 #endif /* !PMAP_CS */
2627 	int             cs_enforcement_enabled;
2628 	if (!cs_bypass &&
2629 	    vm_fault_cs_need_validation(pmap, m, object,
2630 	    fault_page_size, fault_phys_offset)) {
2631 		vm_object_lock_assert_exclusive(object);
2632 
2633 		if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
2634 			vm_cs_revalidates++;
2635 		}
2636 
2637 		/* VM map is locked, so 1 ref will remain on VM object -
2638 		 * so no harm if vm_page_validate_cs drops the object lock */
2639 
2640 		vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2641 	}
2642 
2643 	/* If the map is switched, and is switch-protected, we must protect
2644 	 * some pages from being write-faulted: immutable pages because by
2645 	 * definition they may not be written, and executable pages because that
2646 	 * would provide a way to inject unsigned code.
2647 	 * If the page is immutable, we can simply return. However, we can't
2648 	 * immediately determine whether a page is executable anywhere. But,
2649 	 * we can disconnect it everywhere and remove the executable protection
2650 	 * from the current map. We do that below right before we do the
2651 	 * PMAP_ENTER.
2652 	 */
2653 	if (pmap == kernel_pmap) {
2654 		/* kernel fault: cs_enforcement does not apply */
2655 		cs_enforcement_enabled = 0;
2656 	} else {
2657 		cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
2658 	}
2659 
2660 	if (cs_enforcement_enabled && map_is_switched &&
2661 	    map_is_switch_protected &&
2662 	    vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2663 	    (prot & VM_PROT_WRITE)) {
2664 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_IMMUTABLE_PAGE_WRITE), 0 /* arg */);
2665 		return KERN_CODESIGN_ERROR;
2666 	}
2667 
2668 	if (cs_enforcement_enabled &&
2669 	    vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
2670 	    (prot & VM_PROT_EXECUTE)) {
2671 		if (cs_debug) {
2672 			printf("page marked to be NX, not letting it be mapped EXEC\n");
2673 		}
2674 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_NX_PAGE_EXEC_MAPPING), 0 /* arg */);
2675 		return KERN_CODESIGN_ERROR;
2676 	}
2677 
2678 	/* A page could be tainted, or pose a risk of being tainted later.
2679 	 * Check whether the receiving process wants it, and make it feel
2680 	 * the consequences (that happens in cs_invalid_page()).
2681 	 * For CS Enforcement, two other conditions will
2682 	 * cause that page to be tainted as well:
2683 	 * - pmapping an unsigned page executable - this means unsigned code;
2684 	 * - writeable mapping of a validated page - the content of that page
2685 	 *   can be changed without the kernel noticing, therefore unsigned
2686 	 *   code can be created
2687 	 */
2688 	if (cs_bypass) {
2689 		/* code-signing is bypassed */
2690 		*cs_violation = FALSE;
2691 	} else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2692 		/* tainted page */
2693 		*cs_violation = TRUE;
2694 	} else if (!cs_enforcement_enabled) {
2695 		/* no further code-signing enforcement */
2696 		*cs_violation = FALSE;
2697 	} else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2698 	    ((prot & VM_PROT_WRITE) ||
2699 	    m->vmp_wpmapped)) {
2700 		/*
2701 		 * The page should be immutable, but is in danger of being
2702 		 * modified.
2703 		 * This is the case where we want policy from the code
2704 		 * directory - is the page immutable or not? For now we have
2705 		 * to assume that code pages will be immutable, data pages not.
2706 		 * We'll assume a page is a code page if it has a code directory
2707 		 * and we fault for execution.
2708 		 * That is good enough since if we faulted the code page for
2709 		 * writing in another map before, it is wpmapped; if we fault
2710 		 * it for writing in this map later it will also be faulted for
2711 		 * executing at the same time; and if we fault for writing in
2712 		 * another map later, we will disconnect it from this pmap so
2713 		 * we'll notice the change.
2714 		 */
2715 		*cs_violation = TRUE;
2716 	} else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2717 	    (prot & VM_PROT_EXECUTE)
2718 	    ) {
2719 		*cs_violation = TRUE;
2720 	} else {
2721 		*cs_violation = FALSE;
2722 	}
2723 	return KERN_SUCCESS;
2724 }
2725 
2726 /*
2727  * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2728  * @param must_disconnect This value will be set to true if the caller must disconnect
2729  * this page.
2730  * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2731  */
2732 static kern_return_t
2733 vm_fault_cs_handle_violation(
2734 	vm_object_t object,
2735 	vm_page_t m,
2736 	pmap_t pmap,
2737 	vm_prot_t prot,
2738 	vm_map_offset_t vaddr,
2739 	vm_map_size_t fault_page_size,
2740 	vm_map_offset_t fault_phys_offset,
2741 	bool map_is_switched,
2742 	bool map_is_switch_protected,
2743 	bool *must_disconnect)
2744 {
2745 #if !MACH_ASSERT
2746 #pragma unused(pmap)
2747 #pragma unused(map_is_switch_protected)
2748 #endif /* !MACH_ASSERT */
2749 	/*
2750 	 * We will have a tainted page. Have to handle the special case
2751 	 * of a switched map now. If the map is not switched, standard
2752 	 * procedure applies - call cs_invalid_page().
2753 	 * If the map is switched, the real owner is invalid already.
2754 	 * There is no point in invalidating the switching process since
2755 	 * it will not be executing from the map. So we don't call
2756 	 * cs_invalid_page() in that case.
2757 	 */
2758 	boolean_t reject_page, cs_killed;
2759 	kern_return_t kr;
2760 	if (map_is_switched) {
2761 		assert(pmap == vm_map_pmap(current_thread()->map));
2762 		assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2763 		reject_page = FALSE;
2764 	} else {
2765 		if (cs_debug > 5) {
2766 			printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2767 			    object->code_signed ? "yes" : "no",
2768 			    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2769 			    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2770 			    m->vmp_wpmapped ? "yes" : "no",
2771 			    (int)prot);
2772 		}
2773 		reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2774 	}
2775 
2776 	if (reject_page) {
2777 		/* reject the invalid page: abort the page fault */
2778 		int                     pid;
2779 		const char              *procname;
2780 		task_t                  task;
2781 		vm_object_t             file_object, shadow;
2782 		vm_object_offset_t      file_offset;
2783 		char                    *pathname, *filename;
2784 		vm_size_t               pathname_len, filename_len;
2785 		boolean_t               truncated_path;
2786 #define __PATH_MAX 1024
2787 		struct timespec         mtime, cs_mtime;
2788 		int                     shadow_depth;
2789 		os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2790 
2791 		kr = KERN_CODESIGN_ERROR;
2792 		cs_enter_tainted_rejected++;
2793 
2794 		/* get process name and pid */
2795 		procname = "?";
2796 		task = current_task();
2797 		pid = proc_selfpid();
2798 		if (get_bsdtask_info(task) != NULL) {
2799 			procname = proc_name_address(get_bsdtask_info(task));
2800 		}
2801 
2802 		/* get file's VM object */
2803 		file_object = object;
2804 		file_offset = m->vmp_offset;
2805 		for (shadow = file_object->shadow,
2806 		    shadow_depth = 0;
2807 		    shadow != VM_OBJECT_NULL;
2808 		    shadow = file_object->shadow,
2809 		    shadow_depth++) {
2810 			vm_object_lock_shared(shadow);
2811 			if (file_object != object) {
2812 				vm_object_unlock(file_object);
2813 			}
2814 			file_offset += file_object->vo_shadow_offset;
2815 			file_object = shadow;
2816 		}
2817 
2818 		mtime.tv_sec = 0;
2819 		mtime.tv_nsec = 0;
2820 		cs_mtime.tv_sec = 0;
2821 		cs_mtime.tv_nsec = 0;
2822 
2823 		/* get file's pathname and/or filename */
2824 		pathname = NULL;
2825 		filename = NULL;
2826 		pathname_len = 0;
2827 		filename_len = 0;
2828 		truncated_path = FALSE;
2829 		/* no pager -> no file -> no pathname, use "<nil>" in that case */
2830 		if (file_object->pager != NULL) {
2831 			pathname = kalloc_data(__PATH_MAX * 2, Z_WAITOK);
2832 			if (pathname) {
2833 				pathname[0] = '\0';
2834 				pathname_len = __PATH_MAX;
2835 				filename = pathname + pathname_len;
2836 				filename_len = __PATH_MAX;
2837 
2838 				if (vnode_pager_get_object_name(file_object->pager,
2839 				    pathname,
2840 				    pathname_len,
2841 				    filename,
2842 				    filename_len,
2843 				    &truncated_path) == KERN_SUCCESS) {
2844 					/* safety first... */
2845 					pathname[__PATH_MAX - 1] = '\0';
2846 					filename[__PATH_MAX - 1] = '\0';
2847 
2848 					vnode_pager_get_object_mtime(file_object->pager,
2849 					    &mtime,
2850 					    &cs_mtime);
2851 				} else {
2852 					kfree_data(pathname, __PATH_MAX * 2);
2853 					pathname = NULL;
2854 					filename = NULL;
2855 					pathname_len = 0;
2856 					filename_len = 0;
2857 					truncated_path = FALSE;
2858 				}
2859 			}
2860 		}
2861 		printf("CODE SIGNING: process %d[%s]: "
2862 		    "rejecting invalid page at address 0x%llx "
2863 		    "from offset 0x%llx in file \"%s%s%s\" "
2864 		    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2865 		    "(signed:%d validated:%d tainted:%d nx:%d "
2866 		    "wpmapped:%d dirty:%d depth:%d)\n",
2867 		    pid, procname, (addr64_t) vaddr,
2868 		    file_offset,
2869 		    (pathname ? pathname : "<nil>"),
2870 		    (truncated_path ? "/.../" : ""),
2871 		    (truncated_path ? filename : ""),
2872 		    cs_mtime.tv_sec, cs_mtime.tv_nsec,
2873 		    ((cs_mtime.tv_sec == mtime.tv_sec &&
2874 		    cs_mtime.tv_nsec == mtime.tv_nsec)
2875 		    ? "=="
2876 		    : "!="),
2877 		    mtime.tv_sec, mtime.tv_nsec,
2878 		    object->code_signed,
2879 		    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2880 		    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2881 		    VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2882 		    m->vmp_wpmapped,
2883 		    m->vmp_dirty,
2884 		    shadow_depth);
2885 
2886 		/*
2887 		 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2888 		 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2889 		 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2890 		 * will deal with the segmentation fault.
2891 		 */
2892 		if (cs_killed) {
2893 			KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2894 			    pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2895 
2896 			codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2897 			if (codesigning_exit_reason == NULL) {
2898 				printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2899 			} else {
2900 				mach_vm_address_t data_addr = 0;
2901 				struct codesigning_exit_reason_info *ceri = NULL;
2902 				uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2903 
2904 				if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2905 					printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2906 				} else {
2907 					if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2908 					    EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2909 						ceri = (struct codesigning_exit_reason_info *)data_addr;
2910 						static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2911 
2912 						ceri->ceri_virt_addr = vaddr;
2913 						ceri->ceri_file_offset = file_offset;
2914 						if (pathname) {
2915 							strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2916 						} else {
2917 							ceri->ceri_pathname[0] = '\0';
2918 						}
2919 						if (filename) {
2920 							strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2921 						} else {
2922 							ceri->ceri_filename[0] = '\0';
2923 						}
2924 						ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
2925 						ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2926 						ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2927 						ceri->ceri_page_modtime_secs = mtime.tv_sec;
2928 						ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2929 						ceri->ceri_object_codesigned = (object->code_signed);
2930 						ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
2931 						ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
2932 						ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2933 						ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2934 						ceri->ceri_page_slid = 0;
2935 						ceri->ceri_page_dirty = (m->vmp_dirty);
2936 						ceri->ceri_page_shadow_depth = shadow_depth;
2937 					} else {
2938 #if DEBUG || DEVELOPMENT
2939 						panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2940 #else
2941 						printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2942 #endif /* DEBUG || DEVELOPMENT */
2943 						/* Free the buffer */
2944 						os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2945 					}
2946 				}
2947 			}
2948 
2949 			set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2950 		}
2951 		if (panic_on_cs_killed &&
2952 		    object->object_is_shared_cache) {
2953 			char *tainted_contents;
2954 			vm_map_offset_t src_vaddr;
2955 			src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
2956 			tainted_contents = kalloc_data(PAGE_SIZE, Z_WAITOK);
2957 			bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
2958 			printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
2959 			panic("CODE SIGNING: process %d[%s]: "
2960 			    "rejecting invalid page (phys#0x%x) at address 0x%llx "
2961 			    "from offset 0x%llx in file \"%s%s%s\" "
2962 			    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2963 			    "(signed:%d validated:%d tainted:%d nx:%d "
2964 			    "wpmapped:%d dirty:%d depth:%d)\n",
2965 			    pid, procname,
2966 			    VM_PAGE_GET_PHYS_PAGE(m),
2967 			    (addr64_t) vaddr,
2968 			    file_offset,
2969 			    (pathname ? pathname : "<nil>"),
2970 			    (truncated_path ? "/.../" : ""),
2971 			    (truncated_path ? filename : ""),
2972 			    cs_mtime.tv_sec, cs_mtime.tv_nsec,
2973 			    ((cs_mtime.tv_sec == mtime.tv_sec &&
2974 			    cs_mtime.tv_nsec == mtime.tv_nsec)
2975 			    ? "=="
2976 			    : "!="),
2977 			    mtime.tv_sec, mtime.tv_nsec,
2978 			    object->code_signed,
2979 			    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2980 			    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2981 			    VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2982 			    m->vmp_wpmapped,
2983 			    m->vmp_dirty,
2984 			    shadow_depth);
2985 		}
2986 
2987 		if (file_object != object) {
2988 			vm_object_unlock(file_object);
2989 		}
2990 		if (pathname_len != 0) {
2991 			kfree_data(pathname, __PATH_MAX * 2);
2992 			pathname = NULL;
2993 			filename = NULL;
2994 		}
2995 	} else {
2996 		/* proceed with the invalid page */
2997 		kr = KERN_SUCCESS;
2998 		if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2999 		    !object->code_signed) {
3000 			/*
3001 			 * This page has not been (fully) validated but
3002 			 * does not belong to a code-signed object
3003 			 * so it should not be forcefully considered
3004 			 * as tainted.
3005 			 * We're just concerned about it here because
3006 			 * we've been asked to "execute" it but that
3007 			 * does not mean that it should cause other
3008 			 * accesses to fail.
3009 			 * This happens when a debugger sets a
3010 			 * breakpoint and we then execute code in
3011 			 * that page.  Marking the page as "tainted"
3012 			 * would cause any inspection tool ("leaks",
3013 			 * "vmmap", "CrashReporter", ...) to get killed
3014 			 * due to code-signing violation on that page,
3015 			 * even though they're just reading it and not
3016 			 * executing from it.
3017 			 */
3018 		} else {
3019 			/*
3020 			 * Page might have been tainted before or not;
3021 			 * now it definitively is. If the page wasn't
3022 			 * tainted, we must disconnect it from all
3023 			 * pmaps later, to force existing mappings
3024 			 * through that code path for re-consideration
3025 			 * of the validity of that page.
3026 			 */
3027 			if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
3028 				*must_disconnect = TRUE;
3029 				VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
3030 			}
3031 		}
3032 		cs_enter_tainted_accepted++;
3033 	}
3034 	if (kr != KERN_SUCCESS) {
3035 		if (cs_debug) {
3036 			printf("CODESIGNING: vm_fault_enter(0x%llx): "
3037 			    "*** INVALID PAGE ***\n",
3038 			    (long long)vaddr);
3039 		}
3040 #if !SECURE_KERNEL
3041 		if (cs_enforcement_panic) {
3042 			panic("CODESIGNING: panicking on invalid page");
3043 		}
3044 #endif
3045 	}
3046 	return kr;
3047 }
3048 
3049 /*
3050  * Check that the code signature is valid for the given page being inserted into
3051  * the pmap.
3052  *
3053  * @param must_disconnect This value will be set to true if the caller must disconnect
3054  * this page.
3055  * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
3056  */
3057 static kern_return_t
3058 vm_fault_validate_cs(
3059 	bool cs_bypass,
3060 	vm_object_t object,
3061 	vm_page_t m,
3062 	pmap_t pmap,
3063 	vm_map_offset_t vaddr,
3064 	vm_prot_t prot,
3065 	vm_prot_t caller_prot,
3066 	vm_map_size_t fault_page_size,
3067 	vm_map_offset_t fault_phys_offset,
3068 	vm_object_fault_info_t fault_info,
3069 	bool *must_disconnect)
3070 {
3071 	bool map_is_switched, map_is_switch_protected, cs_violation;
3072 	kern_return_t kr;
3073 	/* Validate code signature if necessary. */
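	/*
	 * "Switched" here means the pmap being entered belongs to the current
	 * thread's map but not to the current task's map, i.e. the thread is
	 * temporarily operating on another map.
	 */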
3074 	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
3075 	    (pmap == vm_map_pmap(current_thread()->map)));
3076 	map_is_switch_protected = current_thread()->map->switch_protect;
3077 	kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
3078 	    prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
3079 	    map_is_switched, map_is_switch_protected, &cs_violation);
3080 	if (kr != KERN_SUCCESS) {
3081 		return kr;
3082 	}
3083 	if (cs_violation) {
3084 		kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
3085 		    fault_page_size, fault_phys_offset,
3086 		    map_is_switched, map_is_switch_protected, must_disconnect);
3087 	}
3088 	return kr;
3089 }
3090 
3091 /*
3092  * Enqueue the page on the appropriate paging queue.
3093  */
3094 static void
3095 vm_fault_enqueue_page(
3096 	vm_object_t object,
3097 	vm_page_t m,
3098 	bool wired,
3099 	bool change_wiring,
3100 	vm_tag_t wire_tag,
3101 	bool no_cache,
3102 	int *type_of_fault,
3103 	kern_return_t kr)
3104 {
3105 	assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
3106 	boolean_t       page_queues_locked = FALSE;
3107 	boolean_t       previously_pmapped = m->vmp_pmapped;
3108 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
3109 MACRO_BEGIN                                     \
3110 	if (! page_queues_locked) {             \
3111 	        page_queues_locked = TRUE;      \
3112 	        vm_page_lockspin_queues();      \
3113 	}                                       \
3114 MACRO_END
3115 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
3116 MACRO_BEGIN                                     \
3117 	if (page_queues_locked) {               \
3118 	        page_queues_locked = FALSE;     \
3119 	        vm_page_unlock_queues();        \
3120 	}                                       \
3121 MACRO_END
3122 
3123 	vm_page_update_special_state(m);
3124 	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3125 		/*
3126 		 * Compressor pages are neither wired
3127 		 * nor pageable and should never change.
3128 		 */
3129 		assert(object == compressor_object);
3130 	} else if (change_wiring) {
3131 		__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3132 
3133 		if (wired) {
3134 			if (kr == KERN_SUCCESS) {
3135 				vm_page_wire(m, wire_tag, TRUE);
3136 			}
3137 		} else {
3138 			vm_page_unwire(m, TRUE);
3139 		}
3140 		/* we keep the page queues lock, if we need it later */
3141 	} else {
3142 		if (object->internal == TRUE) {
3143 			/*
3144 			 * don't allow anonymous pages on
3145 			 * the speculative queues
3146 			 */
3147 			no_cache = FALSE;
3148 		}
3149 		if (kr != KERN_SUCCESS) {
3150 			__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3151 			vm_page_deactivate(m);
3152 			/* we keep the page queues lock, if we need it later */
3153 		} else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3154 		    (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3155 		    (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3156 		    ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3157 		    !VM_PAGE_WIRED(m)) {
3158 			if (vm_page_local_q &&
3159 			    (*type_of_fault == DBG_COW_FAULT ||
3160 			    *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3161 				struct vpl      *lq;
3162 				uint32_t        lid;
3163 
3164 				assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3165 
3166 				__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3167 				vm_object_lock_assert_exclusive(object);
3168 
3169 				/*
3170 				 * we got a local queue to stuff this
3171 				 * new page on...
3172 				 * it's safe to manipulate local and
3173 				 * local_id at this point since we're
3174 				 * behind an exclusive object lock and
3175 				 * the page is not on any global queue.
3176 				 *
3177 				 * we'll use the current cpu number to
3178 				 * select the queue... note that we don't
3179 				 * need to disable preemption... we're
3180 				 * going to be behind the local queue's
3181 				 * lock to do the real work
3182 				 */
3183 				lid = cpu_number();
3184 
3185 				lq = zpercpu_get_cpu(vm_page_local_q, lid);
3186 
3187 				VPL_LOCK(&lq->vpl_lock);
3188 
3189 				vm_page_check_pageable_safe(m);
3190 				vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3191 				m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3192 				m->vmp_local_id = lid;
3193 				lq->vpl_count++;
3194 
3195 				if (object->internal) {
3196 					lq->vpl_internal_count++;
3197 				} else {
3198 					lq->vpl_external_count++;
3199 				}
3200 
3201 				VPL_UNLOCK(&lq->vpl_lock);
3202 
3203 				if (lq->vpl_count > vm_page_local_q_soft_limit) {
3204 					/*
3205 					 * we're beyond the soft limit
3206 					 * for the local queue...
3207 					 * vm_page_reactivate_local will
3208 					 * 'try' to take the global page
3209 					 * queue lock... if it can't
3210 					 * that's ok... we'll let the
3211 					 * queue continue to grow up
3212 					 * to the hard limit... at that
3213 					 * point we'll wait for the
3214 					 * lock... once we've got the
3215 					 * lock, we'll transfer all of
3216 					 * the pages from the local
3217 					 * queue to the global active
3218 					 * queue
3219 					 */
3220 					vm_page_reactivate_local(lid, FALSE, FALSE);
3221 				}
3222 			} else {
3223 				__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3224 
3225 				/*
3226 				 * test again now that we hold the
3227 				 * page queue lock
3228 				 */
3229 				if (!VM_PAGE_WIRED(m)) {
3230 					if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3231 						vm_page_queues_remove(m, FALSE);
3232 
3233 						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3234 						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3235 					}
3236 
3237 					if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3238 					    no_cache) {
3239 						/*
3240 						 * If this is a no_cache mapping
3241 						 * and the page has never been
3242 						 * mapped before or was
3243 						 * previously a no_cache page,
3244 						 * then we want to leave pages
3245 						 * in the speculative state so
3246 						 * that they can be readily
3247 						 * recycled if free memory runs
3248 						 * low.  Otherwise the page is
3249 						 * activated as normal.
3250 						 */
3251 
3252 						if (no_cache &&
3253 						    (!previously_pmapped ||
3254 						    m->vmp_no_cache)) {
3255 							m->vmp_no_cache = TRUE;
3256 
3257 							if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3258 								vm_page_speculate(m, FALSE);
3259 							}
3260 						} else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3261 							vm_page_activate(m);
3262 						}
3263 					}
3264 				}
3265 				/* we keep the page queues lock, if we need it later */
3266 			}
3267 		}
3268 	}
3269 	/* we're done with the page queues lock, if we ever took it */
3270 	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3271 }
3272 
3273 /*
3274  * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3275  * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys
3276  * before being inserted into the pmap.
3277  */
3278 static bool
3279 vm_fault_enter_set_mapped(
3280 	vm_object_t object,
3281 	vm_page_t m,
3282 	vm_prot_t prot,
3283 	vm_prot_t fault_type)
3284 {
3285 	bool page_needs_sync = false;
3286 	/*
3287 	 * NOTE: we may only hold the vm_object lock SHARED
3288 	 * at this point, so we need the phys_page lock to
3289 	 * properly serialize updating the pmapped and
3290 	 * xpmapped bits
3291 	 */
3292 	if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3293 		ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3294 
3295 		pmap_lock_phys_page(phys_page);
3296 		m->vmp_pmapped = TRUE;
3297 
3298 		if (!m->vmp_xpmapped) {
3299 			m->vmp_xpmapped = TRUE;
3300 
3301 			pmap_unlock_phys_page(phys_page);
3302 
3303 			if (!object->internal) {
3304 				OSAddAtomic(1, &vm_page_xpmapped_external_count);
3305 			}
3306 
3307 #if defined(__arm64__)
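			/*
			 * On arm64, always sync the page before its
			 * first "execute" mapping.
			 */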
3308 			page_needs_sync = true;
3309 #else
3310 			if (object->internal &&
3311 			    object->pager != NULL) {
3312 				/*
3313 				 * This page could have been
3314 				 * uncompressed by the
3315 				 * compressor pager and its
3316 				 * contents might be only in
3317 				 * the data cache.
3318 				 * Since it's being mapped for
3319 				 * "execute" for the first time,
3320 				 * make sure the icache is in
3321 				 * sync.
3322 				 */
3323 				assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3324 				page_needs_sync = true;
3325 			}
3326 #endif
3327 		} else {
3328 			pmap_unlock_phys_page(phys_page);
3329 		}
3330 	} else {
3331 		if (m->vmp_pmapped == FALSE) {
3332 			ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3333 
3334 			pmap_lock_phys_page(phys_page);
3335 			m->vmp_pmapped = TRUE;
3336 			pmap_unlock_phys_page(phys_page);
3337 		}
3338 	}
3339 
3340 	if (fault_type & VM_PROT_WRITE) {
3341 		if (m->vmp_wpmapped == FALSE) {
3342 			vm_object_lock_assert_exclusive(object);
3343 			if (!object->internal && object->pager) {
3344 				task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3345 			}
3346 			m->vmp_wpmapped = TRUE;
3347 		}
3348 	}
3349 	return page_needs_sync;
3350 }
3351 
3352 /*
3353  * Try to enter the given page into the pmap.
3354  * Will retry without execute permission iff PMAP_CS is enabled and we encounter
3355  * a codesigning failure on a non-execute fault.
3356  */
3357 static kern_return_t
3358 vm_fault_attempt_pmap_enter(
3359 	pmap_t pmap,
3360 	vm_map_offset_t vaddr,
3361 	vm_map_size_t fault_page_size,
3362 	vm_map_offset_t fault_phys_offset,
3363 	vm_page_t m,
3364 	vm_prot_t *prot,
3365 	vm_prot_t caller_prot,
3366 	vm_prot_t fault_type,
3367 	bool wired,
3368 	int pmap_options)
3369 {
3370 #if !PMAP_CS
3371 #pragma unused(caller_prot)
3372 #endif /* !PMAP_CS */
3373 	kern_return_t kr;
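	/*
	 * A fault_page_size smaller than PAGE_SIZE indicates a sub-page
	 * mapping (presumably the 4K-within-16K-page case); check that the
	 * physical offset is 4K-aligned and within the containing kernel page.
	 */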
3374 	if (fault_page_size != PAGE_SIZE) {
3375 		DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3376 		assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3377 		    fault_phys_offset < PAGE_SIZE),
3378 		    "0x%llx\n", (uint64_t)fault_phys_offset);
3379 	} else {
3380 		assertf(fault_phys_offset == 0,
3381 		    "0x%llx\n", (uint64_t)fault_phys_offset);
3382 	}
3383 
3384 	PMAP_ENTER_OPTIONS(pmap, vaddr,
3385 	    fault_phys_offset,
3386 	    m, *prot, fault_type, 0,
3387 	    wired,
3388 	    pmap_options,
3389 	    kr);
3390 	return kr;
3391 }
3392 
3393 /*
3394  * Enter the given page into the pmap.
3395  * The map must be locked shared.
3396  * The vm object must NOT be locked.
3397  *
3398  * @param need_retry if not null, avoid making a (potentially) blocking call into
3399  * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3400  */
3401 static kern_return_t
3402 vm_fault_pmap_enter(
3403 	pmap_t pmap,
3404 	vm_map_offset_t vaddr,
3405 	vm_map_size_t fault_page_size,
3406 	vm_map_offset_t fault_phys_offset,
3407 	vm_page_t m,
3408 	vm_prot_t *prot,
3409 	vm_prot_t caller_prot,
3410 	vm_prot_t fault_type,
3411 	bool wired,
3412 	int pmap_options,
3413 	boolean_t *need_retry)
3414 {
3415 	kern_return_t kr;
3416 	if (need_retry != NULL) {
3417 		/*
3418 		 * Although we don't hold a lock on this object, we hold a lock
3419 		 * on the top object in the chain. To prevent a deadlock, we
3420 		 * can't allow the pmap layer to block.
3421 		 */
3422 		pmap_options |= PMAP_OPTIONS_NOWAIT;
3423 	}
3424 	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3425 	    fault_page_size, fault_phys_offset,
3426 	    m, prot, caller_prot, fault_type, wired, pmap_options);
3427 	if (kr == KERN_RESOURCE_SHORTAGE) {
3428 		if (need_retry) {
3429 			/*
3430 			 * There's nothing we can do here since we hold the
3431 			 * lock on the top object in the chain. The caller
3432 			 * will need to deal with this by dropping that lock and retrying.
3433 			 */
3434 			*need_retry = TRUE;
3435 			vm_pmap_enter_retried++;
3436 		}
3437 	}
3438 	return kr;
3439 }
3440 
3441 /*
3442  * Enter the given page into the pmap.
3443  * The vm map must be locked shared.
3444  * The vm object must be locked exclusive, unless this is a soft fault.
3445  * For a soft fault, the object must be locked shared or exclusive.
3446  *
3447  * @param need_retry if not null, avoid making a (potentially) blocking call into
3448  * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3449  */
3450 static kern_return_t
3451 vm_fault_pmap_enter_with_object_lock(
3452 	vm_object_t object,
3453 	pmap_t pmap,
3454 	vm_map_offset_t vaddr,
3455 	vm_map_size_t fault_page_size,
3456 	vm_map_offset_t fault_phys_offset,
3457 	vm_page_t m,
3458 	vm_prot_t *prot,
3459 	vm_prot_t caller_prot,
3460 	vm_prot_t fault_type,
3461 	bool wired,
3462 	int pmap_options,
3463 	boolean_t *need_retry)
3464 {
3465 	kern_return_t kr;
3466 	/*
3467 	 * Prevent a deadlock by not
3468 	 * holding the object lock if we need to wait for a page in
3469 	 * pmap_enter() - <rdar://problem/7138958>
3470 	 */
3471 	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3472 	    fault_page_size, fault_phys_offset,
3473 	    m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
3474 #if __x86_64__
3475 	if (kr == KERN_INVALID_ARGUMENT &&
3476 	    pmap == PMAP_NULL &&
3477 	    wired) {
3478 		/*
3479 		 * Wiring a page in a pmap-less VM map:
3480 		 * VMware's "vmmon" kernel extension does this
3481 		 * to grab pages.
3482 		 * Let it proceed even though the PMAP_ENTER() failed.
3483 		 */
3484 		kr = KERN_SUCCESS;
3485 	}
3486 #endif /* __x86_64__ */
3487 
3488 	if (kr == KERN_RESOURCE_SHORTAGE) {
3489 		if (need_retry) {
3490 			/*
3491 			 * this will be non-null in the case where we hold the lock
3492 			 * on the top-object in this chain... we can't just drop
3493 			 * the lock on the object we're inserting the page into
3494 			 * and recall the PMAP_ENTER since we can still cause
3495 			 * a deadlock if one of the critical paths tries to
3496 			 * acquire the lock on the top-object and we're blocked
3497 			 * in PMAP_ENTER waiting for memory... our only recourse
3498 			 * is to deal with it at a higher level where we can
3499 			 * drop both locks.
3500 			 */
3501 			*need_retry = TRUE;
3502 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PMAP_ENTER_RESOURCE_SHORTAGE), 0 /* arg */);
3503 			vm_pmap_enter_retried++;
3504 			goto done;
3505 		}
3506 		/*
3507 		 * The nonblocking version of pmap_enter did not succeed
3508 		 * and we don't need to drop other locks and retry
3509 		 * at the level above us, so
3510 		 * use the blocking version instead. This requires marking
3511 		 * the page busy and unlocking the object.
3512 		 */
3513 		boolean_t was_busy = m->vmp_busy;
3514 
3515 		vm_object_lock_assert_exclusive(object);
3516 
3517 		m->vmp_busy = TRUE;
3518 		vm_object_unlock(object);
3519 
3520 		PMAP_ENTER_OPTIONS(pmap, vaddr,
3521 		    fault_phys_offset,
3522 		    m, *prot, fault_type,
3523 		    0, wired,
3524 		    pmap_options, kr);
3525 
3526 		assert(VM_PAGE_OBJECT(m) == object);
3527 
3528 		/* Take the object lock again. */
3529 		vm_object_lock(object);
3530 
3531 		/* If the page was busy, someone else will wake it up.
3532 		 * Otherwise, we have to do it now. */
3533 		assert(m->vmp_busy);
3534 		if (!was_busy) {
3535 			PAGE_WAKEUP_DONE(m);
3536 		}
3537 		vm_pmap_enter_blocked++;
3538 	}
3539 
3540 done:
3541 	return kr;
3542 }
3543 
3544 /*
3545  * Prepare to enter a page into the pmap by checking CS, protection bits,
3546  * and setting mapped bits on the page_t.
3547  * Does not modify the page's paging queue.
3548  *
3549  * page queue lock must NOT be held
3550  * m->vmp_object must be locked
3551  *
3552  * NOTE: m->vmp_object could be locked "shared" only if we are called
3553  * from vm_fault() as part of a soft fault.
3554  */
3555 static kern_return_t
3556 vm_fault_enter_prepare(
3557 	vm_page_t m,
3558 	pmap_t pmap,
3559 	vm_map_offset_t vaddr,
3560 	vm_prot_t *prot,
3561 	vm_prot_t caller_prot,
3562 	vm_map_size_t fault_page_size,
3563 	vm_map_offset_t fault_phys_offset,
3564 	boolean_t change_wiring,
3565 	vm_prot_t fault_type,
3566 	vm_object_fault_info_t fault_info,
3567 	int *type_of_fault,
3568 	bool *page_needs_data_sync)
3569 {
3570 	kern_return_t   kr;
3571 	bool            is_tainted = false;
3572 	vm_object_t     object;
3573 	boolean_t       cs_bypass = fault_info->cs_bypass;
3574 
3575 	object = VM_PAGE_OBJECT(m);
3576 
3577 	vm_object_lock_assert_held(object);
3578 
3579 #if KASAN
3580 	if (pmap == kernel_pmap) {
3581 		kasan_notify_address(vaddr, PAGE_SIZE);
3582 	}
3583 #endif
3584 
3585 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3586 
3587 	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
3588 		vm_object_lock_assert_exclusive(object);
3589 	} else if ((fault_type & VM_PROT_WRITE) == 0 &&
3590 	    !change_wiring &&
3591 	    (!m->vmp_wpmapped
3592 #if VM_OBJECT_ACCESS_TRACKING
3593 	    || object->access_tracking
3594 #endif /* VM_OBJECT_ACCESS_TRACKING */
3595 	    )) {
3596 		/*
3597 		 * This is not a "write" fault, so we
3598 		 * might not have taken the object lock
3599 		 * exclusively and we might not be able
3600 		 * to update the "wpmapped" bit in
3601 		 * vm_fault_enter().
3602 		 * Let's just grant read access to
3603 		 * the page for now and we'll
3604 		 * soft-fault again if we need write
3605 		 * access later...
3606 		 */
3607 
3608 		/* This had better not be a JIT page. */
3609 		if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
3610 			*prot &= ~VM_PROT_WRITE;
3611 		} else {
3612 			assert(cs_bypass);
3613 		}
3614 	}
3615 	if (m->vmp_pmapped == FALSE) {
3616 		if (m->vmp_clustered) {
3617 			if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
3618 				/*
3619 				 * found it in the cache, but this
3620 				 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
3621 				 * so it must have come in as part of
3622 				 * a cluster... account 1 pagein against it
3623 				 */
3624 				if (object->internal) {
3625 					*type_of_fault = DBG_PAGEIND_FAULT;
3626 				} else {
3627 					*type_of_fault = DBG_PAGEINV_FAULT;
3628 				}
3629 
3630 				VM_PAGE_COUNT_AS_PAGEIN(m);
3631 			}
3632 			VM_PAGE_CONSUME_CLUSTERED(m);
3633 		}
3634 	}
3635 
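	/*
	 * Fire the DTrace "as_fault" probe (plus "kernel_asflt" for kernel
	 * pmaps) for anything other than a copy-on-write fault.
	 */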
3636 	if (*type_of_fault != DBG_COW_FAULT) {
3637 		DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
3638 
3639 		if (pmap == kernel_pmap) {
3640 			DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
3641 		}
3642 	}
3643 
3644 	kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
3645 	    *prot, caller_prot, fault_page_size, fault_phys_offset,
3646 	    fault_info, &is_tainted);
3647 	if (kr == KERN_SUCCESS) {
3648 		/*
3649 		 * We either have a good page, or a tainted page that has been accepted by the process.
3650 		 * In both cases the page will be entered into the pmap.
3651 		 */
3652 		*page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
3653 		if ((fault_type & VM_PROT_WRITE) && is_tainted) {
3654 			/*
3655 			 * This page is tainted but we're inserting it anyways.
3656 			 * Since it's writeable, we need to disconnect it from other pmaps
3657 			 * now so those processes can take note.
3658 			 */
3659 
3660 			/*
3661 			 * We can only get here
3662 			 * because of the CSE logic
3663 			 */
3664 			assert(pmap_get_vm_map_cs_enforced(pmap));
3665 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3666 			/*
3667 			 * If we are faulting for a write, we can clear
3668 			 * the execute bit - that will ensure the page is
3669 			 * checked again before being executable, which
3670 			 * protects against a map switch.
3671 			 * This only happens the first time the page
3672 			 * gets tainted, so we won't get stuck here
3673 			 * to make an already writeable page executable.
3674 			 */
3675 			if (!cs_bypass) {
3676 				assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
3677 				*prot &= ~VM_PROT_EXECUTE;
3678 			}
3679 		}
3680 		assert(VM_PAGE_OBJECT(m) == object);
3681 
3682 #if VM_OBJECT_ACCESS_TRACKING
3683 		if (object->access_tracking) {
3684 			DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3685 			if (fault_type & VM_PROT_WRITE) {
3686 				object->access_tracking_writes++;
3687 				vm_object_access_tracking_writes++;
3688 			} else {
3689 				object->access_tracking_reads++;
3690 				vm_object_access_tracking_reads++;
3691 			}
3692 		}
3693 #endif /* VM_OBJECT_ACCESS_TRACKING */
3694 	}
3695 
3696 	return kr;
3697 }
3698 
3699 /*
3700  * page queue lock must NOT be held
3701  * m->vmp_object must be locked
3702  *
3703  * NOTE: m->vmp_object could be locked "shared" only if we are called
3704  * from vm_fault() as part of a soft fault.  If so, we must be
3705  * careful not to modify the VM object in any way that is not
3706  * legal under a shared lock...
3707  */
3708 kern_return_t
3709 vm_fault_enter(
3710 	vm_page_t m,
3711 	pmap_t pmap,
3712 	vm_map_offset_t vaddr,
3713 	vm_map_size_t fault_page_size,
3714 	vm_map_offset_t fault_phys_offset,
3715 	vm_prot_t prot,
3716 	vm_prot_t caller_prot,
3717 	boolean_t wired,
3718 	boolean_t change_wiring,
3719 	vm_tag_t  wire_tag,
3720 	vm_object_fault_info_t fault_info,
3721 	boolean_t *need_retry,
3722 	int *type_of_fault)
3723 {
3724 	kern_return_t   kr;
3725 	vm_object_t     object;
3726 	bool            page_needs_data_sync;
3727 	vm_prot_t       fault_type;
3728 	int             pmap_options = fault_info->pmap_options;
3729 
3730 	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3731 		assert(m->vmp_fictitious);
3732 		return KERN_SUCCESS;
3733 	}
3734 
3735 	fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
3736 
3737 	assertf(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL, "m=%p", m);
3738 	kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
3739 	    fault_page_size, fault_phys_offset, change_wiring, fault_type,
3740 	    fault_info, type_of_fault, &page_needs_data_sync);
3741 	object = VM_PAGE_OBJECT(m);
3742 
3743 	vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
3744 
3745 	if (kr == KERN_SUCCESS) {
3746 		if (page_needs_data_sync) {
3747 			pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
3748 		}
3749 
3750 		kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
3751 		    fault_page_size, fault_phys_offset, m,
3752 		    &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
3753 	}
3754 
3755 	return kr;
3756 }
3757 
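/*
 * Pre-fault: if the page at "vaddr" is not already resident in the current
 * map's pmap, take a regular, non-wiring fault on it now.
 */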
3758 void
3759 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3760 {
3761 	if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3762 		vm_fault(current_map(),      /* map */
3763 		    vaddr,                   /* vaddr */
3764 		    prot,                    /* fault_type */
3765 		    FALSE,                   /* change_wiring */
3766 		    VM_KERN_MEMORY_NONE,     /* tag - not wiring */
3767 		    THREAD_UNINT,            /* interruptible */
3768 		    NULL,                    /* caller_pmap */
3769 		    0 /* caller_pmap_addr */);
3770 	}
3771 }
3772 
3773 
3774 /*
3775  *	Routine:	vm_fault
3776  *	Purpose:
3777  *		Handle page faults, including pseudo-faults
3778  *		used to change the wiring status of pages.
3779  *	Returns:
3780  *		Explicit continuations have been removed.
3781  *	Implementation:
3782  *		vm_fault and vm_fault_page save mucho state
3783  *		in the moral equivalent of a closure.  The state
3784  *		structure is allocated when first entering vm_fault
3785  *		and deallocated when leaving vm_fault.
3786  */
3787 
3788 extern uint64_t get_current_unique_pid(void);
3789 
3790 unsigned long vm_fault_collapse_total = 0;
3791 unsigned long vm_fault_collapse_skipped = 0;
3792 
3793 
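/*
 * External entry point: wraps vm_fault_internal(), deriving the wire tag
 * from the caller's backtrace (vm_tag_bt()) when this is a wiring request.
 */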
3794 kern_return_t
3795 vm_fault_external(
3796 	vm_map_t        map,
3797 	vm_map_offset_t vaddr,
3798 	vm_prot_t       fault_type,
3799 	boolean_t       change_wiring,
3800 	int             interruptible,
3801 	pmap_t          caller_pmap,
3802 	vm_map_offset_t caller_pmap_addr)
3803 {
3804 	return vm_fault_internal(map, vaddr, fault_type, change_wiring,
3805 	           change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
3806 	           interruptible, caller_pmap, caller_pmap_addr,
3807 	           NULL);
3808 }
3809 
3810 kern_return_t
3811 vm_fault(
3812 	vm_map_t        map,
3813 	vm_map_offset_t vaddr,
3814 	vm_prot_t       fault_type,
3815 	boolean_t       change_wiring,
3816 	vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3817 	int             interruptible,
3818 	pmap_t          caller_pmap,
3819 	vm_map_offset_t caller_pmap_addr)
3820 {
3821 	return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3822 	           interruptible, caller_pmap, caller_pmap_addr,
3823 	           NULL);
3824 }
3825 
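/*
 * A process is considered "privileged" here if it is a platform binary.
 */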
3826 static boolean_t
3827 current_proc_is_privileged(void)
3828 {
3829 	return csproc_get_platform_binary(current_proc());
3830 }
3831 
3832 uint64_t vm_copied_on_read = 0;
3833 
3834 /*
3835  * Cleanup after a vm_fault_enter.
3836  * At this point, the fault should either have failed (kr != KERN_SUCCESS)
3837  * or the page should be in the pmap and on the correct paging queue.
3838  *
3839  * Precondition:
3840  * map must be locked shared.
3841  * m_object must be locked.
3842  * If top_object != VM_OBJECT_NULL, it must be locked.
3843  * real_map must be locked.
3844  *
3845  * Postcondition:
3846  * map will be unlocked
3847  * m_object will be unlocked
3848  * top_object will be unlocked
3849  * If real_map != map, it will be unlocked
3850  */
3851 static void
3852 vm_fault_complete(
3853 	vm_map_t map,
3854 	vm_map_t real_map,
3855 	vm_object_t object,
3856 	vm_object_t m_object,
3857 	vm_page_t m,
3858 	vm_map_offset_t offset,
3859 	vm_map_offset_t trace_real_vaddr,
3860 	vm_object_fault_info_t fault_info,
3861 	vm_prot_t caller_prot,
3862 #if CONFIG_DTRACE
3863 	vm_map_offset_t real_vaddr,
3864 #else
3865 	__unused vm_map_offset_t real_vaddr,
3866 #endif /* CONFIG_DTRACE */
3867 	int type_of_fault,
3868 	boolean_t need_retry,
3869 	kern_return_t kr,
3870 	ppnum_t *physpage_p,
3871 	vm_prot_t prot,
3872 	vm_object_t top_object,
3873 	boolean_t need_collapse,
3874 	vm_map_offset_t cur_offset,
3875 	vm_prot_t fault_type,
3876 	vm_object_t *written_on_object,
3877 	memory_object_t *written_on_pager,
3878 	vm_object_offset_t *written_on_offset)
3879 {
3880 	int     event_code = 0;
3881 	vm_map_lock_assert_shared(map);
3882 	vm_object_lock_assert_held(m_object);
3883 	if (top_object != VM_OBJECT_NULL) {
3884 		vm_object_lock_assert_held(top_object);
3885 	}
3886 	vm_map_lock_assert_held(real_map);
3887 
3888 	if (m_object->internal) {
3889 		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
3890 	} else if (m_object->object_is_shared_cache) {
3891 		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
3892 	} else {
3893 		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
3894 	}
3895 	KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid());
3896 	if (need_retry == FALSE) {
3897 		KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid());
3898 	}
3899 	DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
3900 	if (kr == KERN_SUCCESS &&
3901 	    physpage_p != NULL) {
3902 		/* for vm_map_wire_and_extract() */
3903 		*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
3904 		if (prot & VM_PROT_WRITE) {
3905 			vm_object_lock_assert_exclusive(m_object);
3906 			m->vmp_dirty = TRUE;
3907 		}
3908 	}
3909 
3910 	if (top_object != VM_OBJECT_NULL) {
3911 		/*
3912 		 * It's safe to drop the top object
3913 		 * now that we've done our
3914 		 * vm_fault_enter().  Any other fault
3915 		 * in progress for that virtual
3916 		 * address will either find our page
3917 		 * and translation or put in a new page
3918 		 * and translation.
3919 		 */
3920 		vm_object_unlock(top_object);
3921 		top_object = VM_OBJECT_NULL;
3922 	}
3923 
3924 	if (need_collapse == TRUE) {
3925 		vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
3926 	}
3927 
3928 	if (need_retry == FALSE &&
3929 	    (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3930 		/*
3931 		 * evaluate access pattern and update state
3932 		 * vm_fault_deactivate_behind depends on the
3933 		 * state being up to date
3934 		 */
3935 		vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
3936 
3937 		vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
3938 	}
3939 	/*
3940 	 * That's it, clean up and return.
3941 	 */
3942 	if (m->vmp_busy) {
3943 		vm_object_lock_assert_exclusive(m_object);
3944 		PAGE_WAKEUP_DONE(m);
3945 	}
3946 
3947 	if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
3948 		vm_object_paging_begin(m_object);
3949 
3950 		assert(*written_on_object == VM_OBJECT_NULL);
3951 		*written_on_object = m_object;
3952 		*written_on_pager = m_object->pager;
3953 		*written_on_offset = m_object->paging_offset + m->vmp_offset;
3954 	}
3955 	vm_object_unlock(object);
3956 
3957 	vm_map_unlock_read(map);
3958 	if (real_map != map) {
3959 		vm_map_unlock(real_map);
3960 	}
3961 }
3962 
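/*
 * For tracing, report a copy-on-write fault that was actually taken as a
 * copy-on-read as DBG_COR_FAULT.
 */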
3963 static inline int
3964 vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
3965 {
3966 	if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
3967 		return DBG_COR_FAULT;
3968 	}
3969 	return type_of_fault;
3970 }
3971 
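/* Counters for the "resilient media" recovery path in vm_fault_internal(). */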
3972 uint64_t vm_fault_resilient_media_initiate = 0;
3973 uint64_t vm_fault_resilient_media_retry = 0;
3974 uint64_t vm_fault_resilient_media_proceed = 0;
3975 uint64_t vm_fault_resilient_media_release = 0;
3976 uint64_t vm_fault_resilient_media_abort1 = 0;
3977 uint64_t vm_fault_resilient_media_abort2 = 0;
3978 
3979 #if MACH_ASSERT
3980 int vm_fault_resilient_media_inject_error1_rate = 0;
3981 int vm_fault_resilient_media_inject_error1 = 0;
3982 int vm_fault_resilient_media_inject_error2_rate = 0;
3983 int vm_fault_resilient_media_inject_error2 = 0;
3984 int vm_fault_resilient_media_inject_error3_rate = 0;
3985 int vm_fault_resilient_media_inject_error3 = 0;
3986 #endif /* MACH_ASSERT */
3987 
3988 kern_return_t
3989 vm_fault_internal(
3990 	vm_map_t        map,
3991 	vm_map_offset_t vaddr,
3992 	vm_prot_t       caller_prot,
3993 	boolean_t       change_wiring,
3994 	vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3995 	int             interruptible,
3996 	pmap_t          caller_pmap,
3997 	vm_map_offset_t caller_pmap_addr,
3998 	ppnum_t         *physpage_p)
3999 {
4000 	vm_map_version_t        version;        /* Map version for verification */
4001 	boolean_t               wired;          /* Should mapping be wired down? */
4002 	vm_object_t             object;         /* Top-level object */
4003 	vm_object_offset_t      offset;         /* Top-level offset */
4004 	vm_prot_t               prot;           /* Protection for mapping */
4005 	vm_object_t             old_copy_object; /* Saved copy object */
4006 	vm_page_t               result_page;    /* Result of vm_fault_page */
4007 	vm_page_t               top_page;       /* Placeholder page */
4008 	kern_return_t           kr;
4009 
4010 	vm_page_t               m;      /* Fast access to result_page */
4011 	kern_return_t           error_code;
4012 	vm_object_t             cur_object;
4013 	vm_object_t             m_object = NULL;
4014 	vm_object_offset_t      cur_offset;
4015 	vm_page_t               cur_m;
4016 	vm_object_t             new_object;
4017 	int                     type_of_fault;
4018 	pmap_t                  pmap;
4019 	wait_interrupt_t        interruptible_state;
4020 	vm_map_t                real_map = map;
4021 	vm_map_t                original_map = map;
4022 	bool                    object_locks_dropped = FALSE;
4023 	vm_prot_t               fault_type;
4024 	vm_prot_t               original_fault_type;
4025 	struct vm_object_fault_info fault_info = {};
4026 	bool                    need_collapse = FALSE;
4027 	boolean_t               need_retry = FALSE;
4028 	boolean_t               *need_retry_ptr = NULL;
4029 	uint8_t                 object_lock_type = 0;
4030 	uint8_t                 cur_object_lock_type;
4031 	vm_object_t             top_object = VM_OBJECT_NULL;
4032 	vm_object_t             written_on_object = VM_OBJECT_NULL;
4033 	memory_object_t         written_on_pager = NULL;
4034 	vm_object_offset_t      written_on_offset = 0;
4035 	int                     throttle_delay;
4036 	int                     compressed_count_delta;
4037 	uint8_t                 grab_options;
4038 	bool                    need_copy;
4039 	bool                    need_copy_on_read;
4040 	vm_map_offset_t         trace_vaddr;
4041 	vm_map_offset_t         trace_real_vaddr;
4042 	vm_map_size_t           fault_page_size;
4043 	vm_map_size_t           fault_page_mask;
4044 	int                     fault_page_shift;
4045 	vm_map_offset_t         fault_phys_offset;
4046 	vm_map_offset_t         real_vaddr;
4047 	bool                    resilient_media_retry = false;
4048 	bool                    resilient_media_ref_transfer = false;
4049 	vm_object_t             resilient_media_object = VM_OBJECT_NULL;
4050 	vm_object_offset_t      resilient_media_offset = (vm_object_offset_t)-1;
4051 	bool                    page_needs_data_sync = false;
4052 	/*
4053 	 * Was the VM object contended when vm_map_lookup_and_lock_object locked it?
4054 	 * If so, the zero fill path will drop the lock.
4055 	 * NB: Ideally we would always drop the lock rather than rely on
4056 	 * this heuristic, but vm_object_unlock currently takes > 30 cycles.
4057 	 */
4058 	bool                    object_is_contended = false;
4059 
4060 	real_vaddr = vaddr;
4061 	trace_real_vaddr = vaddr;
4062 
4063 	/*
4064 	 * Some (kernel) submaps are marked with "should never fault".
4065 	 *
4066 	 * We do this for two reasons:
4067 	 * - PGZ which is inside the zone map range can't go down the normal
4068 	 *   lookup path (vm_map_lookup_entry() would panic).
4069 	 *
4070 	 * - we want guard pages to not have to use fictitious pages at all,
4071 	 *   to prevent ZFOD pages from being made.
4072 	 *
4073 	 * We also want to capture the fault address easily so that the zone
4074 	 * allocator might present an enhanced panic log.
4075 	 */
4076 	if (map->never_faults || (pgz_owned(vaddr) && map->pmap == kernel_pmap)) {
4077 		assert(map->pmap == kernel_pmap);
4078 		panic_fault_address = vaddr;
4079 		return KERN_INVALID_ADDRESS;
4080 	}
4081 
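	/*
	 * Establish the fault granularity: a map whose page size is smaller
	 * than the kernel's PAGE_SIZE faults at the map's page size and
	 * truncates "vaddr" accordingly; otherwise we fault at PAGE_SIZE.
	 */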
4082 	if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
4083 		fault_phys_offset = (vm_map_offset_t)-1;
4084 		fault_page_size = VM_MAP_PAGE_SIZE(original_map);
4085 		fault_page_mask = VM_MAP_PAGE_MASK(original_map);
4086 		fault_page_shift = VM_MAP_PAGE_SHIFT(original_map);
4087 		if (fault_page_size < PAGE_SIZE) {
4088 			DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
4089 			vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
4090 		}
4091 	} else {
4092 		fault_phys_offset = 0;
4093 		fault_page_size = PAGE_SIZE;
4094 		fault_page_mask = PAGE_MASK;
4095 		fault_page_shift = PAGE_SHIFT;
4096 		vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
4097 	}
4098 
4099 	if (map == kernel_map) {
4100 		trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
4101 		trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
4102 	} else {
4103 		trace_vaddr = vaddr;
4104 	}
4105 
4106 	KDBG_RELEASE(
4107 		(MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
4108 		((uint64_t)trace_vaddr >> 32),
4109 		trace_vaddr,
4110 		(map == kernel_map));
4111 
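	/*
	 * Handling a fault may block, so refuse to take one while
	 * preemption is disabled.
	 */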
4112 	if (get_preemption_level() != 0) {
4113 		KDBG_RELEASE(
4114 			(MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4115 			((uint64_t)trace_vaddr >> 32),
4116 			trace_vaddr,
4117 			KERN_FAILURE);
4118 
4119 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NONZERO_PREEMPTION_LEVEL), 0 /* arg */);
4120 		return KERN_FAILURE;
4121 	}
4122 
4123 	thread_t cthread = current_thread();
4124 	bool      rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
4125 	uint64_t fstart = 0;
4126 
4127 	if (rtfault) {
4128 		fstart = mach_continuous_time();
4129 	}
4130 
4131 	interruptible_state = thread_interrupt_level(interruptible);
4132 
4133 	fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
4134 
4135 	counter_inc(&vm_statistics_faults);
4136 	counter_inc(&current_task()->faults);
4137 	original_fault_type = fault_type;
4138 
4139 	need_copy = FALSE;
4140 	if (fault_type & VM_PROT_WRITE) {
4141 		need_copy = TRUE;
4142 	}
4143 
4144 	if (need_copy || change_wiring) {
4145 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4146 	} else {
4147 		object_lock_type = OBJECT_LOCK_SHARED;
4148 	}
4149 
4150 	cur_object_lock_type = OBJECT_LOCK_SHARED;
4151 
4152 	if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4153 		if (compressor_map) {
4154 			if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4155 				panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
4156 			}
4157 		}
4158 	}
4159 RetryFault:
4160 	assert(written_on_object == VM_OBJECT_NULL);
4161 
4162 	/*
4163 	 * assume we will hit a page in the cache
4164 	 * otherwise, explicitly override with
4165 	 * the real fault type once we determine it
4166 	 */
4167 	type_of_fault = DBG_CACHE_HIT_FAULT;
4168 
4169 	/*
4170 	 *	Find the backing store object and offset into
4171 	 *	it to begin the search.
4172 	 */
4173 	fault_type = original_fault_type;
4174 	map = original_map;
4175 	vm_map_lock_read(map);
4176 
4177 	if (resilient_media_retry) {
4178 		/*
4179 		 * If we have to insert a fake zero-filled page to hide
4180 		 * a media failure to provide the real page, we need to
4181 		 * resolve any pending copy-on-write on this mapping.
4182 		 * VM_PROT_COPY tells vm_map_lookup_and_lock_object() to deal
4183 		 * with that even if this is not a "write" fault.
4184 		 */
4185 		need_copy = TRUE;
4186 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4187 		vm_fault_resilient_media_retry++;
4188 	}
4189 
4190 	kr = vm_map_lookup_and_lock_object(&map, vaddr,
4191 	    (fault_type | (need_copy ? VM_PROT_COPY : 0)),
4192 	    object_lock_type, &version,
4193 	    &object, &offset, &prot, &wired,
4194 	    &fault_info,
4195 	    &real_map,
4196 	    &object_is_contended);
4197 
4198 	if (kr != KERN_SUCCESS) {
4199 		vm_map_unlock_read(map);
4200 		/*
4201 		 * This can be seen in a crash report if indeed the
4202 		 * thread is crashing due to an invalid access in a non-existent
4203 		 * range.
4204 		 * Turning this OFF for now because it is noisy and not always fatal
4205 		 * e.g. prefaulting.
4206 		 *
4207 		 * if (kr == KERN_INVALID_ADDRESS) {
4208 		 *	ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0);
4209 		 * }
4210 		 */
4211 		goto done;
4212 	}
4213 
4214 
4215 	pmap = real_map->pmap;
4216 	fault_info.interruptible = interruptible;
4217 	fault_info.stealth = FALSE;
4218 	fault_info.io_sync = FALSE;
4219 	fault_info.mark_zf_absent = FALSE;
4220 	fault_info.batch_pmap_op = FALSE;
4221 
4222 	if (resilient_media_retry) {
4223 		/*
4224 		 * We're retrying this fault after having detected a media
4225 		 * failure from a "resilient_media" mapping.
4226 		 * Check that the mapping is still pointing at the object
4227 		 * that just failed to provide a page.
4228 		 */
4229 		assert(resilient_media_object != VM_OBJECT_NULL);
4230 		assert(resilient_media_offset != (vm_object_offset_t)-1);
4231 		if ((object != VM_OBJECT_NULL &&
4232 		    object == resilient_media_object &&
4233 		    offset == resilient_media_offset &&
4234 		    fault_info.resilient_media)
4235 #if MACH_ASSERT
4236 		    && (vm_fault_resilient_media_inject_error1_rate == 0 ||
4237 		    (++vm_fault_resilient_media_inject_error1 % vm_fault_resilient_media_inject_error1_rate) != 0)
4238 #endif /* MACH_ASSERT */
4239 		    ) {
4240 			/*
4241 			 * This mapping still points at the same object
4242 			 * and is still "resilient_media": proceed in
4243 			 * "recovery-from-media-failure" mode, where we'll
4244 			 * insert a zero-filled page in the top object.
4245 			 */
4246 //                     printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4247 			vm_fault_resilient_media_proceed++;
4248 		} else {
4249 			/* not recovering: reset state and retry fault */
4250 //                     printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4251 			vm_object_unlock(object);
4252 			if (real_map != map) {
4253 				vm_map_unlock(real_map);
4254 			}
4255 			vm_map_unlock_read(map);
4256 			/* release our extra reference on failed object */
4257 //                     printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4258 			vm_object_lock_assert_notheld(resilient_media_object);
4259 			vm_object_deallocate(resilient_media_object);
4260 			resilient_media_object = VM_OBJECT_NULL;
4261 			resilient_media_offset = (vm_object_offset_t)-1;
4262 			resilient_media_retry = false;
4263 			vm_fault_resilient_media_abort1++;
4264 			goto RetryFault;
4265 		}
4266 	} else {
4267 		assert(resilient_media_object == VM_OBJECT_NULL);
4268 		resilient_media_offset = (vm_object_offset_t)-1;
4269 	}
4270 
4271 	/*
4272 	 * If the page is wired, we must fault for the current protection
4273 	 * value, to avoid further faults.
4274 	 */
4275 	if (wired) {
4276 		fault_type = prot | VM_PROT_WRITE;
4277 	}
4278 	if (wired || need_copy) {
4279 		/*
4280 		 * since we're treating this fault as a 'write'
4281 		 * we must hold the top object lock exclusively
4282 		 */
4283 		if (object_lock_type == OBJECT_LOCK_SHARED) {
4284 			object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4285 
4286 			if (vm_object_lock_upgrade(object) == FALSE) {
4287 				/*
4288 				 * couldn't upgrade, so explicitly
4289 				 * take the lock exclusively
4290 				 */
4291 				vm_object_lock(object);
4292 			}
4293 		}
4294 	}
4295 
4296 #if     VM_FAULT_CLASSIFY
4297 	/*
4298 	 *	Temporary data gathering code
4299 	 */
4300 	vm_fault_classify(object, offset, fault_type);
4301 #endif
4302 	/*
4303 	 *	Fast fault code.  The basic idea is to do as much as
4304 	 *	possible while holding the map lock and object locks.
4305 	 *      Busy pages are not used until the object lock has to
4306 	 *	be dropped to do something (copy, zero fill, pmap enter).
4307 	 *	Similarly, paging references aren't acquired until that
4308 	 *	point, and object references aren't used.
4309 	 *
4310 	 *	If we can figure out what to do
4311 	 *	(zero fill, copy on write, pmap enter) while holding
4312 	 *	the locks, then it gets done.  Otherwise, we give up,
4313 	 *	and use the original fault path (which doesn't hold
4314 	 *	the map lock, and relies on busy pages).
4315 	 *	The give up cases include:
4316 	 *              - Have to talk to pager.
4317 	 *		- Page is busy, absent or in error.
4318 	 *		- Pager has locked out desired access.
4319 	 *		- Fault needs to be restarted.
4320 	 *		- Have to push page into copy object.
4321 	 *
4322 	 *	The code is an infinite loop that moves one level down
4323 	 *	the shadow chain each time.  cur_object and cur_offset
4324 	 *      refer to the current object being examined. object and offset
4325 	 *	are the original object from the map.  The loop is at the
4326 	 *	top level if and only if object and cur_object are the same.
4327 	 *
4328 	 *	Invariants:  Map lock is held throughout.  Lock is held on
4329 	 *		original object and cur_object (if different) when
4330 	 *		continuing or exiting loop.
4331 	 *
4332 	 */
4333 
4334 #if defined(__arm64__)
4335 	/*
4336 	 * Fail if reading an execute-only page in a
4337 	 * pmap that enforces execute-only protection.
4338 	 */
4339 	if (fault_type == VM_PROT_READ &&
4340 	    (prot & VM_PROT_EXECUTE) &&
4341 	    !(prot & VM_PROT_READ) &&
4342 	    pmap_enforces_execute_only(pmap)) {
4343 		vm_object_unlock(object);
4344 		vm_map_unlock_read(map);
4345 		if (real_map != map) {
4346 			vm_map_unlock(real_map);
4347 		}
4348 		kr = KERN_PROTECTION_FAILURE;
4349 		goto done;
4350 	}
4351 #endif
4352 
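	/*
	 * Offset of the (possibly sub-PAGE_SIZE) faulting page within its
	 * containing kernel page.
	 */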
4353 	fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4354 
4355 	/*
4356 	 * If this page is to be inserted in a copy delay object
4357 	 * for writing, and if the object has a copy, then the
4358 	 * copy delay strategy is handled by the slow fault path (vm_fault_page()).
4359 	 */
4360 	if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
4361 	    object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4362 		goto handle_copy_delay;
4363 	}
4364 
4365 	cur_object = object;
4366 	cur_offset = offset;
4367 
4368 	grab_options = 0;
4369 #if CONFIG_SECLUDED_MEMORY
4370 	if (object->can_grab_secluded) {
4371 		grab_options |= VM_PAGE_GRAB_SECLUDED;
4372 	}
4373 #endif /* CONFIG_SECLUDED_MEMORY */
4374 
4375 	while (TRUE) {
4376 		if (!cur_object->pager_created &&
4377 		    cur_object->phys_contiguous) { /* superpage */
4378 			break;
4379 		}
4380 
4381 		if (cur_object->blocked_access) {
4382 			/*
4383 			 * Access to this VM object has been blocked.
4384 			 * Let the slow path handle it.
4385 			 */
4386 			break;
4387 		}
4388 
4389 		m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
4390 		m_object = NULL;
4391 
4392 		if (m != VM_PAGE_NULL) {
4393 			m_object = cur_object;
4394 
4395 			if (m->vmp_busy) {
4396 				wait_result_t   result;
4397 
4398 				/*
4399 				 * in order to do the PAGE_ASSERT_WAIT, we must
4400 				 * have object that 'm' belongs to locked exclusively
4401 				 */
4402 				if (object != cur_object) {
4403 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4404 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4405 
4406 						if (vm_object_lock_upgrade(cur_object) == FALSE) {
4407 							/*
4408 							 * couldn't upgrade so go do a full retry
4409 							 * immediately since we can no longer be
4410 							 * certain about cur_object (since we
4411 							 * don't hold a reference on it)...
4412 							 * first drop the top object lock
4413 							 */
4414 							vm_object_unlock(object);
4415 
4416 							vm_map_unlock_read(map);
4417 							if (real_map != map) {
4418 								vm_map_unlock(real_map);
4419 							}
4420 
4421 							goto RetryFault;
4422 						}
4423 					}
4424 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4425 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4426 
4427 					if (vm_object_lock_upgrade(object) == FALSE) {
4428 						/*
4429 						 * couldn't upgrade, so explicitly take the lock
4430 						 * exclusively and go relookup the page since we
4431 						 * will have dropped the object lock and
4432 						 * a different thread could have inserted
4433 						 * a page at this offset
4434 						 * no need for a full retry since we're
4435 						 * at the top level of the object chain
4436 						 */
4437 						vm_object_lock(object);
4438 
4439 						continue;
4440 					}
4441 				}
4442 				if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4443 					/*
4444 					 * m->vmp_busy == TRUE and the object is locked exclusively
4445 					 * if m->pageout_queue == TRUE after we acquire the
4446 					 * queues lock, we are guaranteed that it is stable on
4447 					 * the pageout queue and therefore reclaimable
4448 					 *
4449 					 * NOTE: this is only true for the internal pageout queue
4450 					 * in the compressor world
4451 					 */
4452 					assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4453 
4454 					vm_page_lock_queues();
4455 
4456 					if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4457 						vm_pageout_throttle_up(m);
4458 						vm_page_unlock_queues();
4459 
4460 						PAGE_WAKEUP_DONE(m);
4461 						goto reclaimed_from_pageout;
4462 					}
4463 					vm_page_unlock_queues();
4464 				}
4465 				if (object != cur_object) {
4466 					vm_object_unlock(object);
4467 				}
4468 
4469 				vm_map_unlock_read(map);
4470 				if (real_map != map) {
4471 					vm_map_unlock(real_map);
4472 				}
4473 
4474 				result = PAGE_ASSERT_WAIT(m, interruptible);
4475 
4476 				vm_object_unlock(cur_object);
4477 
4478 				if (result == THREAD_WAITING) {
4479 					result = thread_block(THREAD_CONTINUE_NULL);
4480 				}
4481 				if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
4482 					goto RetryFault;
4483 				}
4484 
4485 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
4486 				kr = KERN_ABORTED;
4487 				goto done;
4488 			}
4489 reclaimed_from_pageout:
4490 			if (m->vmp_laundry) {
4491 				if (object != cur_object) {
4492 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4493 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4494 
4495 						vm_object_unlock(object);
4496 						vm_object_unlock(cur_object);
4497 
4498 						vm_map_unlock_read(map);
4499 						if (real_map != map) {
4500 							vm_map_unlock(real_map);
4501 						}
4502 
4503 						goto RetryFault;
4504 					}
4505 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4506 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4507 
4508 					if (vm_object_lock_upgrade(object) == FALSE) {
4509 						/*
4510 						 * couldn't upgrade, so explicitly take the lock
4511 						 * exclusively and go relookup the page since we
4512 						 * will have dropped the object lock and
4513 						 * a different thread could have inserted
4514 						 * a page at this offset
4515 						 * no need for a full retry since we're
4516 						 * at the top level of the object chain
4517 						 */
4518 						vm_object_lock(object);
4519 
4520 						continue;
4521 					}
4522 				}
4523 				vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
4524 				vm_pageout_steal_laundry(m, FALSE);
4525 			}
4526 
4527 
4528 			if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
4529 				/*
4530 				 * Guard page: let the slow path deal with it
4531 				 */
4532 				break;
4533 			}
4534 			if (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
4535 				/*
4536 				 * Unusual case... let the slow path deal with it
4537 				 */
4538 				break;
4539 			}
4540 			if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
4541 				if (object != cur_object) {
4542 					vm_object_unlock(object);
4543 				}
4544 				vm_map_unlock_read(map);
4545 				if (real_map != map) {
4546 					vm_map_unlock(real_map);
4547 				}
4548 				vm_object_unlock(cur_object);
4549 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
4550 				kr = KERN_MEMORY_ERROR;
4551 				goto done;
4552 			}
4553 			assert(m_object == VM_PAGE_OBJECT(m));
4554 
4555 			if (vm_fault_cs_need_validation(map->pmap, m, m_object,
4556 			    PAGE_SIZE, 0) ||
4557 			    (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
4558 upgrade_lock_and_retry:
4559 				/*
4560 				 * We might need to validate this page
4561 				 * against its code signature, so we
4562 				 * want to hold the VM object exclusively.
4563 				 */
4564 				if (object != cur_object) {
4565 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4566 						vm_object_unlock(object);
4567 						vm_object_unlock(cur_object);
4568 
4569 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4570 
4571 						vm_map_unlock_read(map);
4572 						if (real_map != map) {
4573 							vm_map_unlock(real_map);
4574 						}
4575 
4576 						goto RetryFault;
4577 					}
4578 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4579 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4580 
4581 					if (vm_object_lock_upgrade(object) == FALSE) {
4582 						/*
4583 						 * couldn't upgrade, so explicitly take the lock
4584 						 * exclusively and go relookup the page since we
4585 						 * will have dropped the object lock and
4586 						 * a different thread could have inserted
4587 						 * a page at this offset
4588 						 * no need for a full retry since we're
4589 						 * at the top level of the object chain
4590 						 */
4591 						vm_object_lock(object);
4592 
4593 						continue;
4594 					}
4595 				}
4596 			}
4597 			/*
4598 			 *	Two cases of map in faults:
4599 			 *	    - At top level w/o copy object.
4600 			 *	    - Read fault anywhere.
4601 			 *		--> must disallow write.
4602 			 */
4603 
4604 			if (object == cur_object && object->copy == VM_OBJECT_NULL) {
4605 				goto FastPmapEnter;
4606 			}
4607 
4608 			if (!need_copy &&
4609 			    !fault_info.no_copy_on_read &&
4610 			    cur_object != object &&
4611 			    !cur_object->internal &&
4612 			    !cur_object->pager_trusted &&
4613 			    vm_protect_privileged_from_untrusted &&
4614 			    !cur_object->code_signed &&
4615 			    current_proc_is_privileged()) {
4616 				/*
4617 				 * We're faulting on a page in "object" and
4618 				 * went down the shadow chain to "cur_object"
4619 				 * to find out that "cur_object"'s pager
4620 				 * is not "trusted", i.e. we can not trust it
4621 				 * to always return the same contents.
4622 				 * Since the target is a "privileged" process,
4623 				 * let's treat this as a copy-on-read fault, as
4624 				 * if it was a copy-on-write fault.
4625 				 * Once "object" gets a copy of this page, it
4626 				 * won't have to rely on "cur_object" to
4627 				 * provide the contents again.
4628 				 *
4629 				 * This is done by setting "need_copy" and
4630 				 * retrying the fault from the top with the
4631 				 * appropriate locking.
4632 				 *
4633 				 * Special case: if the mapping is executable
4634 				 * and the untrusted object is code-signed and
4635 				 * the process is "cs_enforced", we do not
4636 				 * copy-on-read because that would break
4637 				 * code-signing enforcement expectations (an
4638 				 * executable page must belong to a code-signed
4639 				 * object) and we can rely on code-signing
4640 				 * to re-validate the page if it gets evicted
4641 				 * and paged back in.
4642 				 */
4643 //				printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4644 				vm_copied_on_read++;
4645 				need_copy = TRUE;
4646 
4647 				vm_object_unlock(object);
4648 				vm_object_unlock(cur_object);
4649 				object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4650 				vm_map_unlock_read(map);
4651 				if (real_map != map) {
4652 					vm_map_unlock(real_map);
4653 				}
4654 				goto RetryFault;
4655 			}
4656 
4657 			if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
4658 				if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
4659 					prot &= ~VM_PROT_WRITE;
4660 				} else {
4661 					/*
4662 					 * For a protection that the pmap cares
4663 					 * about, we must hand over the full
4664 					 * set of protections (so that the pmap
4665 					 * layer can apply any desired policy).
4666 					 * This means that cs_bypass must be
4667 					 * set, as this can force us to pass
4668 					 * RWX.
4669 					 */
4670 					assert(fault_info.cs_bypass);
4671 				}
4672 
4673 				if (object != cur_object) {
4674 					/*
4675 					 * We still need to hold the top object
4676 					 * lock here to prevent a race between
4677 					 * a read fault (taking only "shared"
4678 					 * locks) and a write fault (taking
4679 					 * an "exclusive" lock on the top
4680 					 * object).
4681 					 * Otherwise, as soon as we release the
4682 					 * top lock, the write fault could
4683 					 * proceed and actually complete before
4684 					 * the read fault, and the copied page's
4685 					 * translation could then be overwritten
4686 					 * by the read fault's translation for
4687 					 * the original page.
4688 					 *
4689 					 * Let's just record what the top object
4690 					 * is and we'll release it later.
4691 					 */
4692 					top_object = object;
4693 
4694 					/*
4695 					 * switch to the object that has the new page
4696 					 */
4697 					object = cur_object;
4698 					object_lock_type = cur_object_lock_type;
4699 				}
4700 FastPmapEnter:
4701 				assert(m_object == VM_PAGE_OBJECT(m));
4702 
4703 				/*
4704 				 * prepare for the pmap_enter...
4705 				 * object and map are both locked
4706 				 * m contains valid data
4707 				 * object == m->vmp_object
4708 				 * cur_object == NULL or it's been unlocked
4709 				 * no paging references on either object or cur_object
4710 				 */
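				/*
				 * Decide whether vm_fault_enter() may ask for a retry:
				 * we only allow it while we still hold a second ("top")
				 * object lock or merely a shared lock on this object,
				 * since in those cases the pmap layer can't safely block
				 * to expand the page table (see the need_retry handling
				 * below, which re-drives the fault after dropping all
				 * locks).
				 */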
4711 				if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4712 					need_retry_ptr = &need_retry;
4713 				} else {
4714 					need_retry_ptr = NULL;
4715 				}
4716 
4717 				if (fault_page_size < PAGE_SIZE) {
4718 					DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
4719 					assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
4720 					    fault_phys_offset < PAGE_SIZE),
4721 					    "0x%llx\n", (uint64_t)fault_phys_offset);
4722 				} else {
4723 					assertf(fault_phys_offset == 0,
4724 					    "0x%llx\n", (uint64_t)fault_phys_offset);
4725 				}
4726 
4727 				assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p object=%p", m, m_object, object);
4728 				assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
4729 				if (caller_pmap) {
4730 					kr = vm_fault_enter(m,
4731 					    caller_pmap,
4732 					    caller_pmap_addr,
4733 					    fault_page_size,
4734 					    fault_phys_offset,
4735 					    prot,
4736 					    caller_prot,
4737 					    wired,
4738 					    change_wiring,
4739 					    wire_tag,
4740 					    &fault_info,
4741 					    need_retry_ptr,
4742 					    &type_of_fault);
4743 				} else {
4744 					kr = vm_fault_enter(m,
4745 					    pmap,
4746 					    vaddr,
4747 					    fault_page_size,
4748 					    fault_phys_offset,
4749 					    prot,
4750 					    caller_prot,
4751 					    wired,
4752 					    change_wiring,
4753 					    wire_tag,
4754 					    &fault_info,
4755 					    need_retry_ptr,
4756 					    &type_of_fault);
4757 				}
4758 
4759 				vm_fault_complete(
4760 					map,
4761 					real_map,
4762 					object,
4763 					m_object,
4764 					m,
4765 					offset,
4766 					trace_real_vaddr,
4767 					&fault_info,
4768 					caller_prot,
4769 					real_vaddr,
4770 					vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
4771 					need_retry,
4772 					kr,
4773 					physpage_p,
4774 					prot,
4775 					top_object,
4776 					need_collapse,
4777 					cur_offset,
4778 					fault_type,
4779 					&written_on_object,
4780 					&written_on_pager,
4781 					&written_on_offset);
4782 				top_object = VM_OBJECT_NULL;
4783 				if (need_retry == TRUE) {
4784 					/*
4785 					 * vm_fault_enter couldn't complete the PMAP_ENTER...
4786 					 * at this point we don't hold any locks so it's safe
4787 					 * to ask the pmap layer to expand the page table to
4788 					 * accommodate this mapping... once expanded, we'll
4789 					 * re-drive the fault which should result in vm_fault_enter
4790 					 * being able to successfully enter the mapping this time around
4791 					 */
4792 					(void)pmap_enter_options(
4793 						pmap, vaddr, 0, 0, 0, 0, 0,
4794 						PMAP_OPTIONS_NOENTER, NULL);
4795 
4796 					need_retry = FALSE;
4797 					goto RetryFault;
4798 				}
4799 				goto done;
4800 			}
4801 			/*
4802 			 * COPY ON WRITE FAULT
4803 			 */
4804 			assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4805 
4806 			/*
4807 			 * If objects match, then
4808 			 * object->copy must not be NULL (else control
4809 			 * would be in previous code block), and we
4810 			 * have a potential push into the copy object,
4811 			 * which we can't cope with here.
4812 			 */
4813 			if (cur_object == object) {
4814 				/*
4815 				 * must take the slow path to
4816 				 * deal with the copy push
4817 				 */
4818 				break;
4819 			}
4820 
4821 			/*
4822 			 * This is now a shadow based copy on write
4823 			 * fault -- it requires a copy up the shadow
4824 			 * chain.
4825 			 */
4826 			assert(m_object == VM_PAGE_OBJECT(m));
4827 
4828 			if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4829 			    vm_fault_cs_need_validation(NULL, m, m_object,
4830 			    PAGE_SIZE, 0)) {
4831 				goto upgrade_lock_and_retry;
4832 			}
4833 
4834 #if MACH_ASSERT
4835 			if (resilient_media_retry &&
4836 			    vm_fault_resilient_media_inject_error2_rate != 0 &&
4837 			    (++vm_fault_resilient_media_inject_error2 % vm_fault_resilient_media_inject_error2_rate) == 0) {
4838 				/* inject an error */
4839 				cur_m = m;
4840 				m = VM_PAGE_NULL;
4841 				m_object = VM_OBJECT_NULL;
4842 				break;
4843 			}
4844 #endif /* MACH_ASSERT */
4845 			/*
4846 			 * Allocate a page in the original top level
4847 			 * object. Give up if allocate fails.  Also
4848 			 * need to remember current page, as it's the
4849 			 * source of the copy.
4850 			 *
4851 			 * at this point we hold locks on both
4852 			 * object and cur_object... no need to take
4853 			 * paging refs or mark pages BUSY since
4854 			 * we don't drop either object lock until
4855 			 * the page has been copied and inserted
4856 			 */
4857 			cur_m = m;
4858 			m = vm_page_grab_options(grab_options);
4859 			m_object = NULL;
4860 
4861 			if (m == VM_PAGE_NULL) {
4862 				/*
4863 				 * no free page currently available...
4864 				 * must take the slow path
4865 				 */
4866 				break;
4867 			}
4868 
4869 			/*
4870 			 * Now do the copy.  Mark the source page busy...
4871 			 *
4872 			 *	NOTE: This code holds the map lock across
4873 			 *	the page copy.
4874 			 */
4875 			vm_page_copy(cur_m, m);
4876 			vm_page_insert(m, object, vm_object_trunc_page(offset));
4877 			if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
4878 				DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4879 			}
4880 			m_object = object;
4881 			SET_PAGE_DIRTY(m, FALSE);
4882 
4883 			/*
4884 			 * Now cope with the source page and object
4885 			 */
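			/*
			 * If the top object is referenced by more than one map,
			 * other pmaps may still hold translations for the source
			 * page; disconnect it so that any such mapping re-faults
			 * and resolves to the right page (the original or the copy
			 * we just made).
			 */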
4886 			if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4887 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4888 			} else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
4889 				/*
4890 				 * We've copied the full 16K page but we're
4891 				 * about to call vm_fault_enter() only for
4892 				 * the 4K chunk we're faulting on.  The other
4893 				 * three 4K chunks in that page could still
4894 				 * be pmapped in this pmap.
4895 				 * Since the VM object layer thinks that the
4896 				 * entire page has been dealt with and the
4897 				 * original page might no longer be needed,
4898 				 * it might collapse/bypass the original VM
4899 				 * object and free its pages, which would be
4900 				 * bad (and would trigger pmap_verify_free()
4901 				 * assertions) if the other 4K chunks are still
4902 				 * pmapped.
4903 				 */
4904 				/*
4905 						 * XXX FBDP TODO4K: to be revisited
4906 				 * Technically, we need to pmap_disconnect()
4907 				 * only the target pmap's mappings for the 4K
4908 				 * chunks of this 16K VM page.  If other pmaps
4909 				 * have PTEs on these chunks, that means that
4910 				 * the associated VM map must have a reference
4911 				 * on the VM object, so no need to worry about
4912 				 * those.
4913 				 * pmap_protect() for each 4K chunk would be
4914 				 * better but we'd have to check which chunks
4915 				 * are actually mapped before and after this
4916 				 * one.
4917 				 * A full-blown pmap_disconnect() is easier
4918 				 * for now but not efficient.
4919 				 */
4920 				DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
4921 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4922 			}
4923 
4924 			if (cur_m->vmp_clustered) {
4925 				VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4926 				VM_PAGE_CONSUME_CLUSTERED(cur_m);
4927 				vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4928 			}
4929 			need_collapse = TRUE;
4930 
4931 			if (!cur_object->internal &&
4932 			    cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4933 				/*
4934 				 * The object from which we've just
4935 				 * copied a page is most probably backed
4936 				 * by a vnode.  We don't want to waste too
4937 				 * much time trying to collapse the VM objects
4938 				 * and create a bottleneck when several tasks
4939 				 * map the same file.
4940 				 */
4941 				if (cur_object->copy == object) {
4942 					/*
4943 					 * Shared mapping or no COW yet.
4944 					 * We can never collapse a copy
4945 					 * object into its backing object.
4946 					 */
4947 					need_collapse = FALSE;
4948 				} else if (cur_object->copy == object->shadow &&
4949 				    object->shadow->resident_page_count == 0) {
4950 					/*
4951 					 * Shared mapping after a COW occurred.
4952 					 */
4953 					need_collapse = FALSE;
4954 				}
4955 			}
4956 			vm_object_unlock(cur_object);
4957 
4958 			if (need_collapse == FALSE) {
4959 				vm_fault_collapse_skipped++;
4960 			}
4961 			vm_fault_collapse_total++;
4962 
4963 			type_of_fault = DBG_COW_FAULT;
4964 			counter_inc(&vm_statistics_cow_faults);
4965 			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4966 			counter_inc(&current_task()->cow_faults);
4967 
4968 			goto FastPmapEnter;
4969 		} else {
4970 			/*
4971 			 * No page at cur_object, cur_offset... m == NULL
4972 			 */
4973 			if (cur_object->pager_created) {
4974 				vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4975 
4976 				if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4977 					int             my_fault_type;
4978 					uint8_t         c_flags = C_DONT_BLOCK;
4979 					bool            insert_cur_object = FALSE;
4980 
4981 					/*
4982 					 * May have to talk to a pager...
4983 					 * if so, take the slow path by
4984 					 * doing a 'break' from the while (TRUE) loop
4985 					 *
4986 					 * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4987 					 * if the compressor is active and the page exists there
4988 					 */
4989 					if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
4990 						break;
4991 					}
4992 
4993 					if (map == kernel_map || real_map == kernel_map) {
4994 						/*
4995 						 * can't call into the compressor with the kernel_map
4996 						 * lock held, since the compressor may try to operate
4997 						 * on the kernel map in order to return an empty c_segment
4998 						 */
4999 						break;
5000 					}
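					/*
					 * When the compressed page lives in a backing object
					 * (object != cur_object):
					 *  - for a write fault, keep the compressed copy
					 *    (C_KEEP) since the decompressed page will be
					 *    inserted into the top object only, and
					 *  - for a read fault, decompress straight into
					 *    cur_object and insert the page there.
					 */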
5001 					if (object != cur_object) {
5002 						if (fault_type & VM_PROT_WRITE) {
5003 							c_flags |= C_KEEP;
5004 						} else {
5005 							insert_cur_object = TRUE;
5006 						}
5007 					}
5008 					if (insert_cur_object == TRUE) {
5009 						if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5010 							cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5011 
5012 							if (vm_object_lock_upgrade(cur_object) == FALSE) {
5013 								/*
5014 								 * couldn't upgrade so go do a full retry
5015 								 * immediately since we can no longer be
5016 								 * certain about cur_object (we don't
5017 								 * hold a reference on it)...
5018 								 * first drop the top object lock
5019 								 */
5020 								vm_object_unlock(object);
5021 
5022 								vm_map_unlock_read(map);
5023 								if (real_map != map) {
5024 									vm_map_unlock(real_map);
5025 								}
5026 
5027 								goto RetryFault;
5028 							}
5029 						}
5030 					} else if (object_lock_type == OBJECT_LOCK_SHARED) {
5031 						object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5032 
5033 						if (object != cur_object) {
5034 							/*
5035 							 * we can't go for the upgrade on the top
5036 							 * lock since the upgrade may block waiting
5037 							 * for readers to drain... since we hold
5038 							 * cur_object locked at this point, waiting
5039 							 * for the readers to drain would represent
5040 							 * a lock order inversion since the lock order
5041 							 * for objects is the reference order in the
5042 							 * shadow chain
5043 							 */
5044 							vm_object_unlock(object);
5045 							vm_object_unlock(cur_object);
5046 
5047 							vm_map_unlock_read(map);
5048 							if (real_map != map) {
5049 								vm_map_unlock(real_map);
5050 							}
5051 
5052 							goto RetryFault;
5053 						}
5054 						if (vm_object_lock_upgrade(object) == FALSE) {
5055 							/*
5056 							 * couldn't upgrade, so explicitly take the lock
5057 							 * exclusively and go relookup the page since we
5058 							 * will have dropped the object lock and
5059 							 * a different thread could have inserted
5060 							 * a page at this offset;
5061 							 * no need for a full retry since we're
5062 							 * at the top level of the object chain
5063 							 */
5064 							vm_object_lock(object);
5065 
5066 							continue;
5067 						}
5068 					}
5069 					m = vm_page_grab_options(grab_options);
5070 					m_object = NULL;
5071 
5072 					if (m == VM_PAGE_NULL) {
5073 						/*
5074 						 * no free page currently available...
5075 						 * must take the slow path
5076 						 */
5077 						break;
5078 					}
5079 
5080 					/*
5081 					 * The object is and remains locked
5082 					 * so no need to take a
5083 					 * "paging_in_progress" reference.
5084 					 */
5085 					bool      shared_lock;
5086 					if ((object == cur_object &&
5087 					    object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
5088 					    (object != cur_object &&
5089 					    cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
5090 						shared_lock = FALSE;
5091 					} else {
5092 						shared_lock = TRUE;
5093 					}
5094 
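					/*
					 * Decompress the compressed copy directly into the
					 * freshly grabbed page; compressed_count_delta reports
					 * how the pager's compressed-page count changed so it
					 * can be credited to cur_object right below.
					 */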
5095 					kr = vm_compressor_pager_get(
5096 						cur_object->pager,
5097 						(vm_object_trunc_page(cur_offset)
5098 						+ cur_object->paging_offset),
5099 						VM_PAGE_GET_PHYS_PAGE(m),
5100 						&my_fault_type,
5101 						c_flags,
5102 						&compressed_count_delta);
5103 
5104 					vm_compressor_pager_count(
5105 						cur_object->pager,
5106 						compressed_count_delta,
5107 						shared_lock,
5108 						cur_object);
5109 
5110 					if (kr != KERN_SUCCESS) {
5111 						vm_page_release(m, FALSE);
5112 						m = VM_PAGE_NULL;
5113 					}
5114 					/*
5115 					 * If vm_compressor_pager_get() returns
5116 					 * KERN_MEMORY_FAILURE, then the
5117 					 * compressed data is permanently lost,
5118 					 * so return this error immediately.
5119 					 */
5120 					if (kr == KERN_MEMORY_FAILURE) {
5121 						if (object != cur_object) {
5122 							vm_object_unlock(cur_object);
5123 						}
5124 						vm_object_unlock(object);
5125 						vm_map_unlock_read(map);
5126 						if (real_map != map) {
5127 							vm_map_unlock(real_map);
5128 						}
5129 
5130 						goto done;
5131 					} else if (kr != KERN_SUCCESS) {
5132 						break;
5133 					}
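					/*
					 * The page was just decompressed: mark it dirty so it
					 * is never treated as a clean copy that could simply
					 * be discarded (its only other copy may have been the
					 * compressed one we just consumed).
					 */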
5134 					m->vmp_dirty = TRUE;
5135 
5136 					/*
5137 					 * If the object is purgeable, its
5138 					 * owner's purgeable ledgers will be
5139 					 * updated in vm_page_insert() but the
5140 					 * page was also accounted for in a
5141 					 * "compressed purgeable" ledger, so
5142 					 * update that now.
5143 					 */
5144 					if (object != cur_object &&
5145 					    !insert_cur_object) {
5146 						/*
5147 						 * We're not going to insert
5148 						 * the decompressed page into
5149 						 * the object it came from.
5150 						 *
5151 						 * We're dealing with a
5152 						 * copy-on-write fault on
5153 						 * "object".
5154 						 * We're going to decompress
5155 						 * the page directly into the
5156 						 * target "object" while
						 * keeping the compressed
5158 						 * page for "cur_object", so
5159 						 * no ledger update in that
5160 						 * case.
5161 						 */
5162 					} else if (((cur_object->purgable ==
5163 					    VM_PURGABLE_DENY) &&
5164 					    (!cur_object->vo_ledger_tag)) ||
5165 					    (cur_object->vo_owner ==
5166 					    NULL)) {
5167 						/*
5168 						 * "cur_object" is not purgeable
5169 						 * and is not ledger-tagged, or
5170 						 * there's no owner for it,
5171 						 * so no owner's ledgers to
5172 						 * update.
5173 						 */
5174 					} else {
5175 						/*
5176 						 * One less compressed
5177 						 * purgeable/tagged page for
5178 						 * cur_object's owner.
5179 						 */
5180 						vm_object_owner_compressed_update(
5181 							cur_object,
5182 							-1);
5183 					}
5184 
5185 					if (insert_cur_object) {
5186 						vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
5187 						m_object = cur_object;
5188 					} else {
5189 						vm_page_insert(m, object, vm_object_trunc_page(offset));
5190 						m_object = object;
5191 					}
5192 
5193 					if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
5194 						/*
5195 						 * If the page is not cacheable,
5196 						 * we can't let its contents
5197 						 * linger in the data cache
5198 						 * after the decompression.
5199 						 */
5200 						pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
5201 					}
5202 
5203 					type_of_fault = my_fault_type;
5204 
5205 					VM_STAT_DECOMPRESSIONS();
5206 
5207 					if (cur_object != object) {
5208 						if (insert_cur_object) {
5209 							top_object = object;
5210 							/*
5211 							 * switch to the object that has the new page
5212 							 */
5213 							object = cur_object;
5214 							object_lock_type = cur_object_lock_type;
5215 						} else {
5216 							vm_object_unlock(cur_object);
5217 							cur_object = object;
5218 						}
5219 					}
5220 					goto FastPmapEnter;
5221 				}
5222 				/*
5223 				 * existence map is present and indicates
5224 				 * that the pager doesn't have this page
5225 				 */
5226 			}
5227 			if (cur_object->shadow == VM_OBJECT_NULL ||
5228 			    resilient_media_retry) {
5229 				/*
5230 				 * Zero fill fault.  Page gets
5231 				 * inserted into the original object.
5232 				 */
5233 				if (cur_object->shadow_severed ||
5234 				    VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
5235 				    cur_object == compressor_object ||
5236 				    cur_object == kernel_object) {
5237 					if (object != cur_object) {
5238 						vm_object_unlock(cur_object);
5239 					}
5240 					vm_object_unlock(object);
5241 
5242 					vm_map_unlock_read(map);
5243 					if (real_map != map) {
5244 						vm_map_unlock(real_map);
5245 					}
5246 					if (VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) {
5247 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
5248 					}
5249 
5250 					if (cur_object->shadow_severed) {
5251 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
5252 					}
5253 
5254 					kr = KERN_MEMORY_ERROR;
5255 					goto done;
5256 				}
5257 				if (cur_object != object) {
5258 					vm_object_unlock(cur_object);
5259 
5260 					cur_object = object;
5261 				}
5262 				if (object_lock_type == OBJECT_LOCK_SHARED) {
5263 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5264 
5265 					if (vm_object_lock_upgrade(object) == FALSE) {
5266 						/*
5267 						 * couldn't upgrade so do a full retry on the fault
5268 						 * since we dropped the object lock which
5269 						 * could allow another thread to insert
5270 						 * a page at this offset
5271 						 */
5272 						vm_map_unlock_read(map);
5273 						if (real_map != map) {
5274 							vm_map_unlock(real_map);
5275 						}
5276 
5277 						goto RetryFault;
5278 					}
5279 				}
5280 				if (!object->internal) {
5281 					panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5282 				}
5283 #if MACH_ASSERT
5284 				if (resilient_media_retry &&
5285 				    vm_fault_resilient_media_inject_error3_rate != 0 &&
5286 				    (++vm_fault_resilient_media_inject_error3 % vm_fault_resilient_media_inject_error3_rate) == 0) {
5287 					/* inject an error */
5288 					m_object = NULL;
5289 					break;
5290 				}
5291 #endif /* MACH_ASSERT */
5292 				m = vm_page_alloc(object, vm_object_trunc_page(offset));
5293 				m_object = NULL;
5294 
5295 				if (m == VM_PAGE_NULL) {
5296 					/*
5297 					 * no free page currently available...
5298 					 * must take the slow path
5299 					 */
5300 					break;
5301 				}
5302 				m_object = object;
5303 
5304 				if ((prot & VM_PROT_WRITE) &&
5305 				    !(fault_type & VM_PROT_WRITE) &&
5306 				    object->copy != VM_OBJECT_NULL) {
5307 					/*
5308 					 * This is not a write fault and
5309 					 * we might have a copy-on-write
5310 					 * obligation to honor (copy object or
5311 					 * "needs_copy" map entry), so do not
5312 					 * give write access yet.
5313 					 * We'll need to catch the first write
5314 					 * to resolve the copy-on-write by
5315 					 * pushing this page to a copy object
5316 					 * or making a shadow object.
5317 					 */
5318 					if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5319 						prot &= ~VM_PROT_WRITE;
5320 					} else {
5321 						assert(fault_info.cs_bypass);
5322 					}
5323 				}
5324 
5325 				/*
5326 				 * Zeroing the page and entering it into the pmap
5327 				 * represents a significant amount of the zero fill fault handler's work.
5328 				 *
5329 				 * To improve fault scalability, we'll drop the object lock, if it appears contended,
5330 				 * now that we've inserted the page into the vm object.
5331 				 * Before dropping the lock, we need to check protection bits and set the
5332 				 * mapped bits on the page. Then we can mark the page busy, drop the lock,
5333 				 * zero it, and do the pmap enter. We'll need to reacquire the lock
5334 				 * to clear the busy bit and wake up any waiters.
5335 				 */
5336 				vm_fault_cs_clear(m);
5337 				m->vmp_pmapped = TRUE;
5338 				if (map->no_zero_fill) {
5339 					type_of_fault = DBG_NZF_PAGE_FAULT;
5340 				} else {
5341 					type_of_fault = DBG_ZERO_FILL_FAULT;
5342 				}
5343 				{
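					/*
					 * Pick the pmap and address to enter the zero-filled
					 * page into: the caller-supplied pmap/address if one
					 * was provided, otherwise the faulting map's pmap and
					 * the faulting address.  When we're only changing the
					 * wiring, there is no access type to simulate.
					 */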
5344 					pmap_t destination_pmap;
5345 					vm_map_offset_t destination_pmap_vaddr;
5346 					vm_prot_t enter_fault_type;
5347 					if (caller_pmap) {
5348 						destination_pmap = caller_pmap;
5349 						destination_pmap_vaddr = caller_pmap_addr;
5350 					} else {
5351 						destination_pmap = pmap;
5352 						destination_pmap_vaddr = vaddr;
5353 					}
5354 					if (change_wiring) {
5355 						enter_fault_type = VM_PROT_NONE;
5356 					} else {
5357 						enter_fault_type = caller_prot;
5358 					}
5359 					assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
5360 					kr = vm_fault_enter_prepare(m,
5361 					    destination_pmap,
5362 					    destination_pmap_vaddr,
5363 					    &prot,
5364 					    caller_prot,
5365 					    fault_page_size,
5366 					    fault_phys_offset,
5367 					    change_wiring,
5368 					    enter_fault_type,
5369 					    &fault_info,
5370 					    &type_of_fault,
5371 					    &page_needs_data_sync);
5372 					if (kr != KERN_SUCCESS) {
5373 						goto zero_fill_cleanup;
5374 					}
5375 
5376 					if (object_is_contended) {
5377 						/*
5378 						 * At this point the page is in the vm object, but not on a paging queue.
5379 						 * Since it's accessible to another thread but its contents are invalid
						 * (it hasn't been zeroed), mark it busy before dropping the object lock.
5381 						 */
5382 						m->vmp_busy = TRUE;
5383 						vm_object_unlock(object);
5384 					}
5385 					if (type_of_fault == DBG_ZERO_FILL_FAULT) {
5386 						/*
5387 						 * Now zero fill page...
5388 						 * the page is probably going to
5389 						 * be written soon, so don't bother
5390 						 * to clear the modified bit
5391 						 *
5392 						 *   NOTE: This code holds the map
5393 						 *   lock across the zero fill.
5394 						 */
5395 						vm_page_zero_fill(m);
5396 						counter_inc(&vm_statistics_zero_fill_count);
5397 						DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
5398 					}
5399 					if (page_needs_data_sync) {
5400 						pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
5401 					}
5402 
5403 					if (top_object != VM_OBJECT_NULL) {
5404 						need_retry_ptr = &need_retry;
5405 					} else {
5406 						need_retry_ptr = NULL;
5407 					}
5408 					if (object_is_contended) {
5409 						kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
5410 						    fault_page_size, fault_phys_offset,
5411 						    m, &prot, caller_prot, enter_fault_type, wired,
5412 						    fault_info.pmap_options, need_retry_ptr);
5413 						vm_object_lock(object);
5414 					} else {
5415 						kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
5416 						    fault_page_size, fault_phys_offset,
5417 						    m, &prot, caller_prot, enter_fault_type, wired,
5418 						    fault_info.pmap_options, need_retry_ptr);
5419 					}
5420 				}
5421 zero_fill_cleanup:
5422 				if (!VM_DYNAMIC_PAGING_ENABLED() &&
5423 				    (object->purgable == VM_PURGABLE_DENY ||
5424 				    object->purgable == VM_PURGABLE_NONVOLATILE ||
5425 				    object->purgable == VM_PURGABLE_VOLATILE)) {
5426 					vm_page_lockspin_queues();
5427 					if (!VM_DYNAMIC_PAGING_ENABLED()) {
5428 						vm_fault_enqueue_throttled_locked(m);
5429 					}
5430 					vm_page_unlock_queues();
5431 				}
5432 				vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);
5433 
5434 				vm_fault_complete(
5435 					map,
5436 					real_map,
5437 					object,
5438 					m_object,
5439 					m,
5440 					offset,
5441 					trace_real_vaddr,
5442 					&fault_info,
5443 					caller_prot,
5444 					real_vaddr,
5445 					type_of_fault,
5446 					need_retry,
5447 					kr,
5448 					physpage_p,
5449 					prot,
5450 					top_object,
5451 					need_collapse,
5452 					cur_offset,
5453 					fault_type,
5454 					&written_on_object,
5455 					&written_on_pager,
5456 					&written_on_offset);
5457 				top_object = VM_OBJECT_NULL;
5458 				if (need_retry == TRUE) {
5459 					/*
5460 					 * vm_fault_enter couldn't complete the PMAP_ENTER...
5461 					 * at this point we don't hold any locks so it's safe
5462 					 * to ask the pmap layer to expand the page table to
5463 					 * accommodate this mapping... once expanded, we'll
5464 					 * re-drive the fault which should result in vm_fault_enter
5465 					 * being able to successfully enter the mapping this time around
5466 					 */
5467 					(void)pmap_enter_options(
5468 						pmap, vaddr, 0, 0, 0, 0, 0,
5469 						PMAP_OPTIONS_NOENTER, NULL);
5470 
5471 					need_retry = FALSE;
5472 					goto RetryFault;
5473 				}
5474 				goto done;
5475 			}
5476 			/*
5477 			 * On to the next level in the shadow chain
5478 			 */
5479 			cur_offset += cur_object->vo_shadow_offset;
5480 			new_object = cur_object->shadow;
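			/*
			 * fault_phys_offset is the offset of the faulting chunk
			 * within the VM page at this level; it can only be non-zero
			 * when the map's page size is smaller than PAGE_SIZE.
			 */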
5481 			fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
5482 
5483 			/*
5484 			 * take the new_object's lock with the indicated state
5485 			 */
5486 			if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5487 				vm_object_lock_shared(new_object);
5488 			} else {
5489 				vm_object_lock(new_object);
5490 			}
5491 
5492 			if (cur_object != object) {
5493 				vm_object_unlock(cur_object);
5494 			}
5495 
5496 			cur_object = new_object;
5497 
5498 			continue;
5499 		}
5500 	}
5501 	/*
5502 	 * Cleanup from fast fault failure.  Drop any object
5503 	 * lock other than original and drop map lock.
5504 	 */
5505 	if (object != cur_object) {
5506 		vm_object_unlock(cur_object);
5507 	}
5508 
5509 	/*
5510 	 * must own the object lock exclusively at this point
5511 	 */
5512 	if (object_lock_type == OBJECT_LOCK_SHARED) {
5513 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5514 
5515 		if (vm_object_lock_upgrade(object) == FALSE) {
5516 			/*
5517 			 * couldn't upgrade, so explicitly
5518 			 * take the lock exclusively;
5519 			 * no need to retry the fault at this
5520 			 * point since "vm_fault_page" will
5521 			 * completely re-evaluate the state
5522 			 */
5523 			vm_object_lock(object);
5524 		}
5525 	}
5526 
5527 handle_copy_delay:
5528 	vm_map_unlock_read(map);
5529 	if (real_map != map) {
5530 		vm_map_unlock(real_map);
5531 	}
5532 
5533 	if (__improbable(object == compressor_object ||
5534 	    object == kernel_object)) {
5535 		/*
5536 		 * These objects are explicitly managed and populated by the
5537 		 * kernel.  The virtual ranges backed by these objects should
5538 		 * either have wired pages or "holes" that are not supposed to
5539 		 * be accessed at all until they get explicitly populated.
5540 		 * We should never have to resolve a fault on a mapping backed
5541 		 * by one of these VM objects and providing a zero-filled page
5542 		 * would be wrong here, so let's fail the fault and let the
5543 		 * caller crash or recover.
5544 		 */
5545 		vm_object_unlock(object);
5546 		kr = KERN_MEMORY_ERROR;
5547 		goto done;
5548 	}
5549 
5550 	resilient_media_ref_transfer = false;
5551 	if (resilient_media_retry) {
5552 		/*
5553 		 * We could get here if we failed to get a free page
5554 		 * to zero-fill and had to take the slow path again.
5555 		 * Reset our "recovery-from-failed-media" state.
5556 		 */
5557 		assert(resilient_media_object != VM_OBJECT_NULL);
5558 		assert(resilient_media_offset != (vm_object_offset_t)-1);
5559 		/* release our extra reference on failed object */
5560 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5561 		if (object == resilient_media_object) {
5562 			/*
5563 			 * We're holding "object"'s lock, so we can't release
5564 			 * our extra reference at this point.
5565 			 * We need an extra reference on "object" anyway
5566 			 * (see below), so let's just transfer this reference.
5567 			 */
5568 			resilient_media_ref_transfer = true;
5569 		} else {
5570 			vm_object_lock_assert_notheld(resilient_media_object);
5571 			vm_object_deallocate(resilient_media_object);
5572 		}
5573 		resilient_media_object = VM_OBJECT_NULL;
5574 		resilient_media_offset = (vm_object_offset_t)-1;
5575 		resilient_media_retry = false;
5576 		vm_fault_resilient_media_abort2++;
5577 	}
5578 
5579 	/*
5580 	 * Make a reference to this object to
5581 	 * prevent its disposal while we are messing with
5582 	 * it.  Once we have the reference, the map is free
5583 	 * to be diddled.  Since objects reference their
5584 	 * shadows (and copies), they will stay around as well.
5585 	 */
5586 	if (resilient_media_ref_transfer) {
5587 		/* we already have an extra reference on this object */
5588 		resilient_media_ref_transfer = false;
5589 	} else {
5590 		vm_object_reference_locked(object);
5591 	}
5592 	vm_object_paging_begin(object);
5593 
5594 	set_thread_pagein_error(cthread, 0);
5595 	error_code = 0;
5596 
5597 	result_page = VM_PAGE_NULL;
5598 	kr = vm_fault_page(object, offset, fault_type,
5599 	    (change_wiring && !wired),
5600 	    FALSE,                /* page not looked up */
5601 	    &prot, &result_page, &top_page,
5602 	    &type_of_fault,
5603 	    &error_code, map->no_zero_fill,
5604 	    &fault_info);
5605 
5606 	/*
5607 	 * if kr != VM_FAULT_SUCCESS, then the paging reference
5608 	 * has been dropped and the object unlocked... the ref_count
5609 	 * is still held
5610 	 *
5611 	 * if kr == VM_FAULT_SUCCESS, then the paging reference
5612 	 * is still held along with the ref_count on the original object
5613 	 *
5614 	 *	the object is returned locked with a paging reference
5615 	 *
5616 	 *	if top_page != NULL, then it's BUSY and the
5617 	 *	object it belongs to has a paging reference
5618 	 *	but is returned unlocked
5619 	 */
5620 	if (kr != VM_FAULT_SUCCESS &&
5621 	    kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
5622 		if (kr == VM_FAULT_MEMORY_ERROR &&
5623 		    fault_info.resilient_media) {
5624 			assertf(object->internal, "object %p", object);
5625 			/*
5626 			 * This fault failed but the mapping was
5627 			 * "media resilient", so we'll retry the fault in
5628 			 * recovery mode to get a zero-filled page in the
5629 			 * top object.
5630 			 * Keep the reference on the failing object so
5631 			 * that we can check that the mapping is still
5632 			 * pointing to it when we retry the fault.
5633 			 */
5634 //                     printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
5635 			assert(!resilient_media_retry); /* no double retry */
5636 			assert(resilient_media_object == VM_OBJECT_NULL);
5637 			assert(resilient_media_offset == (vm_object_offset_t)-1);
5638 			resilient_media_retry = true;
5639 			resilient_media_object = object;
5640 			resilient_media_offset = offset;
5641 //                     printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
5642 			vm_fault_resilient_media_initiate++;
5643 			goto RetryFault;
5644 		} else {
5645 			/*
5646 			 * we didn't succeed, lose the object reference
5647 			 * immediately.
5648 			 */
5649 			vm_object_deallocate(object);
5650 			object = VM_OBJECT_NULL; /* no longer valid */
5651 		}
5652 
5653 		/*
5654 		 * See why we failed, and take corrective action.
5655 		 */
5656 		switch (kr) {
5657 		case VM_FAULT_MEMORY_SHORTAGE:
5658 			if (vm_page_wait((change_wiring) ?
5659 			    THREAD_UNINT :
5660 			    THREAD_ABORTSAFE)) {
5661 				goto RetryFault;
5662 			}
5663 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_MEMORY_SHORTAGE), 0 /* arg */);
5664 			OS_FALLTHROUGH;
5665 		case VM_FAULT_INTERRUPTED:
5666 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
5667 			kr = KERN_ABORTED;
5668 			goto done;
5669 		case VM_FAULT_RETRY:
5670 			goto RetryFault;
5671 		case VM_FAULT_MEMORY_ERROR:
5672 			if (error_code) {
5673 				kr = error_code;
5674 			} else {
5675 				kr = KERN_MEMORY_ERROR;
5676 			}
5677 			goto done;
5678 		default:
5679 			panic("vm_fault: unexpected error 0x%x from "
5680 			    "vm_fault_page()\n", kr);
5681 		}
5682 	}
5683 	m = result_page;
5684 	m_object = NULL;
5685 
5686 	if (m != VM_PAGE_NULL) {
5687 		m_object = VM_PAGE_OBJECT(m);
5688 		assert((change_wiring && !wired) ?
5689 		    (top_page == VM_PAGE_NULL) :
5690 		    ((top_page == VM_PAGE_NULL) == (m_object == object)));
5691 	}
5692 
5693 	/*
5694 	 * What to do with the resulting page from vm_fault_page
5695 	 * if it doesn't get entered into the physical map:
5696 	 */
5697 #define RELEASE_PAGE(m)                                 \
5698 	MACRO_BEGIN                                     \
5699 	PAGE_WAKEUP_DONE(m);                            \
5700 	if ( !VM_PAGE_PAGEABLE(m)) {                    \
5701 	        vm_page_lockspin_queues();              \
5702 	        if ( !VM_PAGE_PAGEABLE(m))              \
5703 	                vm_page_activate(m);            \
5704 	        vm_page_unlock_queues();                \
5705 	}                                               \
5706 	MACRO_END
5707 
5708 
5709 	object_locks_dropped = FALSE;
5710 	/*
5711 	 * We must verify that the maps have not changed
5712 	 * since our last lookup. vm_map_verify() needs the
5713 	 * map lock (shared) but we are holding object locks.
5714 	 * So we do a try_lock() first and, if that fails, we
5715 	 * drop the object locks and go in for the map lock again.
5716 	 */
5717 	if (!vm_map_try_lock_read(original_map)) {
5718 		if (m != VM_PAGE_NULL) {
5719 			old_copy_object = m_object->copy;
5720 			vm_object_unlock(m_object);
5721 		} else {
5722 			old_copy_object = VM_OBJECT_NULL;
5723 			vm_object_unlock(object);
5724 		}
5725 
5726 		object_locks_dropped = TRUE;
5727 
5728 		vm_map_lock_read(original_map);
5729 	}
5730 
5731 	if ((map != original_map) || !vm_map_verify(map, &version)) {
5732 		if (object_locks_dropped == FALSE) {
5733 			if (m != VM_PAGE_NULL) {
5734 				old_copy_object = m_object->copy;
5735 				vm_object_unlock(m_object);
5736 			} else {
5737 				old_copy_object = VM_OBJECT_NULL;
5738 				vm_object_unlock(object);
5739 			}
5740 
5741 			object_locks_dropped = TRUE;
5742 		}
5743 
5744 		/*
5745 		 * no object locks are held at this point
5746 		 */
5747 		vm_object_t             retry_object;
5748 		vm_object_offset_t      retry_offset;
5749 		vm_prot_t               retry_prot;
5750 
5751 		/*
5752 		 * To avoid trying to write_lock the map while another
5753 		 * thread has it read_locked (in vm_map_pageable), we
5754 		 * do not try for write permission.  If the page is
5755 		 * still writable, we will get write permission.  If it
5756 		 * is not, or has been marked needs_copy, we enter the
5757 		 * mapping without write permission, and will merely
5758 		 * take another fault.
5759 		 */
5760 		map = original_map;
5761 
5762 		kr = vm_map_lookup_and_lock_object(&map, vaddr,
5763 		    fault_type & ~VM_PROT_WRITE,
5764 		    OBJECT_LOCK_EXCLUSIVE, &version,
5765 		    &retry_object, &retry_offset, &retry_prot,
5766 		    &wired,
5767 		    &fault_info,
5768 		    &real_map,
5769 		    NULL);
5770 		pmap = real_map->pmap;
5771 
5772 		if (kr != KERN_SUCCESS) {
5773 			vm_map_unlock_read(map);
5774 
5775 			if (m != VM_PAGE_NULL) {
5776 				assert(VM_PAGE_OBJECT(m) == m_object);
5777 
5778 				/*
5779 				 * retake the lock so that
5780 				 * we can drop the paging reference
5781 				 * in vm_fault_cleanup and do the
5782 				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5783 				 */
5784 				vm_object_lock(m_object);
5785 
5786 				RELEASE_PAGE(m);
5787 
5788 				vm_fault_cleanup(m_object, top_page);
5789 			} else {
5790 				/*
5791 				 * retake the lock so that
5792 				 * we can drop the paging reference
5793 				 * in vm_fault_cleanup
5794 				 */
5795 				vm_object_lock(object);
5796 
5797 				vm_fault_cleanup(object, top_page);
5798 			}
5799 			vm_object_deallocate(object);
5800 
5801 			if (kr == KERN_INVALID_ADDRESS) {
5802 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0 /* arg */);
5803 			}
5804 			goto done;
5805 		}
5806 		vm_object_unlock(retry_object);
5807 
5808 		if ((retry_object != object) || (retry_offset != offset)) {
5809 			vm_map_unlock_read(map);
5810 			if (real_map != map) {
5811 				vm_map_unlock(real_map);
5812 			}
5813 
5814 			if (m != VM_PAGE_NULL) {
5815 				assert(VM_PAGE_OBJECT(m) == m_object);
5816 
5817 				/*
5818 				 * retake the lock so that
5819 				 * we can drop the paging reference
5820 				 * in vm_fault_cleanup and do the
5821 				 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5822 				 */
5823 				vm_object_lock(m_object);
5824 
5825 				RELEASE_PAGE(m);
5826 
5827 				vm_fault_cleanup(m_object, top_page);
5828 			} else {
5829 				/*
5830 				 * retake the lock so that
5831 				 * we can drop the paging reference
5832 				 * in vm_fault_cleanup
5833 				 */
5834 				vm_object_lock(object);
5835 
5836 				vm_fault_cleanup(object, top_page);
5837 			}
5838 			vm_object_deallocate(object);
5839 
5840 			goto RetryFault;
5841 		}
5842 		/*
5843 		 * Check whether the protection has changed or the object
5844 		 * has been copied while we left the map unlocked.
5845 		 */
5846 		if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
5847 			/* If the pmap layer cares, pass the full set. */
5848 			prot = retry_prot;
5849 		} else {
5850 			prot &= retry_prot;
5851 		}
5852 	}
5853 
5854 	if (object_locks_dropped == TRUE) {
5855 		if (m != VM_PAGE_NULL) {
5856 			assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
5857 			assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5858 			vm_object_lock(m_object);
5859 
5860 			if (m_object->copy != old_copy_object) {
5861 				/*
5862 				 * The copy object changed while the top-level object
5863 				 * was unlocked, so take away write permission.
5864 				 */
5865 				assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
5866 				prot &= ~VM_PROT_WRITE;
5867 			}
5868 		} else {
5869 			vm_object_lock(object);
5870 		}
5871 
5872 		object_locks_dropped = FALSE;
5873 	}
5874 
5875 	if (!need_copy &&
5876 	    !fault_info.no_copy_on_read &&
5877 	    m != VM_PAGE_NULL &&
5878 	    VM_PAGE_OBJECT(m) != object &&
5879 	    !VM_PAGE_OBJECT(m)->pager_trusted &&
5880 	    vm_protect_privileged_from_untrusted &&
5881 	    !VM_PAGE_OBJECT(m)->code_signed &&
5882 	    current_proc_is_privileged()) {
5883 		/*
5884 		 * We found the page we want in an "untrusted" VM object
5885 		 * down the shadow chain.  Since the target is "privileged"
5886 		 * we want to perform a copy-on-read of that page, so that the
5887 		 * mapped object gets a stable copy and does not have to
5888 		 * rely on the "untrusted" object to provide the same
5889 		 * contents if the page gets reclaimed and has to be paged
5890 		 * in again later on.
5891 		 *
5892 		 * Special case: if the mapping is executable and the untrusted
5893 		 * object is code-signed and the process is "cs_enforced", we
5894 		 * do not copy-on-read because that would break code-signing
5895 		 * enforcement expectations (an executable page must belong
5896 		 * to a code-signed object) and we can rely on code-signing
5897 		 * to re-validate the page if it gets evicted and paged back in.
5898 		 */
5899 //		printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5900 		vm_copied_on_read++;
5901 		need_copy_on_read = TRUE;
5902 		need_copy = TRUE;
5903 	} else {
5904 		need_copy_on_read = FALSE;
5905 	}
5906 
5907 	/*
5908 	 * If we want to wire down this page, but no longer have
5909 	 * adequate permissions, we must start all over.
5910 	 * If we decided to copy-on-read, we must also start all over.
5911 	 */
5912 	if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
5913 	    need_copy_on_read) {
5914 		vm_map_unlock_read(map);
5915 		if (real_map != map) {
5916 			vm_map_unlock(real_map);
5917 		}
5918 
5919 		if (m != VM_PAGE_NULL) {
5920 			assert(VM_PAGE_OBJECT(m) == m_object);
5921 
5922 			RELEASE_PAGE(m);
5923 
5924 			vm_fault_cleanup(m_object, top_page);
5925 		} else {
5926 			vm_fault_cleanup(object, top_page);
5927 		}
5928 
5929 		vm_object_deallocate(object);
5930 
5931 		goto RetryFault;
5932 	}
5933 	if (m != VM_PAGE_NULL) {
5934 		/*
5935 		 * Put this page into the physical map.
5936 		 * We had to do the unlock above because pmap_enter
5937 		 * may cause other faults.  The page may be on
5938 		 * the pageout queues.  If the pageout daemon comes
5939 		 * across the page, it will remove it from the queues.
5940 		 */
5941 		if (fault_page_size < PAGE_SIZE) {
5942 			DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
5943 			assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
5944 			    fault_phys_offset < PAGE_SIZE),
5945 			    "0x%llx\n", (uint64_t)fault_phys_offset);
5946 		} else {
5947 			assertf(fault_phys_offset == 0,
5948 			    "0x%llx\n", (uint64_t)fault_phys_offset);
5949 		}
5950 		assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
5951 		assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5952 		if (caller_pmap) {
5953 			kr = vm_fault_enter(m,
5954 			    caller_pmap,
5955 			    caller_pmap_addr,
5956 			    fault_page_size,
5957 			    fault_phys_offset,
5958 			    prot,
5959 			    caller_prot,
5960 			    wired,
5961 			    change_wiring,
5962 			    wire_tag,
5963 			    &fault_info,
5964 			    NULL,
5965 			    &type_of_fault);
5966 		} else {
5967 			kr = vm_fault_enter(m,
5968 			    pmap,
5969 			    vaddr,
5970 			    fault_page_size,
5971 			    fault_phys_offset,
5972 			    prot,
5973 			    caller_prot,
5974 			    wired,
5975 			    change_wiring,
5976 			    wire_tag,
5977 			    &fault_info,
5978 			    NULL,
5979 			    &type_of_fault);
5980 		}
5981 		assert(VM_PAGE_OBJECT(m) == m_object);
5982 
5983 		{
5984 			int     event_code = 0;
5985 
5986 			if (m_object->internal) {
5987 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
5988 			} else if (m_object->object_is_shared_cache) {
5989 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
5990 			} else {
5991 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
5992 			}
5993 
5994 			KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid());
5995 			KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid());
5996 
5997 			DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
5998 		}
5999 		if (kr != KERN_SUCCESS) {
6000 			/* abort this page fault */
6001 			vm_map_unlock_read(map);
6002 			if (real_map != map) {
6003 				vm_map_unlock(real_map);
6004 			}
6005 			PAGE_WAKEUP_DONE(m);
6006 			vm_fault_cleanup(m_object, top_page);
6007 			vm_object_deallocate(object);
6008 			goto done;
6009 		}
6010 		if (physpage_p != NULL) {
6011 			/* for vm_map_wire_and_extract() */
6012 			*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6013 			if (prot & VM_PROT_WRITE) {
6014 				vm_object_lock_assert_exclusive(m_object);
6015 				m->vmp_dirty = TRUE;
6016 			}
6017 		}
6018 	} else {
6019 		vm_map_entry_t          entry;
6020 		vm_map_offset_t         laddr;
6021 		vm_map_offset_t         ldelta, hdelta;
6022 
6023 		/*
6024 		 * do a pmap block mapping from the physical address
6025 		 * in the object
6026 		 */
6027 
6028 		if (real_map != map) {
6029 			vm_map_unlock(real_map);
6030 		}
6031 
6032 		if (original_map != map) {
6033 			vm_map_unlock_read(map);
6034 			vm_map_lock_read(original_map);
6035 			map = original_map;
6036 		}
6037 		real_map = map;
6038 
6039 		laddr = vaddr;
6040 		hdelta = ldelta = (vm_map_offset_t)0xFFFFFFFFFFFFF000ULL;
6041 
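		/*
		 * Walk down any submaps and work out how far the block mapping
		 * can extend: ldelta/hdelta are clamped to the distance from
		 * vaddr to the start/end of each map entry we traverse, so the
		 * mapping never spans an entry boundary.
		 */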
6042 		while (vm_map_lookup_entry(map, laddr, &entry)) {
6043 			if (ldelta > (laddr - entry->vme_start)) {
6044 				ldelta = laddr - entry->vme_start;
6045 			}
6046 			if (hdelta > (entry->vme_end - laddr)) {
6047 				hdelta = entry->vme_end - laddr;
6048 			}
6049 			if (entry->is_sub_map) {
6050 				laddr = ((laddr - entry->vme_start)
6051 				    + VME_OFFSET(entry));
6052 				vm_map_lock_read(VME_SUBMAP(entry));
6053 
6054 				if (map != real_map) {
6055 					vm_map_unlock_read(map);
6056 				}
6057 				if (entry->use_pmap) {
6058 					vm_map_unlock_read(real_map);
6059 					real_map = VME_SUBMAP(entry);
6060 				}
6061 				map = VME_SUBMAP(entry);
6062 			} else {
6063 				break;
6064 			}
6065 		}
6066 
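		/*
		 * Only set up the block mapping if the entry we ended up on
		 * still maps the object we faulted on and is not a submap.
		 */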
6067 		if (vm_map_lookup_entry(map, laddr, &entry) &&
6068 		    (!entry->is_sub_map) &&
6069 		    (object != VM_OBJECT_NULL) &&
6070 		    (VME_OBJECT(entry) == object)) {
6071 			uint16_t superpage;
6072 
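			/*
			 * A physically contiguous object with no pager that is
			 * mapped in its entirety at a suitably aligned address
			 * can be entered as a superpage mapping.
			 */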
6073 			if (!object->pager_created &&
6074 			    object->phys_contiguous &&
6075 			    VME_OFFSET(entry) == 0 &&
6076 			    (entry->vme_end - entry->vme_start == object->vo_size) &&
6077 			    VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
6078 				superpage = VM_MEM_SUPERPAGE;
6079 			} else {
6080 				superpage = 0;
6081 			}
6082 
6083 			if (superpage && physpage_p) {
6084 				/* for vm_map_wire_and_extract() */
6085 				*physpage_p = (ppnum_t)
6086 				    ((((vm_map_offset_t)
6087 				    object->vo_shadow_offset)
6088 				    + VME_OFFSET(entry)
6089 				    + (laddr - entry->vme_start))
6090 				    >> PAGE_SHIFT);
6091 			}
6092 
6093 			if (caller_pmap) {
6094 				/*
6095 				 * Set up a block mapped area
6096 				 */
6097 				assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6098 				kr = pmap_map_block_addr(caller_pmap,
6099 				    (addr64_t)(caller_pmap_addr - ldelta),
6100 				    (pmap_paddr_t)(((vm_map_offset_t) (object->vo_shadow_offset)) +
6101 				    VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6102 				    (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6103 				    (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
6104 
6105 				if (kr != KERN_SUCCESS) {
6106 					goto cleanup;
6107 				}
6108 			} else {
6109 				/*
6110 				 * Set up a block mapped area
6111 				 */
6112 				assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6113 				kr = pmap_map_block_addr(real_map->pmap,
6114 				    (addr64_t)(vaddr - ldelta),
6115 				    (pmap_paddr_t)(((vm_map_offset_t)(object->vo_shadow_offset)) +
6116 				    VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6117 				    (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6118 				    (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
6119 
6120 				if (kr != KERN_SUCCESS) {
6121 					goto cleanup;
6122 				}
6123 			}
6124 		}
6125 	}
6126 
6127 	/*
6128 	 * Success
6129 	 */
6130 	kr = KERN_SUCCESS;
6131 
6132 	/*
6133 	 * TODO: could most of the done cases just use cleanup?
6134 	 */
6135 cleanup:
6136 	/*
6137 	 * Unlock everything, and return
6138 	 */
6139 	vm_map_unlock_read(map);
6140 	if (real_map != map) {
6141 		vm_map_unlock(real_map);
6142 	}
6143 
6144 	if (m != VM_PAGE_NULL) {
6145 		assert(VM_PAGE_OBJECT(m) == m_object);
6146 
6147 		if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
6148 			vm_object_paging_begin(m_object);
6149 
6150 			assert(written_on_object == VM_OBJECT_NULL);
6151 			written_on_object = m_object;
6152 			written_on_pager = m_object->pager;
6153 			written_on_offset = m_object->paging_offset + m->vmp_offset;
6154 		}
6155 		PAGE_WAKEUP_DONE(m);
6156 
6157 		vm_fault_cleanup(m_object, top_page);
6158 	} else {
6159 		vm_fault_cleanup(object, top_page);
6160 	}
6161 
6162 	vm_object_deallocate(object);
6163 
6164 #undef  RELEASE_PAGE
6165 
6166 done:
6167 	thread_interrupt_level(interruptible_state);
6168 
6169 	if (resilient_media_object != VM_OBJECT_NULL) {
6170 		assert(resilient_media_retry);
6171 		assert(resilient_media_offset != (vm_object_offset_t)-1);
6172 		/* release extra reference on failed object */
6173 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6174 		vm_object_lock_assert_notheld(resilient_media_object);
6175 		vm_object_deallocate(resilient_media_object);
6176 		resilient_media_object = VM_OBJECT_NULL;
6177 		resilient_media_offset = (vm_object_offset_t)-1;
6178 		resilient_media_retry = false;
6179 		vm_fault_resilient_media_release++;
6180 	}
6181 	assert(!resilient_media_retry);
6182 
6183 	/*
6184 	 * Only I/O throttle on faults which cause a pagein/swapin.
6185 	 */
6186 	if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
6187 		throttle_lowpri_io(1);
6188 	} else {
6189 		if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
6190 			if ((throttle_delay = vm_page_throttled(TRUE))) {
6191 				if (vm_debug_events) {
6192 					if (type_of_fault == DBG_COMPRESSOR_FAULT) {
6193 						VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6194 					} else if (type_of_fault == DBG_COW_FAULT) {
6195 						VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6196 					} else {
6197 						VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6198 					}
6199 				}
6200 				__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
6201 			}
6202 		}
6203 	}
6204 
6205 	if (written_on_object) {
6206 		vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
6207 
6208 		vm_object_lock(written_on_object);
6209 		vm_object_paging_end(written_on_object);
6210 		vm_object_unlock(written_on_object);
6211 
6212 		written_on_object = VM_OBJECT_NULL;
6213 	}
6214 
6215 	if (rtfault) {
6216 		vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
6217 	}
6218 
6219 	KDBG_RELEASE(
6220 		(MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
6221 		((uint64_t)trace_vaddr >> 32),
6222 		trace_vaddr,
6223 		kr,
6224 		vm_fault_type_for_tracing(need_copy_on_read, type_of_fault));
6225 
6226 	if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
6227 		DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6228 	}
6229 
6230 	return kr;
6231 }
6232 
6233 /*
6234  *	vm_fault_wire:
6235  *
6236  *	Wire down a range of virtual addresses in a map.
6237  */
6238 kern_return_t
6239 vm_fault_wire(
6240 	vm_map_t        map,
6241 	vm_map_entry_t  entry,
6242 	vm_prot_t       prot,
6243 	vm_tag_t        wire_tag,
6244 	pmap_t          pmap,
6245 	vm_map_offset_t pmap_addr,
6246 	ppnum_t         *physpage_p)
6247 {
6248 	vm_map_offset_t va;
6249 	vm_map_offset_t end_addr = entry->vme_end;
6250 	kern_return_t   rc;
6251 	vm_map_size_t   effective_page_size;
6252 
6253 	assert(entry->in_transition);
6254 
6255 	if (!entry->is_sub_map &&
6256 	    VME_OBJECT(entry) != VM_OBJECT_NULL &&
6257 	    VME_OBJECT(entry)->phys_contiguous) {
6258 		return KERN_SUCCESS;
6259 	}
6260 
6261 	/*
6262 	 *	Inform the physical mapping system that the
6263 	 *	range of addresses may not fault, so that
6264 	 *	page tables and such can be locked down as well.
6265 	 */
6266 
6267 	pmap_pageable(pmap, pmap_addr,
6268 	    pmap_addr + (end_addr - entry->vme_start), FALSE);
6269 
6270 	/*
6271 	 *	We simulate a fault to get the page and enter it
6272 	 *	in the physical map.
6273 	 */
6274 
6275 	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6276 	for (va = entry->vme_start;
6277 	    va < end_addr;
6278 	    va += effective_page_size) {
6279 		rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
6280 		    pmap_addr + (va - entry->vme_start),
6281 		    physpage_p);
6282 		if (rc != KERN_SUCCESS) {
6283 			rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
6284 			    ((pmap == kernel_pmap)
6285 			    ? THREAD_UNINT
6286 			    : THREAD_ABORTSAFE),
6287 			    pmap,
6288 			    (pmap_addr +
6289 			    (va - entry->vme_start)),
6290 			    physpage_p);
6291 			DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
6292 		}
6293 
6294 		if (rc != KERN_SUCCESS) {
6295 			struct vm_map_entry     tmp_entry = *entry;
6296 
6297 			/* unwire wired pages */
6298 			tmp_entry.vme_end = va;
6299 			vm_fault_unwire(map,
6300 			    &tmp_entry, FALSE, pmap, pmap_addr);
6301 
6302 			return rc;
6303 		}
6304 	}
6305 	return KERN_SUCCESS;
6306 }
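/*
 * Illustrative sketch (not part of the original source): a wiring path such
 * as vm_map_wire_nested() -- the caller name is assumed here purely for
 * illustration -- marks the entry "in_transition", drops the map lock, and
 * then calls something like:
 *
 *	rc = vm_fault_wire(map, entry, VM_PROT_DEFAULT, tag,
 *	    map->pmap, entry->vme_start, NULL);
 *
 * A non-KERN_SUCCESS return means the loop above has already unwound the
 * pages it managed to wire, so the caller only needs to propagate the error.
 */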
6307 
6308 /*
6309  *	vm_fault_unwire:
6310  *
6311  *	Unwire a range of virtual addresses in a map.
6312  */
6313 void
6314 vm_fault_unwire(
6315 	vm_map_t        map,
6316 	vm_map_entry_t  entry,
6317 	boolean_t       deallocate,
6318 	pmap_t          pmap,
6319 	vm_map_offset_t pmap_addr)
6320 {
6321 	vm_map_offset_t va;
6322 	vm_map_offset_t end_addr = entry->vme_end;
6323 	vm_object_t             object;
6324 	struct vm_object_fault_info fault_info = {};
6325 	unsigned int    unwired_pages;
6326 	vm_map_size_t   effective_page_size;
6327 
6328 	object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
6329 
6330 	/*
6331 	 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
6332 	 * do anything since such memory is wired by default.  So we don't have
6333 	 * anything to undo here.
6334 	 */
6335 
6336 	if (object != VM_OBJECT_NULL && object->phys_contiguous) {
6337 		return;
6338 	}
6339 
6340 	fault_info.interruptible = THREAD_UNINT;
6341 	fault_info.behavior = entry->behavior;
6342 	fault_info.user_tag = VME_ALIAS(entry);
6343 	if (entry->iokit_acct ||
6344 	    (!entry->is_sub_map && !entry->use_pmap)) {
6345 		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
6346 	}
6347 	fault_info.lo_offset = VME_OFFSET(entry);
6348 	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
6349 	fault_info.no_cache = entry->no_cache;
6350 	fault_info.stealth = TRUE;
6351 
6352 	unwired_pages = 0;
6353 
6354 	/*
6355 	 *	Since the pages are wired down, we must be able to
6356 	 *	get their mappings from the physical map system.
6357 	 */
6358 
6359 	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6360 	for (va = entry->vme_start;
6361 	    va < end_addr;
6362 	    va += effective_page_size) {
6363 		if (object == VM_OBJECT_NULL) {
6364 			if (pmap) {
6365 				pmap_change_wiring(pmap,
6366 				    pmap_addr + (va - entry->vme_start), FALSE);
6367 			}
6368 			(void) vm_fault(map, va, VM_PROT_NONE,
6369 			    TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
6370 		} else {
6371 			vm_prot_t       prot;
6372 			vm_page_t       result_page;
6373 			vm_page_t       top_page;
6374 			vm_object_t     result_object;
6375 			vm_fault_return_t result;
6376 
6377 			/* cap cluster size at maximum UPL size */
6378 			upl_size_t cluster_size;
6379 			if (os_sub_overflow(end_addr, va, &cluster_size)) {
6380 				cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6381 			}
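			/*
			 * Illustrative note (not in the original source): when
			 * the remaining size does not fit in the 32-bit
			 * upl_size_t, "0 - PAGE_SIZE" wraps to the largest
			 * page-aligned 32-bit value, e.g. with 16K pages
			 * 0 - 0x4000 == 0xffffc000.
			 */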
6382 			fault_info.cluster_size = cluster_size;
6383 
6384 			do {
6385 				prot = VM_PROT_NONE;
6386 
6387 				vm_object_lock(object);
6388 				vm_object_paging_begin(object);
6389 				result_page = VM_PAGE_NULL;
6390 				result = vm_fault_page(
6391 					object,
6392 					(VME_OFFSET(entry) +
6393 					(va - entry->vme_start)),
6394 					VM_PROT_NONE, TRUE,
6395 					FALSE, /* page not looked up */
6396 					&prot, &result_page, &top_page,
6397 					(int *)0,
6398 					NULL, map->no_zero_fill,
6399 					&fault_info);
6400 			} while (result == VM_FAULT_RETRY);
6401 
6402 			/*
6403 			 * If this was a mapping to a file on a device that has been forcibly
6404 			 * unmounted, then we won't get a page back from vm_fault_page().  Just
6405 			 * move on to the next one in case the remaining pages are mapped from
6406 			 * different objects.  During a forced unmount, the object is terminated
6407 			 * so the alive flag will be false if this happens.  A forced unmount will
6408 			 * will occur when an external disk is unplugged before the user does an
6409 			 * occur when an external disk is unplugged before the user does an
6410 			 */
6411 
6412 			if (result == VM_FAULT_MEMORY_ERROR) {
6413 				if (!object->alive) {
6414 					continue;
6415 				}
6416 				if (!object->internal && object->pager == NULL) {
6417 					continue;
6418 				}
6419 			}
6420 
6421 			if (result == VM_FAULT_MEMORY_ERROR &&
6422 			    object == kernel_object) {
6423 				/*
6424 				 * This must have been allocated with
6425 				 * KMA_KOBJECT and KMA_VAONLY and there's
6426 				 * no physical page at this offset.
6427 				 * We're done (no page to free).
6428 				 */
6429 				assert(deallocate);
6430 				continue;
6431 			}
6432 
6433 			if (result != VM_FAULT_SUCCESS) {
6434 				panic("vm_fault_unwire: failure");
6435 			}
6436 
6437 			result_object = VM_PAGE_OBJECT(result_page);
6438 
6439 			if (deallocate) {
6440 				assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
6441 				    vm_page_fictitious_addr);
6442 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
6443 				if (VM_PAGE_WIRED(result_page)) {
6444 					unwired_pages++;
6445 				}
6446 				VM_PAGE_FREE(result_page);
6447 			} else {
6448 				if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
6449 					pmap_change_wiring(pmap,
6450 					    pmap_addr + (va - entry->vme_start), FALSE);
6451 				}
6452 
6453 
6454 				if (VM_PAGE_WIRED(result_page)) {
6455 					vm_page_lockspin_queues();
6456 					vm_page_unwire(result_page, TRUE);
6457 					vm_page_unlock_queues();
6458 					unwired_pages++;
6459 				}
6460 				if (entry->zero_wired_pages) {
6461 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
6462 					entry->zero_wired_pages = FALSE;
6463 				}
6464 
6465 				PAGE_WAKEUP_DONE(result_page);
6466 			}
6467 			vm_fault_cleanup(result_object, top_page);
6468 		}
6469 	}
6470 
6471 	/*
6472 	 *	Inform the physical mapping system that the range
6473 	 *	of addresses may fault, so that page tables and
6474 	 *	such may be unwired themselves.
6475 	 */
6476 
6477 	pmap_pageable(pmap, pmap_addr,
6478 	    pmap_addr + (end_addr - entry->vme_start), TRUE);
6479 
6480 	if (kernel_object == object) {
6481 		/*
6482 		 * Would like to make user_tag in vm_object_fault_info
6483 		 * vm_tag_t (unsigned short) but user_tag derives its value from
6484 		 * VME_ALIAS(entry) at a few places and VME_ALIAS, in turn, casts
6485 		 * to an _unsigned int_ which is used by non-fault_info paths throughout the
6486 	 * to an _unsigned int_, which non-fault_info paths use in many
6487 	 * places throughout the code.
6488 		 * So, for now, an explicit truncation to unsigned short (vm_tag_t).
6489 		 */
6490 		assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
6491 		    "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
6492 		vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages));
6493 	}
6494 }
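/*
 * Illustrative note (not part of the original source): the negative delta
 * above shrinks the kernel tag's wired accounting by the amount just
 * unwired, e.g. unwired_pages == 3 with 16K pages is -ptoa_64(3) == -0xc000
 * bytes.
 */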
6495 
6496 /*
6497  *	vm_fault_wire_fast:
6498  *
6499  *	Handle common case of a wire down page fault at the given address.
6500  *	If successful, the page is inserted into the associated physical map.
6501  *	The map entry is passed in to avoid the overhead of a map lookup.
6502  *
6503  *	NOTE: the given address should be truncated to the
6504  *	proper page address.
6505  *
6506  *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
6507  *	a standard error specifying why the fault is fatal is returned.
6508  *
6509  *	The map in question must be referenced, and remains so.
6510  *	Caller has a read lock on the map.
6511  *
6512  *	This is a stripped version of vm_fault() for wiring pages.  Anything
6513  *	other than the common case will return KERN_FAILURE, and the caller
6514  *	is expected to call vm_fault().
6515  */
6516 static kern_return_t
6517 vm_fault_wire_fast(
6518 	__unused vm_map_t       map,
6519 	vm_map_offset_t va,
6520 	__unused vm_prot_t       caller_prot,
6521 	vm_tag_t        wire_tag,
6522 	vm_map_entry_t  entry,
6523 	pmap_t          pmap,
6524 	vm_map_offset_t pmap_addr,
6525 	ppnum_t         *physpage_p)
6526 {
6527 	vm_object_t             object;
6528 	vm_object_offset_t      offset;
6529 	vm_page_t               m;
6530 	vm_prot_t               prot;
6531 	thread_t                thread = current_thread();
6532 	int                     type_of_fault;
6533 	kern_return_t           kr;
6534 	vm_map_size_t           fault_page_size;
6535 	vm_map_offset_t         fault_phys_offset;
6536 	struct vm_object_fault_info fault_info = {};
6537 
6538 	counter_inc(&vm_statistics_faults);
6539 
6540 	if (thread != THREAD_NULL) {
6541 		counter_inc(&get_threadtask(thread)->faults);
6542 	}
6543 
6544 /*
6545  *	Recovery actions
6546  */
6547 
6548 #undef  RELEASE_PAGE
6549 #define RELEASE_PAGE(m) {                               \
6550 	PAGE_WAKEUP_DONE(m);                            \
6551 	vm_page_lockspin_queues();                      \
6552 	vm_page_unwire(m, TRUE);                        \
6553 	vm_page_unlock_queues();                        \
6554 }
6555 
6556 
6557 #undef  UNLOCK_THINGS
6558 #define UNLOCK_THINGS   {                               \
6559 	vm_object_paging_end(object);                      \
6560 	vm_object_unlock(object);                          \
6561 }
6562 
6563 #undef  UNLOCK_AND_DEALLOCATE
6564 #define UNLOCK_AND_DEALLOCATE   {                       \
6565 	UNLOCK_THINGS;                                  \
6566 	vm_object_deallocate(object);                   \
6567 }
6568 /*
6569  *	Give up and have caller do things the hard way.
6570  */
6571 
6572 #define GIVE_UP {                                       \
6573 	UNLOCK_AND_DEALLOCATE;                          \
6574 	return(KERN_FAILURE);                           \
6575 }
6576 
6577 
6578 	/*
6579 	 *	If this entry is not directly to a vm_object, bail out.
6580 	 */
6581 	if (entry->is_sub_map) {
6582 		assert(physpage_p == NULL);
6583 		return KERN_FAILURE;
6584 	}
6585 
6586 	/*
6587 	 *	Find the backing store object and offset into it.
6588 	 */
6589 
6590 	object = VME_OBJECT(entry);
6591 	offset = (va - entry->vme_start) + VME_OFFSET(entry);
6592 	prot = entry->protection;
6593 
6594 	/*
6595 	 *	Make a reference to this object to prevent its
6596 	 *	disposal while we are messing with it.
6597 	 */
6598 
6599 	vm_object_lock(object);
6600 	vm_object_reference_locked(object);
6601 	vm_object_paging_begin(object);
6602 
6603 	/*
6604 	 *	INVARIANTS (through entire routine):
6605 	 *
6606 	 *	1)	At all times, we must either have the object
6607 	 *		lock or a busy page in some object to prevent
6608 	 *		some other thread from trying to bring in
6609 	 *		the same page.
6610 	 *
6611 	 *	2)	Once we have a busy page, we must remove it from
6612 	 *		the pageout queues, so that the pageout daemon
6613 	 *		will not grab it away.
6614 	 *
6615 	 */
6616 
6617 	/*
6618 	 *	Look for page in top-level object.  If it's not there or
6619 	 *	there's something going on, give up.
6620 	 */
6621 	m = vm_page_lookup(object, vm_object_trunc_page(offset));
6622 	if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
6623 	    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
6624 		GIVE_UP;
6625 	}
6626 	if (m->vmp_fictitious &&
6627 	    VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
6628 		/*
6629 		 * Guard pages are fictitious pages and are never
6630 		 * entered into a pmap, so let's say it's been wired...
6631 		 */
6632 		kr = KERN_SUCCESS;
6633 		goto done;
6634 	}
6635 
6636 	/*
6637 	 *	Wire the page down now.  All bail outs beyond this
6638 	 *	point must unwire the page.
6639 	 */
6640 
6641 	vm_page_lockspin_queues();
6642 	vm_page_wire(m, wire_tag, TRUE);
6643 	vm_page_unlock_queues();
6644 
6645 	/*
6646 	 *	Mark page busy for other threads.
6647 	 */
6648 	assert(!m->vmp_busy);
6649 	m->vmp_busy = TRUE;
6650 	assert(!m->vmp_absent);
6651 
6652 	/*
6653 	 *	Give up if the page is being written and there's a copy object
6654 	 */
6655 	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
6656 		RELEASE_PAGE(m);
6657 		GIVE_UP;
6658 	}
6659 
6660 	fault_info.user_tag = VME_ALIAS(entry);
6661 	fault_info.pmap_options = 0;
6662 	if (entry->iokit_acct ||
6663 	    (!entry->is_sub_map && !entry->use_pmap)) {
6664 		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
6665 	}
6666 
6667 	fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6668 	fault_phys_offset = offset - vm_object_trunc_page(offset);
6669 
6670 	/*
6671 	 *	Put this page into the physical map.
6672 	 */
6673 	type_of_fault = DBG_CACHE_HIT_FAULT;
6674 	assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
6675 	assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6676 	kr = vm_fault_enter(m,
6677 	    pmap,
6678 	    pmap_addr,
6679 	    fault_page_size,
6680 	    fault_phys_offset,
6681 	    prot,
6682 	    prot,
6683 	    TRUE,                  /* wired */
6684 	    FALSE,                 /* change_wiring */
6685 	    wire_tag,
6686 	    &fault_info,
6687 	    NULL,
6688 	    &type_of_fault);
6689 	if (kr != KERN_SUCCESS) {
6690 		RELEASE_PAGE(m);
6691 		GIVE_UP;
6692 	}
6693 
6694 done:
6695 	/*
6696 	 *	Unlock everything, and return
6697 	 */
6698 
6699 	if (physpage_p) {
6700 		/* for vm_map_wire_and_extract() */
6701 		if (kr == KERN_SUCCESS) {
6702 			assert(object == VM_PAGE_OBJECT(m));
6703 			*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6704 			if (prot & VM_PROT_WRITE) {
6705 				vm_object_lock_assert_exclusive(object);
6706 				m->vmp_dirty = TRUE;
6707 			}
6708 		} else {
6709 			*physpage_p = 0;
6710 		}
6711 	}
6712 
6713 	PAGE_WAKEUP_DONE(m);
6714 	UNLOCK_AND_DEALLOCATE;
6715 
6716 	return kr;
6717 }
6718 
6719 /*
6720  *	Routine:	vm_fault_copy_cleanup
6721  *	Purpose:
6722  *		Release a page used by vm_fault_copy.
6723  */
6724 
6725 static void
6726 vm_fault_copy_cleanup(
6727 	vm_page_t       page,
6728 	vm_page_t       top_page)
6729 {
6730 	vm_object_t     object = VM_PAGE_OBJECT(page);
6731 
6732 	vm_object_lock(object);
6733 	PAGE_WAKEUP_DONE(page);
6734 	if (!VM_PAGE_PAGEABLE(page)) {
6735 		vm_page_lockspin_queues();
6736 		if (!VM_PAGE_PAGEABLE(page)) {
6737 			vm_page_activate(page);
6738 		}
6739 		vm_page_unlock_queues();
6740 	}
6741 	vm_fault_cleanup(object, top_page);
6742 }
6743 
6744 static void
6745 vm_fault_copy_dst_cleanup(
6746 	vm_page_t       page)
6747 {
6748 	vm_object_t     object;
6749 
6750 	if (page != VM_PAGE_NULL) {
6751 		object = VM_PAGE_OBJECT(page);
6752 		vm_object_lock(object);
6753 		vm_page_lockspin_queues();
6754 		vm_page_unwire(page, TRUE);
6755 		vm_page_unlock_queues();
6756 		vm_object_paging_end(object);
6757 		vm_object_unlock(object);
6758 	}
6759 }
6760 
6761 /*
6762  *	Routine:	vm_fault_copy
6763  *
6764  *	Purpose:
6765  *		Copy pages from one virtual memory object to another --
6766  *		neither the source nor destination pages need be resident.
6767  *
6768  *		Before actually copying a page, the version associated with
6769  *		the destination address map will be verified.
6770  *
6771  *	In/out conditions:
6772  *		The caller must hold a reference, but not a lock, to
6773  *		each of the source and destination objects and to the
6774  *		destination map.
6775  *
6776  *	Results:
6777  *		Returns KERN_SUCCESS if no errors were encountered in
6778  *		reading or writing the data.  Returns KERN_INTERRUPTED if
6779  *		the operation was interrupted (only possible if the
6780  *		"interruptible" argument is asserted).  Other return values
6781  *		indicate a permanent error in copying the data.
6782  *
6783  *		The actual amount of data copied will be returned in the
6784  *		"copy_size" argument.  In the event that the destination map
6785  *		verification failed, this amount may be less than the amount
6786  *		requested.
6787  */
6788 kern_return_t
6789 vm_fault_copy(
6790 	vm_object_t             src_object,
6791 	vm_object_offset_t      src_offset,
6792 	vm_map_size_t           *copy_size,             /* INOUT */
6793 	vm_object_t             dst_object,
6794 	vm_object_offset_t      dst_offset,
6795 	vm_map_t                dst_map,
6796 	vm_map_version_t         *dst_version,
6797 	int                     interruptible)
6798 {
6799 	vm_page_t               result_page;
6800 
6801 	vm_page_t               src_page;
6802 	vm_page_t               src_top_page;
6803 	vm_prot_t               src_prot;
6804 
6805 	vm_page_t               dst_page;
6806 	vm_page_t               dst_top_page;
6807 	vm_prot_t               dst_prot;
6808 
6809 	vm_map_size_t           amount_left;
6810 	vm_object_t             old_copy_object;
6811 	vm_object_t             result_page_object = NULL;
6812 	kern_return_t           error = 0;
6813 	vm_fault_return_t       result;
6814 
6815 	vm_map_size_t           part_size;
6816 	struct vm_object_fault_info fault_info_src = {};
6817 	struct vm_object_fault_info fault_info_dst = {};
6818 
6819 	/*
6820 	 * In order not to confuse the clustered pageins, align
6821 	 * the different offsets on a page boundary.
6822 	 */
6823 
6824 #define RETURN(x)                                       \
6825 	MACRO_BEGIN                                     \
6826 	*copy_size -= amount_left;                      \
6827 	MACRO_RETURN(x);                                \
6828 	MACRO_END
6829 
6830 	amount_left = *copy_size;
6831 
6832 	fault_info_src.interruptible = interruptible;
6833 	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
6834 	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
6835 	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
6836 	fault_info_src.stealth = TRUE;
6837 
6838 	fault_info_dst.interruptible = interruptible;
6839 	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
6840 	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
6841 	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
6842 	fault_info_dst.stealth = TRUE;
6843 
6844 	do { /* while (amount_left > 0) */
6845 		/*
6846 		 * There may be a deadlock if both source and destination
6847 		 * pages are the same. To avoid this deadlock, the copy must
6848 		 * start by getting the destination page in order to apply
6849 		 * COW semantics if any.
6850 		 */
6851 
6852 RetryDestinationFault:;
6853 
6854 		dst_prot = VM_PROT_WRITE | VM_PROT_READ;
6855 
6856 		vm_object_lock(dst_object);
6857 		vm_object_paging_begin(dst_object);
6858 
6859 		/* cap cluster size at maximum UPL size */
6860 		upl_size_t cluster_size;
6861 		if (os_convert_overflow(amount_left, &cluster_size)) {
6862 			cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6863 		}
6864 		fault_info_dst.cluster_size = cluster_size;
6865 
6866 		dst_page = VM_PAGE_NULL;
6867 		result = vm_fault_page(dst_object,
6868 		    vm_object_trunc_page(dst_offset),
6869 		    VM_PROT_WRITE | VM_PROT_READ,
6870 		    FALSE,
6871 		    FALSE,                    /* page not looked up */
6872 		    &dst_prot, &dst_page, &dst_top_page,
6873 		    (int *)0,
6874 		    &error,
6875 		    dst_map->no_zero_fill,
6876 		    &fault_info_dst);
6877 		switch (result) {
6878 		case VM_FAULT_SUCCESS:
6879 			break;
6880 		case VM_FAULT_RETRY:
6881 			goto RetryDestinationFault;
6882 		case VM_FAULT_MEMORY_SHORTAGE:
6883 			if (vm_page_wait(interruptible)) {
6884 				goto RetryDestinationFault;
6885 			}
6886 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_COPY_MEMORY_SHORTAGE), 0 /* arg */);
6887 			OS_FALLTHROUGH;
6888 		case VM_FAULT_INTERRUPTED:
6889 			RETURN(MACH_SEND_INTERRUPTED);
6890 		case VM_FAULT_SUCCESS_NO_VM_PAGE:
6891 			/* success but no VM page: fail the copy */
6892 			vm_object_paging_end(dst_object);
6893 			vm_object_unlock(dst_object);
6894 			OS_FALLTHROUGH;
6895 		case VM_FAULT_MEMORY_ERROR:
6896 			if (error) {
6897 				return error;
6898 			} else {
6899 				return KERN_MEMORY_ERROR;
6900 			}
6901 		default:
6902 			panic("vm_fault_copy: unexpected error 0x%x from "
6903 			    "vm_fault_page()\n", result);
6904 		}
6905 		assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
6906 
6907 		assert(dst_object == VM_PAGE_OBJECT(dst_page));
6908 		old_copy_object = dst_object->copy;
6909 
6910 		/*
6911 		 * There exists the possibility that the source and
6912 		 * destination page are the same.  But we can't
6913 		 * easily determine that now.  If they are the
6914 		 * same, the call to vm_fault_page() for the
6915 		 * destination page will deadlock.  To prevent this we
6916 		 * wire the page so we can drop busy without having
6917 		 * the page daemon steal the page.  We clean up the
6918 		 * top page but keep the paging reference on the object
6919 		 * holding the dest page so it doesn't go away.
6920 		 */
6921 
6922 		vm_page_lockspin_queues();
6923 		vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
6924 		vm_page_unlock_queues();
6925 		PAGE_WAKEUP_DONE(dst_page);
6926 		vm_object_unlock(dst_object);
6927 
6928 		if (dst_top_page != VM_PAGE_NULL) {
6929 			vm_object_lock(dst_object);
6930 			VM_PAGE_FREE(dst_top_page);
6931 			vm_object_paging_end(dst_object);
6932 			vm_object_unlock(dst_object);
6933 		}
6934 
6935 RetrySourceFault:;
6936 
6937 		if (src_object == VM_OBJECT_NULL) {
6938 			/*
6939 			 *	No source object.  We will just
6940 			 *	zero-fill the page in dst_object.
6941 			 */
6942 			src_page = VM_PAGE_NULL;
6943 			result_page = VM_PAGE_NULL;
6944 		} else {
6945 			vm_object_lock(src_object);
6946 			src_page = vm_page_lookup(src_object,
6947 			    vm_object_trunc_page(src_offset));
6948 			if (src_page == dst_page) {
6949 				src_prot = dst_prot;
6950 				result_page = VM_PAGE_NULL;
6951 			} else {
6952 				src_prot = VM_PROT_READ;
6953 				vm_object_paging_begin(src_object);
6954 
6955 				/* cap cluster size at maximum UPL size */
6956 				if (os_convert_overflow(amount_left, &cluster_size)) {
6957 					cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6958 				}
6959 				fault_info_src.cluster_size = cluster_size;
6960 
6961 				result_page = VM_PAGE_NULL;
6962 				result = vm_fault_page(
6963 					src_object,
6964 					vm_object_trunc_page(src_offset),
6965 					VM_PROT_READ, FALSE,
6966 					FALSE, /* page not looked up */
6967 					&src_prot,
6968 					&result_page, &src_top_page,
6969 					(int *)0, &error, FALSE,
6970 					&fault_info_src);
6971 
6972 				switch (result) {
6973 				case VM_FAULT_SUCCESS:
6974 					break;
6975 				case VM_FAULT_RETRY:
6976 					goto RetrySourceFault;
6977 				case VM_FAULT_MEMORY_SHORTAGE:
6978 					if (vm_page_wait(interruptible)) {
6979 						goto RetrySourceFault;
6980 					}
6981 					OS_FALLTHROUGH;
6982 				case VM_FAULT_INTERRUPTED:
6983 					vm_fault_copy_dst_cleanup(dst_page);
6984 					RETURN(MACH_SEND_INTERRUPTED);
6985 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
6986 					/* success but no VM page: fail */
6987 					vm_object_paging_end(src_object);
6988 					vm_object_unlock(src_object);
6989 					OS_FALLTHROUGH;
6990 				case VM_FAULT_MEMORY_ERROR:
6991 					vm_fault_copy_dst_cleanup(dst_page);
6992 					if (error) {
6993 						return error;
6994 					} else {
6995 						return KERN_MEMORY_ERROR;
6996 					}
6997 				default:
6998 					panic("vm_fault_copy(2): unexpected "
6999 					    "error 0x%x from "
7000 					    "vm_fault_page()\n", result);
7001 				}
7002 
7003 				result_page_object = VM_PAGE_OBJECT(result_page);
7004 				assert((src_top_page == VM_PAGE_NULL) ==
7005 				    (result_page_object == src_object));
7006 			}
7007 			assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
7008 			vm_object_unlock(result_page_object);
7009 		}
7010 
7011 		vm_map_lock_read(dst_map);
7012 
7013 		if (!vm_map_verify(dst_map, dst_version)) {
7014 			vm_map_unlock_read(dst_map);
7015 			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7016 				vm_fault_copy_cleanup(result_page, src_top_page);
7017 			}
7018 			vm_fault_copy_dst_cleanup(dst_page);
7019 			break;
7020 		}
7021 		assert(dst_object == VM_PAGE_OBJECT(dst_page));
7022 
7023 		vm_object_lock(dst_object);
7024 
7025 		if (dst_object->copy != old_copy_object) {
7026 			vm_object_unlock(dst_object);
7027 			vm_map_unlock_read(dst_map);
7028 			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7029 				vm_fault_copy_cleanup(result_page, src_top_page);
7030 			}
7031 			vm_fault_copy_dst_cleanup(dst_page);
7032 			break;
7033 		}
7034 		vm_object_unlock(dst_object);
7035 
7036 		/*
7037 		 *	Copy the page, and note that it is dirty
7038 		 *	immediately.
7039 		 */
7040 
7041 		if (!page_aligned(src_offset) ||
7042 		    !page_aligned(dst_offset) ||
7043 		    !page_aligned(amount_left)) {
7044 			vm_object_offset_t      src_po,
7045 			    dst_po;
7046 
7047 			src_po = src_offset - vm_object_trunc_page(src_offset);
7048 			dst_po = dst_offset - vm_object_trunc_page(dst_offset);
7049 
7050 			if (dst_po > src_po) {
7051 				part_size = PAGE_SIZE - dst_po;
7052 			} else {
7053 				part_size = PAGE_SIZE - src_po;
7054 			}
7055 			if (part_size > (amount_left)) {
7056 				part_size = amount_left;
7057 			}
7058 
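			/*
			 * Illustrative example (not in the original source),
			 * assuming a 4K PAGE_SIZE: src_po = 0x200 and
			 * dst_po = 0x600 give part_size = 0x1000 - 0x600 =
			 * 0xa00, i.e. the copy stops at whichever page
			 * boundary comes first.
			 */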
7059 			if (result_page == VM_PAGE_NULL) {
7060 				assert((vm_offset_t) dst_po == dst_po);
7061 				assert((vm_size_t) part_size == part_size);
7062 				vm_page_part_zero_fill(dst_page,
7063 				    (vm_offset_t) dst_po,
7064 				    (vm_size_t) part_size);
7065 			} else {
7066 				assert((vm_offset_t) src_po == src_po);
7067 				assert((vm_offset_t) dst_po == dst_po);
7068 				assert((vm_size_t) part_size == part_size);
7069 				vm_page_part_copy(result_page,
7070 				    (vm_offset_t) src_po,
7071 				    dst_page,
7072 				    (vm_offset_t) dst_po,
7073 				    (vm_size_t)part_size);
7074 				if (!dst_page->vmp_dirty) {
7075 					vm_object_lock(dst_object);
7076 					SET_PAGE_DIRTY(dst_page, TRUE);
7077 					vm_object_unlock(dst_object);
7078 				}
7079 			}
7080 		} else {
7081 			part_size = PAGE_SIZE;
7082 
7083 			if (result_page == VM_PAGE_NULL) {
7084 				vm_page_zero_fill(dst_page);
7085 			} else {
7086 				vm_object_lock(result_page_object);
7087 				vm_page_copy(result_page, dst_page);
7088 				vm_object_unlock(result_page_object);
7089 
7090 				if (!dst_page->vmp_dirty) {
7091 					vm_object_lock(dst_object);
7092 					SET_PAGE_DIRTY(dst_page, TRUE);
7093 					vm_object_unlock(dst_object);
7094 				}
7095 			}
7096 		}
7097 
7098 		/*
7099 		 *	Unlock everything, and return
7100 		 */
7101 
7102 		vm_map_unlock_read(dst_map);
7103 
7104 		if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7105 			vm_fault_copy_cleanup(result_page, src_top_page);
7106 		}
7107 		vm_fault_copy_dst_cleanup(dst_page);
7108 
7109 		amount_left -= part_size;
7110 		src_offset += part_size;
7111 		dst_offset += part_size;
7112 	} while (amount_left > 0);
7113 
7114 	RETURN(KERN_SUCCESS);
7115 #undef  RETURN
7116 
7117 	/*NOTREACHED*/
7118 }
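/*
 * Illustrative caller pattern (not part of the original source): a caller
 * that obtained "dst_version" from a map lookup would typically loop,
 * re-validating the map and retrying with the remaining size whenever the
 * returned *copy_size comes back short because vm_map_verify() failed.
 */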
7119 
7120 #if     VM_FAULT_CLASSIFY
7121 /*
7122  *	Temporary statistics gathering support.
7123  */
7124 
7125 /*
7126  *	Statistics arrays:
7127  */
7128 #define VM_FAULT_TYPES_MAX      5
7129 #define VM_FAULT_LEVEL_MAX      8
7130 
7131 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
7132 
7133 #define VM_FAULT_TYPE_ZERO_FILL 0
7134 #define VM_FAULT_TYPE_MAP_IN    1
7135 #define VM_FAULT_TYPE_PAGER     2
7136 #define VM_FAULT_TYPE_COPY      3
7137 #define VM_FAULT_TYPE_OTHER     4
7138 
7139 
7140 void
7141 vm_fault_classify(vm_object_t           object,
7142     vm_object_offset_t    offset,
7143     vm_prot_t             fault_type)
7144 {
7145 	int             type, level = 0;
7146 	vm_page_t       m;
7147 
7148 	while (TRUE) {
7149 		m = vm_page_lookup(object, offset);
7150 		if (m != VM_PAGE_NULL) {
7151 			if (m->vmp_busy || VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent) {
7152 				type = VM_FAULT_TYPE_OTHER;
7153 				break;
7154 			}
7155 			if (((fault_type & VM_PROT_WRITE) == 0) ||
7156 			    ((level == 0) && object->copy == VM_OBJECT_NULL)) {
7157 				type = VM_FAULT_TYPE_MAP_IN;
7158 				break;
7159 			}
7160 			type = VM_FAULT_TYPE_COPY;
7161 			break;
7162 		} else {
7163 			if (object->pager_created) {
7164 				type = VM_FAULT_TYPE_PAGER;
7165 				break;
7166 			}
7167 			if (object->shadow == VM_OBJECT_NULL) {
7168 				type = VM_FAULT_TYPE_ZERO_FILL;
7169 				break;
7170 			}
7171 
7172 			offset += object->vo_shadow_offset;
7173 			object = object->shadow;
7174 			level++;
7175 			continue;
7176 		}
7177 	}
7178 
7179 	if (level > VM_FAULT_LEVEL_MAX) {
7180 		level = VM_FAULT_LEVEL_MAX;
7181 	}
7182 
7183 	vm_fault_stats[type][level] += 1;
7184 
7185 	return;
7186 }
7187 
7188 /* cleanup routine to call from debugger */
7189 
7190 void
7191 vm_fault_classify_init(void)
7192 {
7193 	int type, level;
7194 
7195 	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
7196 		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
7197 			vm_fault_stats[type][level] = 0;
7198 		}
7199 	}
7200 
7201 	return;
7202 }
7203 #endif  /* VM_FAULT_CLASSIFY */
7204 
7205 vm_offset_t
7206 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
7207 {
7208 	vm_map_entry_t  entry;
7209 	vm_object_t     object;
7210 	vm_offset_t     object_offset;
7211 	vm_page_t       m;
7212 	int             compressor_external_state, compressed_count_delta;
7213 	int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
7214 	int             my_fault_type = VM_PROT_READ;
7215 	kern_return_t   kr;
7216 	int effective_page_mask, effective_page_size;
7217 
7218 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
7219 		effective_page_mask = VM_MAP_PAGE_MASK(map);
7220 		effective_page_size = VM_MAP_PAGE_SIZE(map);
7221 	} else {
7222 		effective_page_mask = PAGE_MASK;
7223 		effective_page_size = PAGE_SIZE;
7224 	}
7225 
7226 	if (not_in_kdp) {
7227 		panic("kdp_lightweight_fault called from outside of debugger context");
7228 	}
7229 
7230 	assert(map != VM_MAP_NULL);
7231 
7232 	assert((cur_target_addr & effective_page_mask) == 0);
7233 	if ((cur_target_addr & effective_page_mask) != 0) {
7234 		return 0;
7235 	}
7236 
7237 	if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
7238 		return 0;
7239 	}
7240 
7241 	if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
7242 		return 0;
7243 	}
7244 
7245 	if (entry->is_sub_map) {
7246 		return 0;
7247 	}
7248 
7249 	object = VME_OBJECT(entry);
7250 	if (object == VM_OBJECT_NULL) {
7251 		return 0;
7252 	}
7253 
7254 	object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
7255 
7256 	while (TRUE) {
7257 		if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
7258 			return 0;
7259 		}
7260 
7261 		if (object->pager_created && (object->paging_in_progress ||
7262 		    object->activity_in_progress)) {
7263 			return 0;
7264 		}
7265 
7266 		m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));
7267 
7268 		if (m != VM_PAGE_NULL) {
7269 			if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
7270 				return 0;
7271 			}
7272 
7273 			if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_cleaning ||
7274 			    m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
7275 				return 0;
7276 			}
7277 
7278 			assert(!m->vmp_private);
7279 			if (m->vmp_private) {
7280 				return 0;
7281 			}
7282 
7283 			assert(!m->vmp_fictitious);
7284 			if (m->vmp_fictitious) {
7285 				return 0;
7286 			}
7287 
7288 			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7289 			if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7290 				return 0;
7291 			}
7292 
7293 			return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
7294 		}
7295 
7296 		compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
7297 
7298 		if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
7299 			if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
7300 				kr = vm_compressor_pager_get(object->pager,
7301 				    vm_object_trunc_page(object_offset + object->paging_offset),
7302 				    kdp_compressor_decompressed_page_ppnum, &my_fault_type,
7303 				    compressor_flags, &compressed_count_delta);
7304 				if (kr == KERN_SUCCESS) {
7305 					return kdp_compressor_decompressed_page_paddr;
7306 				} else {
7307 					return 0;
7308 				}
7309 			}
7310 		}
7311 
7312 		if (object->shadow == VM_OBJECT_NULL) {
7313 			return 0;
7314 		}
7315 
7316 		object_offset += object->vo_shadow_offset;
7317 		object = object->shadow;
7318 	}
7319 }
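/*
 * Illustrative note (not part of the original source): the value returned
 * above is a physical address, e.g. ptoa() of physical page 0x12345 with
 * 16K pages is 0x12345 << 14 == 0x48d14000; a return of 0 means the page
 * could not be resolved without taking locks or doing I/O, neither of which
 * is allowed in debugger context.
 */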
7320 
7321 /*
7322  * vm_page_validate_cs_fast():
7323  * Performs a few quick checks to determine if the page's code signature
7324  * really needs to be fully validated.  It could:
7325  *	1. have been modified (i.e. automatically tainted),
7326  *	2. have already been validated,
7327  *	3. have already been found to be tainted,
7328  *	4. no longer have a backing store.
7329  * Returns FALSE if the page needs to be fully validated.
7330  */
7331 static boolean_t
7332 vm_page_validate_cs_fast(
7333 	vm_page_t       page,
7334 	vm_map_size_t   fault_page_size,
7335 	vm_map_offset_t fault_phys_offset)
7336 {
7337 	vm_object_t     object;
7338 
7339 	object = VM_PAGE_OBJECT(page);
7340 	vm_object_lock_assert_held(object);
7341 
7342 	if (page->vmp_wpmapped &&
7343 	    !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
7344 		/*
7345 		 * This page was mapped for "write" access sometime in the
7346 		 * past and could still be modifiable in the future.
7347 		 * Consider it tainted.
7348 		 * [ If the page was already found to be "tainted", no
7349 		 * need to re-validate. ]
7350 		 */
7351 		vm_object_lock_assert_exclusive(object);
7352 		VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
7353 		VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
7354 		if (cs_debug) {
7355 			printf("CODESIGNING: %s: "
7356 			    "page %p obj %p off 0x%llx "
7357 			    "was modified\n",
7358 			    __FUNCTION__,
7359 			    page, object, page->vmp_offset);
7360 		}
7361 		vm_cs_validated_dirtied++;
7362 	}
7363 
7364 	if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
7365 	    VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
7366 		return TRUE;
7367 	}
7368 	vm_object_lock_assert_exclusive(object);
7369 
7370 #if CHECK_CS_VALIDATION_BITMAP
7371 	kern_return_t kr;
7372 
7373 	kr = vnode_pager_cs_check_validation_bitmap(
7374 		object->pager,
7375 		page->vmp_offset + object->paging_offset,
7376 		CS_BITMAP_CHECK);
7377 	if (kr == KERN_SUCCESS) {
7378 		page->vmp_cs_validated = VMP_CS_ALL_TRUE;
7379 		page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
7380 		vm_cs_bitmap_validated++;
7381 		return TRUE;
7382 	}
7383 #endif /* CHECK_CS_VALIDATION_BITMAP */
7384 
7385 	if (!object->alive || object->terminating || object->pager == NULL) {
7386 		/*
7387 		 * The object is terminating and we don't have its pager
7388 		 * so we can't validate the data...
7389 		 */
7390 		return TRUE;
7391 	}
7392 
7393 	/* we need to really validate this page */
7394 	vm_object_lock_assert_exclusive(object);
7395 	return FALSE;
7396 }
7397 
7398 void
7399 vm_page_validate_cs_mapped_slow(
7400 	vm_page_t       page,
7401 	const void      *kaddr)
7402 {
7403 	vm_object_t             object;
7404 	memory_object_offset_t  mo_offset;
7405 	memory_object_t         pager;
7406 	struct vnode            *vnode;
7407 	int                     validated, tainted, nx;
7408 
7409 	assert(page->vmp_busy);
7410 	object = VM_PAGE_OBJECT(page);
7411 	vm_object_lock_assert_exclusive(object);
7412 
7413 	vm_cs_validates++;
7414 
7415 	/*
7416 	 * Since we get here to validate a page that was brought in by
7417 	 * the pager, we know that this pager is all set up and ready
7418 	 * by now.
7419 	 */
7420 	assert(object->code_signed);
7421 	assert(!object->internal);
7422 	assert(object->pager != NULL);
7423 	assert(object->pager_ready);
7424 
7425 	pager = object->pager;
7426 	assert(object->paging_in_progress);
7427 	vnode = vnode_pager_lookup_vnode(pager);
7428 	mo_offset = page->vmp_offset + object->paging_offset;
7429 
7430 	/* verify the SHA1 hash for this page */
7431 	validated = 0;
7432 	tainted = 0;
7433 	nx = 0;
7434 	cs_validate_page(vnode,
7435 	    pager,
7436 	    mo_offset,
7437 	    (const void *)((const char *)kaddr),
7438 	    &validated,
7439 	    &tainted,
7440 	    &nx);
7441 
7442 	page->vmp_cs_validated |= validated;
7443 	page->vmp_cs_tainted |= tainted;
7444 	page->vmp_cs_nx |= nx;
7445 
7446 #if CHECK_CS_VALIDATION_BITMAP
7447 	if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
7448 	    page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
7449 		vnode_pager_cs_check_validation_bitmap(object->pager,
7450 		    mo_offset,
7451 		    CS_BITMAP_SET);
7452 	}
7453 #endif /* CHECK_CS_VALIDATION_BITMAP */
7454 }
7455 
7456 void
7457 vm_page_validate_cs_mapped(
7458 	vm_page_t       page,
7459 	vm_map_size_t   fault_page_size,
7460 	vm_map_offset_t fault_phys_offset,
7461 	const void      *kaddr)
7462 {
7463 	if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7464 		vm_page_validate_cs_mapped_slow(page, kaddr);
7465 	}
7466 }
7467 
7468 static void
7469 vm_page_map_and_validate_cs(
7470 	vm_object_t     object,
7471 	vm_page_t       page)
7472 {
7473 	vm_object_offset_t      offset;
7474 	vm_map_offset_t         koffset;
7475 	vm_map_size_t           ksize;
7476 	vm_offset_t             kaddr;
7477 	kern_return_t           kr;
7478 	boolean_t               busy_page;
7479 	boolean_t               need_unmap;
7480 
7481 	vm_object_lock_assert_exclusive(object);
7482 
7483 	assert(object->code_signed);
7484 	offset = page->vmp_offset;
7485 
7486 	busy_page = page->vmp_busy;
7487 	if (!busy_page) {
7488 		/* keep page busy while we map (and unlock) the VM object */
7489 		page->vmp_busy = TRUE;
7490 	}
7491 
7492 	/*
7493 	 * Take a paging reference on the VM object
7494 	 * to protect it from collapse or bypass,
7495 	 * and keep it from disappearing too.
7496 	 */
7497 	vm_object_paging_begin(object);
7498 
7499 	/* map the page in the kernel address space */
7500 	ksize = PAGE_SIZE_64;
7501 	koffset = 0;
7502 	need_unmap = FALSE;
7503 	kr = vm_paging_map_object(page,
7504 	    object,
7505 	    offset,
7506 	    VM_PROT_READ,
7507 	    FALSE,                       /* can't unlock object ! */
7508 	    &ksize,
7509 	    &koffset,
7510 	    &need_unmap);
7511 	if (kr != KERN_SUCCESS) {
7512 		panic("%s: could not map page: 0x%x", __FUNCTION__, kr);
7513 	}
7514 	kaddr = CAST_DOWN(vm_offset_t, koffset);
7515 
7516 	/* validate the mapped page */
7517 	vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
7518 
7519 	assert(page->vmp_busy);
7520 	assert(object == VM_PAGE_OBJECT(page));
7521 	vm_object_lock_assert_exclusive(object);
7522 
7523 	if (!busy_page) {
7524 		PAGE_WAKEUP_DONE(page);
7525 	}
7526 	if (need_unmap) {
7527 		/* unmap the map from the kernel address space */
7528 		vm_paging_unmap_object(object, koffset, koffset + ksize);
7529 		koffset = 0;
7530 		ksize = 0;
7531 		kaddr = 0;
7532 	}
7533 	vm_object_paging_end(object);
7534 }
7535 
7536 void
7537 vm_page_validate_cs(
7538 	vm_page_t       page,
7539 	vm_map_size_t   fault_page_size,
7540 	vm_map_offset_t fault_phys_offset)
7541 {
7542 	vm_object_t             object;
7543 
7544 	object = VM_PAGE_OBJECT(page);
7545 	vm_object_lock_assert_held(object);
7546 
7547 	if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7548 		return;
7549 	}
7550 	vm_page_map_and_validate_cs(object, page);
7551 }
7552 
7553 void
7554 vm_page_validate_cs_mapped_chunk(
7555 	vm_page_t       page,
7556 	const void      *kaddr,
7557 	vm_offset_t     chunk_offset,
7558 	vm_size_t       chunk_size,
7559 	boolean_t       *validated_p,
7560 	unsigned        *tainted_p)
7561 {
7562 	vm_object_t             object;
7563 	vm_object_offset_t      offset, offset_in_page;
7564 	memory_object_t         pager;
7565 	struct vnode            *vnode;
7566 	boolean_t               validated;
7567 	unsigned                tainted;
7568 
7569 	*validated_p = FALSE;
7570 	*tainted_p = 0;
7571 
7572 	assert(page->vmp_busy);
7573 	object = VM_PAGE_OBJECT(page);
7574 	vm_object_lock_assert_exclusive(object);
7575 
7576 	assert(object->code_signed);
7577 	offset = page->vmp_offset;
7578 
7579 	if (!object->alive || object->terminating || object->pager == NULL) {
7580 		/*
7581 		 * The object is terminating and we don't have its pager
7582 		 * so we can't validate the data...
7583 		 */
7584 		return;
7585 	}
7586 	/*
7587 	 * Since we get here to validate a page that was brought in by
7588 	 * the pager, we know that this pager is all set up and ready
7589 	 * by now.
7590 	 */
7591 	assert(!object->internal);
7592 	assert(object->pager != NULL);
7593 	assert(object->pager_ready);
7594 
7595 	pager = object->pager;
7596 	assert(object->paging_in_progress);
7597 	vnode = vnode_pager_lookup_vnode(pager);
7598 
7599 	/* verify the signature for this chunk */
7600 	offset_in_page = chunk_offset;
7601 	assert(offset_in_page < PAGE_SIZE);
7602 
7603 	tainted = 0;
7604 	validated = cs_validate_range(vnode,
7605 	    pager,
7606 	    (object->paging_offset +
7607 	    offset +
7608 	    offset_in_page),
7609 	    (const void *)((const char *)kaddr
7610 	    + offset_in_page),
7611 	    chunk_size,
7612 	    &tainted);
7613 	if (validated) {
7614 		*validated_p = TRUE;
7615 	}
7616 	if (tainted) {
7617 		*tainted_p = tainted;
7618 	}
7619 }
7620 
7621 static void
7622 vm_rtfrecord_lock(void)
7623 {
7624 	lck_spin_lock(&vm_rtfr_slock);
7625 }
7626 
7627 static void
7628 vm_rtfrecord_unlock(void)
7629 {
7630 	lck_spin_unlock(&vm_rtfr_slock);
7631 }
7632 
7633 unsigned int
7634 vmrtfaultinfo_bufsz(void)
7635 {
7636 	return vmrtf_num_records * sizeof(vm_rtfault_record_t);
7637 }
7638 
7639 #include <kern/backtrace.h>
7640 
7641 __attribute__((noinline))
7642 static void
7643 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
7644 {
7645 	uint64_t fend = mach_continuous_time();
7646 
7647 	uint64_t cfpc = 0;
7648 	uint64_t ctid = cthread->thread_id;
7649 	uint64_t cupid = get_current_unique_pid();
7650 
7651 	uintptr_t bpc = 0;
7652 	errno_t btr = 0;
7653 
7654 	/*
7655 	 * Capture a single-frame backtrace.  This extracts just the program
7656 	 * counter at the point of the fault, and should not use copyin to get
7657 	 * Rosetta save state.
7658 	 */
7659 	struct backtrace_control ctl = {
7660 		.btc_user_thread = cthread,
7661 		.btc_user_copy = backtrace_user_copy_error,
7662 	};
7663 	unsigned int bfrs = backtrace_user(&bpc, 1U, &ctl, NULL);
7664 	if ((btr == 0) && (bfrs > 0)) {
7665 		cfpc = bpc;
7666 	}
7667 
7668 	assert((fstart != 0) && fend >= fstart);
7669 	vm_rtfrecord_lock();
7670 	assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
7671 
7672 	vmrtfrs.vmrtf_total++;
7673 	vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
7674 
7675 	cvmr->rtfabstime = fstart;
7676 	cvmr->rtfduration = fend - fstart;
7677 	cvmr->rtfaddr = fault_vaddr;
7678 	cvmr->rtfpc = cfpc;
7679 	cvmr->rtftype = type_of_fault;
7680 	cvmr->rtfupid = cupid;
7681 	cvmr->rtftid = ctid;
7682 
7683 	if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
7684 		vmrtfrs.vmrtfr_curi = 0;
7685 	}
7686 
7687 	vm_rtfrecord_unlock();
7688 }
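/*
 * Illustrative note (not part of the original source): vm_rtf_records acts
 * as a ring buffer -- vmrtfr_curi wraps back to 0 once it passes
 * vmrtfr_maxi -- so only the most recent faults are retained.
 */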
7689 
7690 int
7691 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
7692 {
7693 	vm_rtfault_record_t *cvmrd = vrecords;
7694 	size_t residue = vrecordsz;
7695 	size_t numextracted = 0;
7696 	boolean_t early_exit = FALSE;
7697 
7698 	vm_rtfrecord_lock();
7699 
7700 	for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
7701 		if (residue < sizeof(vm_rtfault_record_t)) {
7702 			early_exit = TRUE;
7703 			break;
7704 		}
7705 
7706 		if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
7707 #if     DEVELOPMENT || DEBUG
7708 			if (isroot == FALSE) {
7709 				continue;
7710 			}
7711 #else
7712 			continue;
7713 #endif /* DEVELOPMENT || DEBUG */
7714 		}
7715 
7716 		*cvmrd = vmrtfrs.vm_rtf_records[vmfi];
7717 		cvmrd++;
7718 		residue -= sizeof(vm_rtfault_record_t);
7719 		numextracted++;
7720 	}
7721 
7722 	vm_rtfrecord_unlock();
7723 
7724 	*vmrtfrv = numextracted;
7725 	return early_exit;
7726 }
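/*
 * Illustrative note (not part of the original source): the return value is
 * the "early_exit" flag -- nonzero when the caller's buffer ran out before
 * every record slot was examined -- and *vmrtfrv reports how many records
 * were actually copied out.
 */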
7727 
7728 /*
7729  * Only allow one diagnosis to be in flight at a time, to avoid
7730  * creating too much additional memory usage.
7731  */
7732 static volatile uint_t vmtc_diagnosing;
7733 unsigned int vmtc_total = 0;
7734 
7735 /*
7736  * Type used to update telemetry for the diagnosis counts.
7737  */
7738 CA_EVENT(vmtc_telemetry,
7739     CA_INT, vmtc_num_byte,            /* number of corrupt bytes found */
7740     CA_BOOL, vmtc_undiagnosed,        /* undiagnosed because more than 1 at a time */
7741     CA_BOOL, vmtc_not_eligible,       /* the page didn't qualify */
7742     CA_BOOL, vmtc_copyin_fail,        /* unable to copy in the page */
7743     CA_BOOL, vmtc_not_found,          /* no corruption found even though CS failed */
7744     CA_BOOL, vmtc_one_bit_flip,       /* single bit flip */
7745     CA_BOOL, vmtc_testing);           /* caused on purpose by testing */
7746 
7747 #if DEVELOPMENT || DEBUG
7748 /*
7749  * Buffers used to compare before/after page contents.
7750  * Stashed to aid when debugging crashes.
7751  */
7752 static size_t vmtc_last_buffer_size = 0;
7753 static uint64_t *vmtc_last_before_buffer = NULL;
7754 static uint64_t *vmtc_last_after_buffer = NULL;
7755 
7756 /*
7757  * Needed to record corruptions due to testing.
7758  */
7759 static uintptr_t corruption_test_va = 0;
7760 #endif /* DEVELOPMENT || DEBUG */
7761 
7762 /*
7763  * Stash a copy of data from a possibly corrupt page.
7764  */
7765 static uint64_t *
7766 vmtc_get_page_data(
7767 	vm_map_offset_t code_addr,
7768 	vm_page_t       page)
7769 {
7770 	uint64_t        *buffer = NULL;
7771 	addr64_t        buffer_paddr;
7772 	addr64_t        page_paddr;
7773 	extern void     bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes);
7774 	uint_t          size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
7775 
7776 	/*
7777 	 * Need an aligned buffer to do a physical copy.
7778 	 */
7779 	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)&buffer,
7780 	    size, size - 1, KMA_KOBJECT, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
7781 		return NULL;
7782 	}
7783 	buffer_paddr = kvtophys((vm_offset_t)buffer);
7784 	page_paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(page));
7785 
7786 	/* adjust the page start address if we need only 4K of a 16K page */
7787 	if (size < PAGE_SIZE) {
7788 		uint_t subpage_start = ((code_addr & (PAGE_SIZE - 1)) & ~(size - 1));
7789 		page_paddr += subpage_start;
7790 	}
7791 
7792 	bcopy_phys(page_paddr, buffer_paddr, size);
7793 	return buffer;
7794 }
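/*
 * Illustrative example (not part of the original source): with a 16K kernel
 * page and a 4K map page size, a code_addr whose offset within the 16K page
 * is 0x2a34 yields subpage_start = 0x2a34 & ~0xfff = 0x2000, so only the 4K
 * sub-page the task actually maps is copied for diagnosis.
 */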
7795 
7796 /*
7797  * Set things up so we can diagnose a potential text page corruption.
7798  */
7799 static uint64_t *
7800 vmtc_text_page_diagnose_setup(
7801 	vm_map_offset_t code_addr,
7802 	vm_page_t       page,
7803 	CA_EVENT_TYPE(vmtc_telemetry) *event)
7804 {
7805 	uint64_t        *buffer = NULL;
7806 
7807 	/*
7808 	 * If another is being diagnosed, skip this one.
7809 	 */
7810 	if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) {
7811 		event->vmtc_undiagnosed = true;
7812 		return NULL;
7813 	}
7814 
7815 	/*
7816 	 * Get the contents of the corrupt page.
7817 	 */
7818 	buffer = vmtc_get_page_data(code_addr, page);
7819 	if (buffer == NULL) {
7820 		event->vmtc_copyin_fail = true;
7821 		if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
7822 			panic("Bad compare and swap in setup!");
7823 		}
7824 		return NULL;
7825 	}
7826 	return buffer;
7827 }
7828 
7829 /*
7830  * Diagnose the text page by comparing its contents with
7831  * the one we've previously saved.
7832  */
7833 static void
7834 vmtc_text_page_diagnose(
7835 	vm_map_offset_t code_addr,
7836 	uint64_t        *old_code_buffer,
7837 	CA_EVENT_TYPE(vmtc_telemetry) *event)
7838 {
7839 	uint64_t        *new_code_buffer;
7840 	size_t          size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
7841 	uint_t          count = (uint_t)size / sizeof(uint64_t);
7842 	uint_t          diff_count = 0;
7843 	bool            bit_flip = false;
7844 	uint_t          b;
7845 	uint64_t        *new;
7846 	uint64_t        *old;
7847 
7848 	new_code_buffer = kalloc_data(size, Z_WAITOK);
7849 	assert(new_code_buffer != NULL);
7850 	if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) {
7851 		/* copyin error, so undo things */
7852 		event->vmtc_copyin_fail = true;
7853 		goto done;
7854 	}
7855 
7856 	new = new_code_buffer;
7857 	old = old_code_buffer;
7858 	for (; count-- > 0; ++new, ++old) {
7859 		if (*new == *old) {
7860 			continue;
7861 		}
7862 
7863 		/*
7864 		 * On first diff, check for a single bit flip
7865 		 */
7866 		if (diff_count == 0) {
7867 			uint64_t x = (*new ^ *old);
7868 			assert(x != 0);
7869 			if ((x & (x - 1)) == 0) {
7870 				bit_flip = true;
7871 				++diff_count;
7872 				continue;
7873 			}
7874 		}
7875 
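		/*
		 * Illustrative note (not in the original source): a nonzero x
		 * has exactly one bit set iff it is a power of two, in which
		 * case clearing its lowest set bit with (x & (x - 1)) yields
		 * 0, e.g. 0x0400 & 0x03ff == 0 (single flip) while
		 * 0x0500 & 0x04ff == 0x0400 (more than one bit differs).
		 */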
7876 		/*
7877 		 * count up the number of different bytes.
7878 		 */
7879 		for (b = 0; b < sizeof(uint64_t); ++b) {
7880 			char *n = (char *)new;
7881 			char *o = (char *)old;
7882 			if (n[b] != o[b]) {
7883 				++diff_count;
7884 			}
7885 		}
7886 	}
7887 
7888 	if (diff_count > 1) {
7889 		bit_flip = false;
7890 	}
7891 
7892 	if (diff_count == 0) {
7893 		event->vmtc_not_found = true;
7894 	} else {
7895 		event->vmtc_num_byte = diff_count;
7896 	}
7897 	if (bit_flip) {
7898 		event->vmtc_one_bit_flip = true;
7899 	}
7900 
7901 done:
7902 	/*
7903 	 * Free up the code copy buffers, but save the last
7904 	 * set on development / debug kernels in case they
7905 	 * can provide evidence for debugging memory stomps.
7906 	 */
7907 #if DEVELOPMENT || DEBUG
7908 	if (vmtc_last_before_buffer != NULL) {
7909 		kmem_free(kernel_map, (vm_offset_t)vmtc_last_before_buffer, vmtc_last_buffer_size);
7910 	}
7911 	if (vmtc_last_after_buffer != NULL) {
7912 		kfree_data(vmtc_last_after_buffer, vmtc_last_buffer_size);
7913 	}
7914 	vmtc_last_before_buffer = old_code_buffer;
7915 	vmtc_last_after_buffer = new_code_buffer;
7916 	vmtc_last_buffer_size = size;
7917 #else /* DEVELOPMENT || DEBUG */
7918 	kfree_data(new_code_buffer, size);
7919 	kmem_free(kernel_map, (vm_offset_t)old_code_buffer, size);
7920 #endif /* DEVELOPMENT || DEBUG */
7921 
7922 	/*
7923 	 * We're finished, so clear the diagnosing flag.
7924 	 */
7925 	if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
7926 		panic("Bad compare and swap in diagnose!");
7927 	}
7928 }
7929 
7930 /*
7931  * For the given map, virt address, find the object, offset, and page.
7932  * This has to lookup the map entry, verify protections, walk any shadow chains.
7933  * If found, returns with the object locked.
7934  */
7935 static kern_return_t
7936 vmtc_revalidate_lookup(
7937 	vm_map_t               map,
7938 	vm_map_offset_t        vaddr,
7939 	vm_object_t            *ret_object,
7940 	vm_object_offset_t     *ret_offset,
7941 	vm_page_t              *ret_page,
7942 	vm_prot_t              *ret_prot)
7943 {
7944 	vm_object_t            object;
7945 	vm_object_offset_t     offset;
7946 	vm_page_t              page;
7947 	kern_return_t          kr = KERN_SUCCESS;
7948 	uint8_t                object_lock_type = OBJECT_LOCK_EXCLUSIVE;
7949 	vm_map_version_t       version;
7950 	boolean_t              wired;
7951 	struct vm_object_fault_info fault_info = {};
7952 	vm_map_t               real_map = NULL;
7953 	vm_prot_t              prot;
7954 	vm_object_t            shadow;
7955 
7956 	/*
7957 	 * Find the object/offset for the given location/map.
7958 	 * Note this returns with the object locked.
7959 	 */
7960 restart:
7961 	vm_map_lock_read(map);
7962 	object = VM_OBJECT_NULL;        /* in case we come around the restart path */
7963 	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
7964 	    object_lock_type, &version, &object, &offset, &prot, &wired,
7965 	    &fault_info, &real_map, NULL);
7966 	vm_map_unlock_read(map);
7967 	if (real_map != NULL && real_map != map) {
7968 		vm_map_unlock(real_map);
7969 	}
7970 
7971 	/*
7972 	 * If there's no page here, fail.
7973 	 */
7974 	if (kr != KERN_SUCCESS || object == NULL) {
7975 		kr = KERN_FAILURE;
7976 		goto done;
7977 	}
7978 
7979 	/*
7980 	 * Chase down any shadow chains to find the actual page.
7981 	 */
7982 	for (;;) {
7983 		/*
7984 		 * See if the page is on the current object.
7985 		 */
7986 		page = vm_page_lookup(object, vm_object_trunc_page(offset));
7987 		if (page != NULL) {
7988 			/* restart the lookup */
7989 			if (page->vmp_restart) {
7990 				vm_object_unlock(object);
7991 				goto restart;
7992 			}
7993 
7994 			/*
7995 			 * If this page is busy, we need to wait for it.
7996 			 */
7997 			if (page->vmp_busy) {
7998 				PAGE_SLEEP(object, page, TRUE);
7999 				vm_object_unlock(object);
8000 				goto restart;
8001 			}
8002 			break;
8003 		}
8004 
8005 		/*
8006 		 * If the object doesn't have the page and
8007 		 * has no shadow, then we can quit.
8008 		 */
8009 		shadow = object->shadow;
8010 		if (shadow == NULL) {
8011 			kr = KERN_FAILURE;
8012 			goto done;
8013 		}
8014 
8015 		/*
8016 		 * Move to the next object
8017 		 */
8018 		offset += object->vo_shadow_offset;
8019 		vm_object_lock(shadow);
8020 		vm_object_unlock(object);
8021 		object = shadow;
8022 		shadow = VM_OBJECT_NULL;
8023 	}
8024 	*ret_object = object;
8025 	*ret_offset = vm_object_trunc_page(offset);
8026 	*ret_page = page;
8027 	*ret_prot = prot;
8028 
8029 done:
8030 	if (kr != KERN_SUCCESS && object != NULL) {
8031 		vm_object_unlock(object);
8032 	}
8033 	return kr;
8034 }
8035 
8036 /*
8037  * Check if a page is wired, needs extra locking.
8038  */
8039 static bool
8040 is_page_wired(vm_page_t page)
8041 {
8042 	bool result;
8043 	vm_page_lock_queues();
8044 	result = VM_PAGE_WIRED(page);
8045 	vm_page_unlock_queues();
8046 	return result;
8047 }
8048 
8049 /*
8050  * A fatal process error has occurred in the given task.
8051  * Recheck the code signing of the text page at the given
8052  * address to check for a text page corruption.
8053  *
8054  * Returns KERN_FAILURE if a page was found to be corrupt
8055  * by failing to match its code signature. KERN_SUCCESS
8056  * means the page is either valid or we don't have the
8057  * information to say it's corrupt.
8058  */
8059 kern_return_t
8060 revalidate_text_page(task_t task, vm_map_offset_t code_addr)
8061 {
8062 	kern_return_t          kr;
8063 	vm_map_t               map;
8064 	vm_object_t            object = NULL;
8065 	vm_object_offset_t     offset;
8066 	vm_page_t              page = NULL;
8067 	struct vnode           *vnode;
8068 	uint64_t               *diagnose_buffer = NULL;
8069 	CA_EVENT_TYPE(vmtc_telemetry) * event = NULL;
8070 	ca_event_t             ca_event = NULL;
8071 	vm_prot_t              prot;
8072 
8073 	map = task->map;
8074 	if (task->map == NULL) {
8075 		return KERN_SUCCESS;
8076 	}
8077 
8078 	kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page, &prot);
8079 	if (kr != KERN_SUCCESS) {
8080 		goto done;
8081 	}
8082 
8083 	/*
8084 	 * The page must be executable.
8085 	 */
8086 	if (!(prot & VM_PROT_EXECUTE)) {
8087 		goto done;
8088 	}
8089 
8090 	/*
8091 	 * The object needs to have a pager.
8092 	 */
8093 	if (object->pager == NULL) {
8094 		goto done;
8095 	}
8096 
8097 	/*
8098 	 * Needs to be a vnode backed page to have a signature.
8099 	 */
8100 	vnode = vnode_pager_lookup_vnode(object->pager);
8101 	if (vnode == NULL) {
8102 		goto done;
8103 	}
8104 
8105 	/*
8106 	 * Object checks to see if we should proceed.
8107 	 * Check the object to see whether we should proceed.
8108 	if (!object->code_signed ||     /* no code signature to check */
8109 	    object->internal ||         /* internal objects aren't signed */
8110 	    object->terminating ||      /* the object and its pages are already going away */
8111 	    !object->pager_ready) {     /* this shouldn't happen, but the check doesn't hurt */
8112 		goto done;
8113 	}
8114 
8115 
8116 	/*
8117 	 * Check the code signature of the page in question.
8118 	 */
8119 	vm_page_map_and_validate_cs(object, page);
8120 
8121 	/*
8122 	 * At this point:
8123 	 * vmp_cs_validated |= validated (set if a code signature exists)
8124 	 * vmp_cs_tainted |= tainted (set if code signature violation)
8125 	 * vmp_cs_nx |= nx;  ??
8126 	 *
8127 	 * if vmp_pmapped then have to pmap_disconnect..
8128 	 * other flags to check on object or page?
8129 	 */
8130 	if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
8131 #if DEBUG || DEVELOPMENT
8132 		/*
8133 		 * On development builds, a boot-arg can be used to cause
8134 		 * a panic, instead of a quiet repair.
8135 		 */
8136 		if (vmtc_panic_instead) {
8137 			panic("Text page corruption detected: vm_page_t 0x%llx", (long long)(uintptr_t)page);
8138 		}
8139 #endif /* DEBUG || DEVELOPMENT */
8140 
8141 		/*
8142 		 * We're going to invalidate this page. Grab a copy of it for comparison.
8143 		 */
8144 		ca_event = CA_EVENT_ALLOCATE(vmtc_telemetry);
8145 		event = ca_event->data;
8146 		diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr, page, event);
8147 
8148 		/*
8149 		 * Invalidate, i.e. toss, the corrupted page.
8150 		 */
8151 		if (!page->vmp_cleaning &&
8152 		    !page->vmp_laundry &&
8153 		    !page->vmp_fictitious &&
8154 		    !page->vmp_precious &&
8155 		    !page->vmp_absent &&
8156 		    !VMP_ERROR_GET(page) &&
8157 		    !page->vmp_dirty &&
8158 		    !is_page_wired(page)) {
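			/*
			 * pmap_disconnect() removes every pmap mapping of the
			 * physical page and returns its collected referenced/
			 * modified bits; those are folded back into the vm_page_t
			 * below before the final dirty re-check decides whether
			 * the page can be freed.
			 */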
8159 			if (page->vmp_pmapped) {
8160 				int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
8161 				if (refmod & VM_MEM_MODIFIED) {
8162 					SET_PAGE_DIRTY(page, FALSE);
8163 				}
8164 				if (refmod & VM_MEM_REFERENCED) {
8165 					page->vmp_reference = TRUE;
8166 				}
8167 			}
8168 			/* If the page seems intentionally modified, don't trash it. */
8169 			if (!page->vmp_dirty) {
8170 				VM_PAGE_FREE(page);
8171 			} else {
8172 				event->vmtc_not_eligible = true;
8173 			}
8174 		} else {
8175 			event->vmtc_not_eligible = true;
8176 		}
8177 		vm_object_unlock(object);
8178 		object = VM_OBJECT_NULL;
8179 
8180 		/*
8181 		 * Now try to diagnose the type of failure by faulting
8182 		 * in a new copy and diff'ing it with what we saved.
8183 		 */
8184 		if (diagnose_buffer != NULL) {
8185 			vmtc_text_page_diagnose(code_addr, diagnose_buffer, event);
8186 		}
8187 #if DEBUG || DEVELOPMENT
8188 		if (corruption_test_va != 0) {
8189 			corruption_test_va = 0;
8190 			event->vmtc_testing = true;
8191 		}
8192 #endif /* DEBUG || DEVELOPMENT */
8193 		ktriage_record(thread_tid(current_thread()),
8194 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_TEXT_CORRUPTION),
8195 		    0 /* arg */);
8196 		CA_EVENT_SEND(ca_event);
8197 		printf("Text page corruption detected for pid %d\n", proc_selfpid());
8198 		++vmtc_total;
8199 		return KERN_FAILURE; /* failure means we definitely found a corrupt page */
8200 	}
8201 done:
8202 	if (object != NULL) {
8203 		vm_object_unlock(object);
8204 	}
8205 	return KERN_SUCCESS;
8206 }
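/*
 * Editor's note: a minimal caller-side sketch for revalidate_text_page(),
 * assuming a fatal-error handling path that already knows the task and the
 * faulting code address (both names here are hypothetical):
 *
 *	if (revalidate_text_page(task, (vm_map_offset_t)fault_addr) == KERN_FAILURE) {
 *		// The text page failed its code-signature check: the corrupt
 *		// copy has already been tossed, counted (vmtc_total) and
 *		// reported via CoreAnalytics/ktriage by revalidate_text_page().
 *	}
 */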
8207 
8208 #if DEBUG || DEVELOPMENT
8209 /*
8210  * For implementing unit tests - ask the pmap to corrupt a text page.
8211  * We have to find the page, to get the physical address, then invoke
8212  * the pmap.
8213  */
8214 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
8215 
8216 kern_return_t
8217 vm_corrupt_text_addr(uintptr_t va)
8218 {
8219 	task_t                 task = current_task();
8220 	vm_map_t               map;
8221 	kern_return_t          kr = KERN_SUCCESS;
8222 	vm_object_t            object = VM_OBJECT_NULL;
8223 	vm_object_offset_t     offset;
8224 	vm_page_t              page = NULL;
8225 	pmap_paddr_t           pa;
8226 	vm_prot_t              prot;
8227 
8228 	map = task->map;
8229 	if (task->map == NULL) {
8230 		printf("corrupt_text_addr: no map\n");
8231 		return KERN_FAILURE;
8232 	}
8233 
8234 	kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page, &prot);
8235 	if (kr != KERN_SUCCESS) {
8236 		printf("corrupt_text_addr: page lookup failed\n");
8237 		return kr;
8238 	}
8239 	if (!(prot & VM_PROT_EXECUTE)) {
8240 		printf("corrupt_text_addr: page not executable\n");
8241 		return KERN_FAILURE;
8242 	}
8243 
8244 	/* get the physical address to use */
8245 	pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));
8246 
8247 	/*
8248 	 * Check we have something we can work with.
8249 	 * Due to racing with pageout as we enter the sysctl,
8250 	 * it's theoretically possible to have the page disappear, just
8251 	 * before the lookup.
8252 	 *
8253 	 * That's unlikely to happen often. I've filed radar 72857482
8254 	 * to bubble up the error here to the sysctl result and have the
8255 	 * test not FAIL in that case.
8256 	 */
8257 	if (page->vmp_busy) {
8258 		printf("corrupt_text_addr: vmp_busy\n");
8259 		kr = KERN_FAILURE;
8260 	}
8261 	if (page->vmp_cleaning) {
8262 		printf("corrupt_text_addr: vmp_cleaning\n");
8263 		kr = KERN_FAILURE;
8264 	}
8265 	if (page->vmp_laundry) {
8266 		printf("corrupt_text_addr: vmp_laundry\n");
8267 		kr = KERN_FAILURE;
8268 	}
8269 	if (page->vmp_fictitious) {
8270 		printf("corrupt_text_addr: vmp_fictitious\n");
8271 		kr = KERN_FAILURE;
8272 	}
8273 	if (page->vmp_precious) {
8274 		printf("corrupt_text_addr: vmp_precious\n");
8275 		kr = KERN_FAILURE;
8276 	}
8277 	if (page->vmp_absent) {
8278 		printf("corrupt_text_addr: vmp_absent\n");
8279 		kr = KERN_FAILURE;
8280 	}
8281 	if (VMP_ERROR_GET(page)) {
8282 		printf("corrupt_text_addr: vmp_error\n");
8283 		kr = KERN_FAILURE;
8284 	}
8285 	if (page->vmp_dirty) {
8286 		printf("corrupt_text_addr: vmp_dirty\n");
8287 		kr = KERN_FAILURE;
8288 	}
8289 	if (is_page_wired(page)) {
8290 		printf("corrupt_text_addr: wired\n");
8291 		kr = KERN_FAILURE;
8292 	}
8293 	if (!page->vmp_pmapped) {
8294 		printf("corrupt_text_addr: !vmp_pmapped\n");
8295 		kr = KERN_FAILURE;
8296 	}
8297 
8298 	if (kr == KERN_SUCCESS) {
8299 		printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
8300 		kr = pmap_test_text_corruption(pa);
8301 		if (kr != KERN_SUCCESS) {
8302 			printf("corrupt_text_addr: pmap error %d\n", kr);
8303 		} else {
8304 			corruption_test_va = va;
8305 		}
8306 	} else {
8307 		printf("corrupt_text_addr: object %p\n", object);
8308 		printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
8309 		printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
8310 		printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
8311 		printf("corrupt_text_addr: vm_page_t %p\n", page);
8312 		printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
8313 		printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
8314 	}
8315 
8316 	if (object != VM_OBJECT_NULL) {
8317 		vm_object_unlock(object);
8318 	}
8319 	return kr;
8320 }
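/*
 * Editor's note: a minimal sketch of how a DEBUG/DEVELOPMENT test might drive
 * vm_corrupt_text_addr().  The surrounding comments only say the call is made
 * while entering a sysctl, so the handler and 'va' below are hypothetical.
 *
 *	kern_return_t kr = vm_corrupt_text_addr((uintptr_t)va);
 *	if (kr == KERN_SUCCESS) {
 *		// The pmap has corrupted the physical copy of the text page;
 *		// a later crash in that range should be detected and repaired
 *		// by revalidate_text_page(), which tags the telemetry event
 *		// with vmtc_testing because corruption_test_va is now set.
 *	}
 */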
8321 
8322 #endif /* DEBUG || DEVELOPMENT */
8323