xref: /xnu-12377.81.4/osfmk/vm/vm_fault.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm_fault.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *
62  *	Page fault handling module.
63  */
64 
65 #include <libkern/OSAtomic.h>
66 
67 #include <mach/mach_types.h>
68 #include <mach/kern_return.h>
69 #include <mach/message.h>       /* for error codes */
70 #include <mach/vm_param.h>
71 #include <mach/vm_behavior.h>
72 #include <mach/memory_object.h>
73 /* For memory_object_data_{request,unlock} */
74 #include <mach/sdt.h>
75 
76 #include <kern/kern_types.h>
77 #include <kern/host_statistics.h>
78 #include <kern/counter.h>
79 #include <kern/task.h>
80 #include <kern/thread.h>
81 #include <kern/sched_prim.h>
82 #include <kern/host.h>
83 #include <kern/mach_param.h>
84 #include <kern/macro_help.h>
85 #include <kern/zalloc_internal.h>
86 #include <kern/misc_protos.h>
87 #include <kern/policy_internal.h>
88 #include <kern/exc_guard.h>
89 
90 #include <vm/vm_compressor_internal.h>
91 #include <vm/vm_compressor_pager_internal.h>
92 #include <vm/vm_dyld_pager_internal.h>
93 #include <vm/vm_fault_internal.h>
94 #include <vm/vm_map_internal.h>
95 #include <vm/vm_object_internal.h>
96 #include <vm/vm_page_internal.h>
97 #include <vm/vm_kern_internal.h>
98 #include <vm/pmap.h>
99 #include <vm/vm_pageout_internal.h>
100 #include <vm/vm_protos_internal.h>
101 #include <vm/vm_external.h>
102 #include <vm/memory_object.h>
103 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
104 #include <vm/vm_shared_region.h>
105 #include <vm/vm_page_internal.h>
106 #if HAS_MTE
107 #include <vm/vm_mteinfo_internal.h>
108 #include <vm/vm_memtag.h>
109 #endif /* HAS_MTE */
110 
111 #include <sys/codesign.h>
112 #include <sys/code_signing.h>
113 #include <sys/kdebug.h>
114 #include <sys/kdebug_triage.h>
115 #include <sys/reason.h>
116 #include <sys/signalvar.h>
117 
118 #include <san/kasan.h>
119 #include <libkern/coreanalytics/coreanalytics.h>
120 
121 #define VM_FAULT_CLASSIFY       0
122 
123 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
124 
125 int vm_protect_privileged_from_untrusted = 1;
126 
127 /*
128  * Enforce a maximum number of concurrent PageIns per vm-object to prevent
129  * high-I/O-volume tasks from saturating storage and starving the rest of the
130  * system.
131  *
132  * TODO: This throttling mechanism may be more naturally done by the pager,
133  * filesystem, or storage layers, which will have better information about how
134  * much concurrency the backing store can reasonably support.
135  */
136 TUNABLE(uint16_t, vm_object_pagein_throttle, "vm_object_pagein_throttle", 16);
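/*
 * Illustrative note (assumption, not from the original comment): TUNABLE()
 * registers "vm_object_pagein_throttle" as a boot-arg, so the per-object
 * page-in concurrency cap could be lowered on a test system with a boot
 * argument such as vm_object_pagein_throttle=8; the value above is only the
 * default.
 */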
137 
138 /*
139  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control, which
140  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
141  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
142  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
143  * keep the UI active so that the user has a chance to kill the offending task before the system
144  * completely hangs.
145  *
146  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
147  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
148  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
149  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
150  */
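/*
 * The policy described above is implemented by NEED_TO_HARD_THROTTLE_THIS_TASK()
 * below, using the vm_hard_throttle_threshold computed in vm_fault_init(); the
 * delays applied are the HARD_THROTTLE_DELAY and SOFT_THROTTLE_DELAY constants
 * defined further down in this file.
 */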
151 
152 extern void throttle_lowpri_io(int);
153 
154 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
155 
156 uint64_t vm_hard_throttle_threshold;
157 
158 #if DEBUG || DEVELOPMENT
159 static bool vmtc_panic_instead = false;
160 int panic_object_not_alive = 1;
161 #endif /* DEBUG || DEVELOPMENT */
162 
163 OS_ALWAYS_INLINE
164 boolean_t
165 NEED_TO_HARD_THROTTLE_THIS_TASK(void)
166 {
167 	return vm_wants_task_throttled(current_task()) ||
168 	       ((vm_page_free_count < vm_page_throttle_limit ||
169 	       HARD_THROTTLE_LIMIT_REACHED()) &&
170 	       proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
171 }
172 
173 
174 /*
175  * XXX: For now, vm faults cannot be recursively disabled. If the need for
176  * nested code that disables faults arises, the implementation can be modified
177  * to track a disabled-count.
178  */
179 
180 OS_ALWAYS_INLINE
181 void
182 vm_fault_disable(void)
183 {
184 	thread_t t = current_thread();
185 	assert(!t->th_vm_faults_disabled);
186 	t->th_vm_faults_disabled = true;
187 	act_set_debug_assert();
188 }
189 
190 OS_ALWAYS_INLINE
191 void
192 vm_fault_enable(void)
193 {
194 	thread_t t = current_thread();
195 	assert(t->th_vm_faults_disabled);
196 	t->th_vm_faults_disabled = false;
197 }
198 
199 OS_ALWAYS_INLINE
200 bool
201 vm_fault_get_disabled(void)
202 {
203 	thread_t t = current_thread();
204 	return t->th_vm_faults_disabled;
205 }
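/*
 * Hypothetical usage sketch (not a real call site): because there is no
 * nesting count, callers must pair these strictly and must not nest them.
 *
 *	vm_fault_disable();
 *	... code that must not take a VM fault on this thread ...
 *	assert(vm_fault_get_disabled());
 *	vm_fault_enable();
 */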
206 
207 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
208 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
209 
210 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
211 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
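/*
 * With the defaults above, vm_page_throttled() starts considering a thread
 * for throttling only after it has created more than
 * 6 * 20000 = 120,000 pages while free memory is constrained; the throttle
 * then applies while the thread's average creation rate works out to
 * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC or more (or the sampling window is
 * still shorter than the period).  See vm_page_throttled() below.
 */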
212 
213 
214 #define VM_STAT_DECOMPRESSIONS()        \
215 MACRO_BEGIN                             \
216 	counter_inc(&vm_statistics_decompressions); \
217 	current_thread()->decompressions++; \
218 MACRO_END
219 
220 boolean_t current_thread_aborted(void);
221 
222 /* Forward declarations of internal routines. */
223 static kern_return_t vm_fault_wire_fast(
224 	vm_map_t        map,
225 	vm_map_offset_t va,
226 	vm_prot_t       prot,
227 	vm_tag_t        wire_tag,
228 	vm_map_entry_t  entry,
229 	pmap_t          pmap,
230 	vm_map_offset_t pmap_addr,
231 	ppnum_t         *physpage_p);
232 
233 static kern_return_t vm_fault_internal(
234 	vm_map_t               map,
235 	vm_map_offset_t        vaddr,
236 	vm_prot_t              caller_prot,
237 	vm_tag_t               wire_tag,
238 	pmap_t                 pmap,
239 	vm_map_offset_t        pmap_addr,
240 	ppnum_t                *physpage_p,
241 	vm_object_fault_info_t fault_info);
242 
243 static void vm_fault_copy_cleanup(
244 	vm_page_t       page,
245 	vm_page_t       top_page);
246 
247 static void vm_fault_copy_dst_cleanup(
248 	vm_page_t       page);
249 
250 #if     VM_FAULT_CLASSIFY
251 extern void vm_fault_classify(vm_object_t       object,
252     vm_object_offset_t    offset,
253     vm_prot_t             fault_type);
254 
255 extern void vm_fault_classify_init(void);
256 #endif
257 
258 unsigned long vm_pmap_enter_blocked = 0;
259 unsigned long vm_pmap_enter_retried = 0;
260 
261 unsigned long vm_cs_validates = 0;
262 unsigned long vm_cs_revalidates = 0;
263 unsigned long vm_cs_query_modified = 0;
264 unsigned long vm_cs_validated_dirtied = 0;
265 unsigned long vm_cs_bitmap_validated = 0;
266 
267 #if CODE_SIGNING_MONITOR
268 uint64_t vm_cs_defer_to_csm = 0;
269 uint64_t vm_cs_defer_to_csm_not = 0;
270 #endif /* CODE_SIGNING_MONITOR */
271 
272 extern char *kdp_compressor_decompressed_page;
273 extern addr64_t kdp_compressor_decompressed_page_paddr;
274 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
275 
276 struct vmrtfr {
277 	int vmrtfr_maxi;
278 	int vmrtfr_curi;
279 	int64_t vmrtf_total;
280 	vm_rtfault_record_t *vm_rtf_records;
281 } vmrtfrs;
282 #define VMRTF_DEFAULT_BUFSIZE (4096)
283 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
284 TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);
285 
286 static void vm_rtfrecord_lock(void);
287 static void vm_rtfrecord_unlock(void);
288 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
289 
290 extern lck_grp_t vm_page_lck_grp_bucket;
291 extern lck_attr_t vm_page_lck_attr;
292 LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
293 
294 #if DEVELOPMENT || DEBUG
295 extern int madvise_free_debug;
296 extern int madvise_free_debug_sometimes;
297 #endif /* DEVELOPMENT || DEBUG */
298 
299 extern int vm_pageout_protect_realtime;
300 
301 #if CONFIG_FREEZE
302 #endif /* CONFIG_FREEZE */
303 
304 /*
305  *	Routine:	vm_fault_init
306  *	Purpose:
307  *		Initialize our private data structures.
308  */
309 __startup_func
310 void
311 vm_fault_init(void)
312 {
313 	int i, vm_compressor_temp;
314 	boolean_t need_default_val = TRUE;
315 	/*
316 	 * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
317 	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
318 	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
319 	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
320 	 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
321 	 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
322 	 */
323 
324 	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
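	/*
	 * Worked example (illustrative only): with sane_size == 8GB the
	 * percentage is 35 - MIN(8, 25) = 27, so vm_hard_throttle_threshold
	 * comes out at roughly 2.16GB; any configuration with 25GB of RAM or
	 * more bottoms out at the 10% floor.
	 */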
325 
326 	/*
327 	 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
328 	 */
329 
330 	if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
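		/*
		 * Accept the boot-arg only when it matches a single supported
		 * mode bit, i.e. (vm_compressor_temp & (1 << i)) == vm_compressor_temp
		 * for some i < VM_PAGER_MAX_MODES; values with extra bits set
		 * are rejected and reported below.
		 */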
331 		for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
332 			if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
333 				need_default_val = FALSE;
334 				vm_compressor_mode = vm_compressor_temp;
335 				break;
336 			}
337 		}
338 		if (need_default_val) {
339 			printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
340 		}
341 	}
342 #if CONFIG_FREEZE
343 	if (need_default_val) {
344 		if (osenvironment_is_diagnostics() || osenvironment_is_device_recovery()) {
345 			printf("osenvironment == \"diagnostics or device-recovery\". Setting \"vm_compressor_mode\" to in-core compressor only\n");
346 			vm_compressor_mode = VM_PAGER_COMPRESSOR_NO_SWAP;
347 			need_default_val = false;
348 		}
349 	}
350 #endif /* CONFIG_FREEZE */
351 	if (need_default_val) {
352 		/* If no boot arg or incorrect boot arg, try device tree. */
353 		PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
354 	}
355 	printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
356 	vm_config_init();
357 
358 	PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
359 	    &vm_protect_privileged_from_untrusted,
360 	    sizeof(vm_protect_privileged_from_untrusted));
361 
362 #if DEBUG || DEVELOPMENT
363 	(void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));
364 
365 	if (kern_feature_override(KF_MADVISE_FREE_DEBUG_OVRD)) {
366 		madvise_free_debug = 0;
367 		madvise_free_debug_sometimes = 0;
368 	}
369 
370 	PE_parse_boot_argn("panic_object_not_alive", &panic_object_not_alive, sizeof(panic_object_not_alive));
371 #endif /* DEBUG || DEVELOPMENT */
372 }
373 
374 __startup_func
375 static void
376 vm_rtfault_record_init(void)
377 {
378 	size_t size;
379 
380 	vmrtf_num_records = MAX(vmrtf_num_records, 1);
381 	size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
382 	vmrtfrs.vm_rtf_records = zalloc_permanent_tag(size,
383 	    ZALIGN(vm_rtfault_record_t), VM_KERN_MEMORY_DIAG);
384 	vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
385 }
386 STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
387 
388 /*
389  *	Routine:	vm_fault_cleanup
390  *	Purpose:
391  *		Clean up the result of vm_fault_page.
392  *	Results:
393  *		The paging reference for "object" is released.
394  *		"object" is unlocked.
395  *		If "top_page" is not null,  "top_page" is
396  *		freed and the paging reference for the object
397  *		containing it is released.
398  *
399  *	In/out conditions:
400  *		"object" must be locked.
401  */
402 void
403 vm_fault_cleanup(
404 	vm_object_t     object,
405 	vm_page_t       top_page)
406 {
407 	thread_pri_floor_t token = {
408 		.thread = THREAD_NULL
409 	};
410 	if (top_page != VM_PAGE_NULL &&
411 	    top_page->vmp_busy) {
412 		/*
413 		 * We busied the top page. Apply a priority floor before dropping the
414 		 * current object (and therefore the rw-lock boost) to avoid
415 		 * inversions due to another thread sleeping on the top-level page.
416 		 *
417 		 * TODO: Register a page-worker token when busying the top-level page instead
418 		 * (rdar://154313767)
419 		 */
420 		token = thread_priority_floor_start();
421 	}
422 
423 	vm_object_paging_end(object);
424 	vm_object_unlock(object);
425 
426 	if (top_page != VM_PAGE_NULL) {
427 		object = VM_PAGE_OBJECT(top_page);
428 
429 		vm_object_lock(object);
430 		VM_PAGE_FREE(top_page);
431 		vm_object_paging_end(object);
432 		vm_object_unlock(object);
433 	}
434 	if (token.thread != THREAD_NULL) {
435 		thread_priority_floor_end(&token);
436 	}
437 }
438 
439 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
440 
441 
442 TUNABLE(bool, vm_page_deactivate_behind, "vm_deactivate_behind", true);
443 TUNABLE(uint32_t, vm_page_deactivate_behind_min_resident_ratio, "vm_deactivate_behind_min_resident_ratio", 3);
444 /*
445  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
446  */
447 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
448 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
449                                                                 /* we use it to size an array on the stack */
450 
451 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
452 
453 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
454 
455 /*
456  * vm_page_is_sequential
457  *
458  * Determine if sequential access is in progress
459  * in accordance with the behavior specified.
460  * Update state to indicate current access pattern.
461  *
462  * object must have at least the shared lock held
463  */
464 static
465 void
466 vm_fault_is_sequential(
467 	vm_object_t             object,
468 	vm_object_offset_t      offset,
469 	vm_behavior_t           behavior)
470 {
471 	vm_object_offset_t      last_alloc;
472 	int                     sequential;
473 	int                     orig_sequential;
474 
475 	last_alloc = object->last_alloc;
476 	sequential = object->sequential;
477 	orig_sequential = sequential;
478 
479 	offset = vm_object_trunc_page(offset);
480 	if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
481 		/* re-faulting in the same page: no change in behavior */
482 		return;
483 	}
484 
485 	switch (behavior) {
486 	case VM_BEHAVIOR_RANDOM:
487 		/*
488 		 * reset indicator of sequential behavior
489 		 */
490 		sequential = 0;
491 		break;
492 
493 	case VM_BEHAVIOR_SEQUENTIAL:
494 		if (offset && last_alloc == offset - PAGE_SIZE_64) {
495 			/*
496 			 * advance indicator of sequential behavior
497 			 */
498 			if (sequential < MAX_SEQUENTIAL_RUN) {
499 				sequential += PAGE_SIZE;
500 			}
501 		} else {
502 			/*
503 			 * reset indicator of sequential behavior
504 			 */
505 			sequential = 0;
506 		}
507 		break;
508 
509 	case VM_BEHAVIOR_RSEQNTL:
510 		if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
511 			/*
512 			 * advance indicator of sequential behavior
513 			 */
514 			if (sequential > -MAX_SEQUENTIAL_RUN) {
515 				sequential -= PAGE_SIZE;
516 			}
517 		} else {
518 			/*
519 			 * reset indicator of sequential behavior
520 			 */
521 			sequential = 0;
522 		}
523 		break;
524 
525 	case VM_BEHAVIOR_DEFAULT:
526 	default:
527 		if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
528 			/*
529 			 * advance indicator of sequential behavior
530 			 */
531 			if (sequential < 0) {
532 				sequential = 0;
533 			}
534 			if (sequential < MAX_SEQUENTIAL_RUN) {
535 				sequential += PAGE_SIZE;
536 			}
537 		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
538 			/*
539 			 * advance indicator of sequential behavior
540 			 */
541 			if (sequential > 0) {
542 				sequential = 0;
543 			}
544 			if (sequential > -MAX_SEQUENTIAL_RUN) {
545 				sequential -= PAGE_SIZE;
546 			}
547 		} else {
548 			/*
549 			 * reset indicator of sequential behavior
550 			 */
551 			sequential = 0;
552 		}
553 		break;
554 	}
555 	if (sequential != orig_sequential) {
556 		if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
557 			/*
558 			 * if someone else has already updated object->sequential
559 			 * don't bother trying to update it or object->last_alloc
560 			 */
561 			return;
562 		}
563 	}
564 	/*
565 	 * I'd like to do this with an OSCompareAndSwap64, but that
566 	 * doesn't exist for PPC...  however, it shouldn't matter
567 	 * that much... last_alloc is maintained so that we can determine
568 	 * if a sequential access pattern is taking place... if only
569 	 * one thread is banging on this object, no problem with the unprotected
570 	 * update... if 2 or more threads are banging away, we run the risk of
571 	 * someone seeing a mangled update... however, in the face of multiple
572 	 * accesses, no sequential access pattern can develop anyway, so we
573 	 * haven't lost any real info.
574 	 */
575 	object->last_alloc = offset;
576 }
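/*
 * Worked example (illustrative only, assuming 4KB pages, default behavior
 * and a counter starting at 0): a thread faulting offsets 0x1000, 0x2000 and
 * 0x3000 in order advances object->sequential by PAGE_SIZE on each fault, so
 * the counter reads 3 * PAGE_SIZE afterwards; a fault at an unrelated offset
 * resets it to 0.
 */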
577 
578 #if DEVELOPMENT || DEBUG
579 SCALABLE_COUNTER_DEFINE(vm_page_deactivate_behind_count);
580 #endif /* DEVELOPMENT || DEBUG */
581 
582 /*
583  * @func vm_fault_deactivate_behind
584  *
585  * @description
586  * Determine if sequential access is in progress
587  * in accordance with the behavior specified.  If
588  * so, compute a potential page to deactivate and
589  * deactivate it.
590  *
591  * object must be locked.
592  *
593  * @returns the number of deactivated pages
594  */
595 static
596 uint32_t
597 vm_fault_deactivate_behind(
598 	vm_object_t             object,
599 	vm_object_offset_t      offset,
600 	vm_behavior_t           behavior)
601 {
602 	uint32_t        pages_in_run = 0;
603 	uint32_t        max_pages_in_run = 0;
604 	int32_t         sequential_run;
605 	vm_behavior_t   sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
606 	vm_object_offset_t      run_offset = 0;
607 	vm_object_offset_t      pg_offset = 0;
608 	vm_page_t       m;
609 	vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
610 
611 #if TRACEFAULTPAGE
612 	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
613 #endif
614 	if (is_kernel_object(object) ||
615 	    !vm_page_deactivate_behind ||
616 	    (vm_object_trunc_page(offset) != offset) ||
617 	    (object->resident_page_count - object->wired_page_count <
618 	    vm_page_active_count / vm_page_deactivate_behind_min_resident_ratio)) {
619 		/*
620 		 * Do not deactivate pages from the kernel object: they
621 		 * are not intended to become pageable.
622 		 * or we've disabled the deactivate behind mechanism
623 		 * or we are dealing with an offset that is not aligned to
624 		 * the system's PAGE_SIZE because in that case we will
625 		 * handle the deactivation on the aligned offset and, thus,
626 		 * the full PAGE_SIZE page once. This helps us avoid the redundant
627 		 * deactivates and the extra faults.
628 		 *
629 		 * Objects need only participate in backwards
630 		 * deactivation if they are exceedingly large (i.e. their
631 		 * resident pages are liable to comprise a substantially large
632 		 * portion of the active queue and push out the rest of the
633 		 * system's working set).
634 		 */
635 		return 0;
636 	}
637 
638 	KDBG_FILTERED(VMDBG_CODE(DBG_VM_FAULT_DEACTIVATE_BEHIND) | DBG_FUNC_START,
639 	    VM_KERNEL_ADDRHIDE(object), offset, behavior);
640 
641 	if ((sequential_run = object->sequential)) {
642 		if (sequential_run < 0) {
643 			sequential_behavior = VM_BEHAVIOR_RSEQNTL;
644 			sequential_run = 0 - sequential_run;
645 		} else {
646 			sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
647 		}
648 	}
649 	switch (behavior) {
650 	case VM_BEHAVIOR_RANDOM:
651 		break;
652 	case VM_BEHAVIOR_SEQUENTIAL:
653 		if (sequential_run >= (int)PAGE_SIZE) {
654 			run_offset = 0 - PAGE_SIZE_64;
655 			max_pages_in_run = 1;
656 		}
657 		break;
658 	case VM_BEHAVIOR_RSEQNTL:
659 		if (sequential_run >= (int)PAGE_SIZE) {
660 			run_offset = PAGE_SIZE_64;
661 			max_pages_in_run = 1;
662 		}
663 		break;
664 	case VM_BEHAVIOR_DEFAULT:
665 	default:
666 	{       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
667 
668 		/*
669 		 * determine if the run of sequential access has been
670 		 * long enough on an object with default access behavior
671 		 * to consider it for deactivation
672 		 */
673 		if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
674 			/*
675 			 * the comparisons between offset and behind are done
676 			 * in this kind of odd fashion in order to prevent wrap around
677 			 * at the end points
678 			 */
679 			if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
680 				if (offset >= behind) {
681 					run_offset = 0 - behind;
682 					pg_offset = PAGE_SIZE_64;
683 					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
684 				}
685 			} else {
686 				if (offset < -behind) {
687 					run_offset = behind;
688 					pg_offset = 0 - PAGE_SIZE_64;
689 					max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
690 				}
691 			}
692 		}
693 		break;}
694 	}
695 	for (unsigned n = 0; n < max_pages_in_run; n++) {
696 		m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
697 
698 		if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache &&
699 		    (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) &&
700 		    !vm_page_is_fictitious(m) && !m->vmp_absent) {
701 			page_run[pages_in_run++] = m;
702 
703 			/*
704 			 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
705 			 *
706 			 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
707 			 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
708 			 * new reference happens. If no further references happen on the page after that remote TLB flushes
709 			 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
710 			 * by pageout_scan, which is just fine since the last reference would have happened quite far
711 			 * in the past (TLB caches don't hang around for very long), and of course could just as easily
712 			 * have happened before we did the deactivate_behind.
713 			 */
714 			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
715 		}
716 	}
717 
718 	if (pages_in_run) {
719 		vm_page_lockspin_queues();
720 
721 		for (unsigned n = 0; n < pages_in_run; n++) {
722 			m = page_run[n];
723 
724 			vm_page_deactivate_internal(m, FALSE);
725 
726 #if DEVELOPMENT || DEBUG
727 			counter_inc(&vm_page_deactivate_behind_count);
728 #endif /* DEVELOPMENT || DEBUG */
729 
730 #if TRACEFAULTPAGE
731 			dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
732 #endif
733 		}
734 		vm_page_unlock_queues();
735 	}
736 
737 	KDBG_FILTERED(VMDBG_CODE(DBG_VM_FAULT_DEACTIVATE_BEHIND) | DBG_FUNC_END,
738 	    pages_in_run);
739 
740 	return pages_in_run;
741 }
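/*
 * Worked example (illustrative only, assuming 4KB pages and the defaults
 * above): once a forward sequential run under VM_BEHAVIOR_DEFAULT reaches
 * 128 pages (512KB), every 16th page of further progress deactivates the
 * cluster of 16 pages sitting 128 pages behind the current fault offset.
 */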
742 
743 
744 #if (DEVELOPMENT || DEBUG)
745 uint32_t        vm_page_creation_throttled_hard = 0;
746 uint32_t        vm_page_creation_throttled_soft = 0;
747 uint64_t        vm_page_creation_throttle_avoided = 0;
748 #endif /* DEVELOPMENT || DEBUG */
749 
750 static int
751 vm_page_throttled(boolean_t page_kept)
752 {
753 	clock_sec_t     elapsed_sec;
754 	clock_sec_t     tv_sec;
755 	clock_usec_t    tv_usec;
756 	task_t          curtask = current_task_early();
757 
758 	thread_t thread = current_thread();
759 
760 	if (thread->options & TH_OPT_VMPRIV) {
761 		return 0;
762 	}
763 
764 	if (curtask && !curtask->active) {
765 		return 0;
766 	}
767 
768 	if (thread->t_page_creation_throttled) {
769 		thread->t_page_creation_throttled = 0;
770 
771 		if (page_kept == FALSE) {
772 			goto no_throttle;
773 		}
774 	}
775 	if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
776 #if (DEVELOPMENT || DEBUG)
777 		thread->t_page_creation_throttled_hard++;
778 		OSAddAtomic(1, &vm_page_creation_throttled_hard);
779 #endif /* DEVELOPMENT || DEBUG */
780 		return HARD_THROTTLE_DELAY;
781 	}
782 
783 	if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
784 	    thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
785 		if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
786 #if (DEVELOPMENT || DEBUG)
787 			OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
788 #endif
789 			goto no_throttle;
790 		}
791 		clock_get_system_microtime(&tv_sec, &tv_usec);
792 
793 		elapsed_sec = tv_sec - thread->t_page_creation_time;
794 
795 		if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
796 		    (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
797 			if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
798 				/*
799 				 * we'll reset our stats to give a well behaved app
800 				 * that was unlucky enough to accumulate a bunch of pages
801 				 * over a long period of time a chance to get out of
802 				 * the throttled state... we reset the counter and timestamp
803 				 * so that if it stays under the rate limit for the next second
804 				 * it will be back in our good graces... if it exceeds it, it
805 				 * will remain in the throttled state
806 				 */
807 				thread->t_page_creation_time = tv_sec;
808 				thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
809 			}
810 			VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
811 
812 			thread->t_page_creation_throttled = 1;
813 
814 			if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
815 #if (DEVELOPMENT || DEBUG)
816 				thread->t_page_creation_throttled_hard++;
817 				OSAddAtomic(1, &vm_page_creation_throttled_hard);
818 #endif /* DEVELOPMENT || DEBUG */
819 				return HARD_THROTTLE_DELAY;
820 			} else {
821 #if (DEVELOPMENT || DEBUG)
822 				thread->t_page_creation_throttled_soft++;
823 				OSAddAtomic(1, &vm_page_creation_throttled_soft);
824 #endif /* DEVELOPMENT || DEBUG */
825 				return SOFT_THROTTLE_DELAY;
826 			}
827 		}
828 		thread->t_page_creation_time = tv_sec;
829 		thread->t_page_creation_count = 0;
830 	}
831 no_throttle:
832 	thread->t_page_creation_count++;
833 
834 	return 0;
835 }
836 
837 extern boolean_t vm_pageout_running;
838 static __attribute__((noinline, not_tail_called)) void
839 __VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(
840 	int throttle_delay)
841 {
842 	/* make sure vm_pageout_scan() gets to work while we're throttled */
843 	if (!vm_pageout_running) {
844 		thread_wakeup((event_t)&vm_page_free_wanted);
845 	}
846 	delay(throttle_delay);
847 }
848 
849 
850 /*
851  * check for various conditions that would
852  * prevent us from creating a ZF page...
853  * cleanup is based on being called from vm_fault_page
854  *
855  * object must be locked
856  * object == m->vmp_object
857  */
858 static vm_fault_return_t
859 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
860 {
861 	int throttle_delay;
862 
863 	if (object->shadow_severed ||
864 	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
865 		/*
866 		 * Either:
867 		 * 1. the shadow chain was severed,
868 		 * 2. the purgeable object is volatile or empty and is marked
869 		 *    to fault on access while volatile.
870 		 * Just have to return an error at this point
871 		 */
872 		if (m != VM_PAGE_NULL) {
873 			VM_PAGE_FREE(m);
874 		}
875 		vm_fault_cleanup(object, first_m);
876 
877 		thread_interrupt_level(interruptible_state);
878 
879 		if (VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
880 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
881 		}
882 
883 		if (object->shadow_severed) {
884 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
885 		}
886 		return VM_FAULT_MEMORY_ERROR;
887 	}
888 	if (page_throttle == TRUE) {
889 		if ((throttle_delay = vm_page_throttled(FALSE))) {
890 			/*
891 			 * we're throttling zero-fills...
892 			 * treat this as if we couldn't grab a page
893 			 */
894 			if (m != VM_PAGE_NULL) {
895 				VM_PAGE_FREE(m);
896 			}
897 			vm_fault_cleanup(object, first_m);
898 
899 			VM_DEBUG_EVENT(vmf_check_zfdelay, DBG_VM_FAULT_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
900 
901 			__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
902 
903 			if (current_thread_aborted()) {
904 				thread_interrupt_level(interruptible_state);
905 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
906 				return VM_FAULT_INTERRUPTED;
907 			}
908 			thread_interrupt_level(interruptible_state);
909 
910 			return VM_FAULT_MEMORY_SHORTAGE;
911 		}
912 	}
913 	return VM_FAULT_SUCCESS;
914 }
915 
916 /*
917  * Clear the code signing bits on the given page_t
918  */
919 static void
920 vm_fault_cs_clear(vm_page_t m)
921 {
922 	m->vmp_cs_validated = VMP_CS_ALL_FALSE;
923 	m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
924 	m->vmp_cs_nx = VMP_CS_ALL_FALSE;
925 }
926 
927 /*
928  * Enqueues the given page on the throttled queue.
929  * The caller must hold the vm_page_queue_lock and it will be held on return.
930  */
931 static void
932 vm_fault_enqueue_throttled_locked(vm_page_t m)
933 {
934 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
935 	assert(!VM_PAGE_WIRED(m));
936 
937 	/*
938 	 * can't be on the pageout queue since we don't
939 	 * have a pager to try and clean to
940 	 */
941 	vm_page_queues_remove(m, TRUE);
942 	vm_page_check_pageable_safe(m);
943 	vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
944 	m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
945 	vm_page_throttled_count++;
946 }
947 
948 /*
949  * do the work to zero fill a page and
950  * inject it into the correct paging queue
951  *
952  * m->vmp_object must be locked
953  * page queue lock must NOT be held
954  */
955 static int
956 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
957 {
958 	int my_fault = DBG_ZERO_FILL_FAULT;
959 	vm_object_t     object;
960 
961 	object = VM_PAGE_OBJECT(m);
962 
963 	/*
964 	 * This is a zero-fill page fault...
965 	 *
966 	 * Checking the page lock is a waste of
967 	 * time;  this page was absent, so
968 	 * it can't be page locked by a pager.
969 	 *
970 	 * we also consider it undefined
971 	 * with respect to instruction
972 	 * execution.  i.e. it is the responsibility
973 	 * of higher layers to call for an instruction
974 	 * sync after changing the contents and before
975 	 * sending a program into this area.  We
976 	 * choose this approach for performance
977 	 */
978 	vm_fault_cs_clear(m);
979 	m->vmp_pmapped = TRUE;
980 
981 	if (no_zero_fill == TRUE) {
982 		my_fault = DBG_NZF_PAGE_FAULT;
983 
984 		if (m->vmp_absent && m->vmp_busy) {
985 			return my_fault;
986 		}
987 	} else {
988 		vm_page_zero_fill(
989 			m
990 #if HAS_MTE
991 			, true /* zero_tags */
992 #endif /* HAS_MTE */
993 			);
994 
995 		counter_inc(&vm_statistics_zero_fill_count);
996 		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
997 	}
998 	assert(!m->vmp_laundry);
999 	assert(!is_kernel_object(object));
1000 	//assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
1001 	if (!VM_DYNAMIC_PAGING_ENABLED() &&
1002 	    (object->purgable == VM_PURGABLE_DENY ||
1003 	    object->purgable == VM_PURGABLE_NONVOLATILE ||
1004 	    object->purgable == VM_PURGABLE_VOLATILE)) {
1005 		vm_page_lockspin_queues();
1006 		if (!VM_DYNAMIC_PAGING_ENABLED()) {
1007 			vm_fault_enqueue_throttled_locked(m);
1008 		}
1009 		vm_page_unlock_queues();
1010 	}
1011 	return my_fault;
1012 }
1013 
1014 /*
1015  * Recovery actions for vm_fault_page
1016  */
1017 __attribute__((always_inline))
1018 static void
1019 vm_fault_page_release_page(
1020 	vm_page_t m,                    /* Page to release */
1021 	bool *clear_absent_on_error /* IN/OUT */)
1022 {
1023 	vm_page_wakeup_done(VM_PAGE_OBJECT(m), m);
1024 	if (!VM_PAGE_PAGEABLE(m)) {
1025 		vm_page_lockspin_queues();
1026 		if (*clear_absent_on_error && m->vmp_absent) {
1027 			vm_page_zero_fill(
1028 				m
1029 #if HAS_MTE
1030 				, false /* zero_tags */
1031 #endif /* HAS_MTE */
1032 				);
1033 			counter_inc(&vm_statistics_zero_fill_count);
1034 			DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
1035 			m->vmp_absent = false;
1036 		}
1037 		if (!VM_PAGE_PAGEABLE(m)) {
1038 			if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
1039 				vm_page_deactivate(m);
1040 			} else {
1041 				vm_page_activate(m);
1042 			}
1043 		}
1044 		vm_page_unlock_queues();
1045 	}
1046 	*clear_absent_on_error = false;
1047 }
1048 /*
1049  *	Routine:	vm_fault_page
1050  *	Purpose:
1051  *		Find the resident page for the virtual memory
1052  *		specified by the given virtual memory object
1053  *		and offset.
1054  *	Additional arguments:
1055  *		The required permissions for the page is given
1056  *		in "fault_type".  Desired permissions are included
1057  *		in "protection".
1058  *		fault_info is passed along to determine pagein cluster
1059  *		limits... it contains the expected reference pattern,
1060  *		cluster size if available, etc...
1061  *
1062  *		If the desired page is known to be resident (for
1063  *		example, because it was previously wired down), asserting
1064  *		the "unwiring" parameter will speed the search.
1065  *
1066  *		If the operation can be interrupted (by thread_abort
1067  *		or thread_terminate), then the "interruptible"
1068  *		parameter should be asserted.
1069  *
1070  *	Results:
1071  *		The page containing the proper data is returned
1072  *		in "result_page".
1073  *
1074  *	In/out conditions:
1075  *		The source object must be locked and referenced,
1076  *		and must donate one paging reference.  The reference
1077  *		is not affected.  The paging reference and lock are
1078  *		consumed.
1079  *
1080  *		If the call succeeds, the object in which "result_page"
1081  *		resides is left locked and holding a paging reference.
1082  *		If this is not the original object, a busy page in the
1083  *		original object is returned in "top_page", to prevent other
1084  *		callers from pursuing this same data, along with a paging
1085  *		reference for the original object.  The "top_page" should
1086  *		be destroyed when this guarantee is no longer required.
1087  *		The "result_page" is also left busy.  It is not removed
1088  *		from the pageout queues.
1089  *	Special Case:
1090  *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
1091  *		fault succeeded but there's no VM page (i.e. the VM object
1092  *              does not actually hold VM pages, but device memory or
1093  *		large pages).  The object is still locked and we still hold a
1094  *		paging_in_progress reference.
1095  */
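/*
 * Hypothetical caller sketch (simplified and for illustration only; real
 * callers such as vm_fault_internal() handle many more outcomes).  It follows
 * the contract documented above: the object is passed in locked with a paging
 * reference donated, and on success the result page's object comes back
 * locked with a paging reference, the page left busy, and any "top_page" to
 * be disposed of via vm_fault_cleanup().
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);
 *	result = vm_fault_page(object, offset, VM_PROT_READ,
 *	    FALSE, FALSE,
 *	    &prot, &result_page, &top_page,
 *	    NULL, &error_code, FALSE, &fault_info);
 *	if (result == VM_FAULT_SUCCESS && result_page != VM_PAGE_NULL) {
 *	        ... use the page contents ...
 *	        vm_page_wakeup_done(VM_PAGE_OBJECT(result_page), result_page);
 *	        vm_fault_cleanup(VM_PAGE_OBJECT(result_page), top_page);
 *	}
 */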
1096 unsigned int vm_fault_page_blocked_access = 0;
1097 unsigned int vm_fault_page_forced_retry = 0;
1098 
1099 vm_fault_return_t
1100 vm_fault_page(
1101 	/* Arguments: */
1102 	vm_object_t     first_object,   /* Object to begin search */
1103 	vm_object_offset_t first_offset,        /* Offset into object */
1104 	vm_prot_t       fault_type,     /* What access is requested */
1105 	boolean_t       must_be_resident,/* Must page be resident? */
1106 	boolean_t       caller_lookup,  /* caller looked up page */
1107 	/* Modifies in place: */
1108 	vm_prot_t       *protection,    /* Protection for mapping */
1109 	vm_page_t       *result_page,   /* Page found, if successful */
1110 	/* Returns: */
1111 	vm_page_t       *top_page,      /* Page in top object, if
1112                                          * not result_page.  */
1113 	int             *type_of_fault, /* if non-null, fill in with type of fault
1114                                          * COW, zero-fill, etc... returned in trace point */
1115 	/* More arguments: */
1116 	kern_return_t   *error_code,    /* code if page is in error */
1117 	boolean_t       no_zero_fill,   /* don't zero fill absent pages */
1118 	vm_object_fault_info_t fault_info)
1119 {
1120 	vm_page_t               m;
1121 	vm_object_t             object;
1122 	vm_object_offset_t      offset;
1123 	vm_page_t               first_m;
1124 	vm_object_t             next_object;
1125 	vm_object_t             copy_object;
1126 	boolean_t               look_for_page;
1127 	boolean_t               force_fault_retry = FALSE;
1128 	vm_prot_t               access_required = fault_type;
1129 	vm_prot_t               wants_copy_flag;
1130 	kern_return_t           wait_result;
1131 	wait_interrupt_t        interruptible_state;
1132 	boolean_t               data_already_requested = FALSE;
1133 	vm_behavior_t           orig_behavior;
1134 	vm_size_t               orig_cluster_size;
1135 	vm_fault_return_t       error;
1136 	int                     my_fault;
1137 	uint32_t                try_failed_count;
1138 	wait_interrupt_t        interruptible; /* how may fault be interrupted? */
1139 	int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
1140 	memory_object_t         pager;
1141 	vm_fault_return_t       retval;
1142 	vm_grab_options_t       grab_options;
1143 	bool                    clear_absent_on_error = false;
1144 
1145 /*
1146  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
1147  * marked as paged out in the compressor pager or the pager doesn't exist.
1148  * Note also that if the pager for an internal object
1149  * has not been created, the pager is not invoked regardless of the value
1150  * of MUST_ASK_PAGER().
1151  *
1152  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
1153  * is marked as paged out in the compressor pager.
1154  * PAGED_OUT() is used to determine if a page has already been pushed
1155  * into a copy object in order to avoid a redundant page out operation.
1156  */
1157 #define MUST_ASK_PAGER(o, f, s)                                 \
1158 	((s = vm_object_compressor_pager_state_get((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
1159 
1160 #define PAGED_OUT(o, f) \
1161 	(vm_object_compressor_pager_state_get((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
1162 
1163 #if TRACEFAULTPAGE
1164 	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
1165 #endif
1166 
1167 	interruptible = fault_info->interruptible;
1168 	interruptible_state = thread_interrupt_level(interruptible);
1169 
1170 	/*
1171 	 *	INVARIANTS (through entire routine):
1172 	 *
1173 	 *	1)	At all times, we must either have the object
1174 	 *		lock or a busy page in some object to prevent
1175 	 *		some other thread from trying to bring in
1176 	 *		the same page.
1177 	 *
1178 	 *		Note that we cannot hold any locks during the
1179 	 *		pager access or when waiting for memory, so
1180 	 *		we use a busy page then.
1181 	 *
1182 	 *	2)	To prevent another thread from racing us down the
1183 	 *		shadow chain and entering a new page in the top
1184 	 *		object before we do, we must keep a busy page in
1185 	 *		the top object while following the shadow chain.
1186 	 *
1187 	 *	3)	We must increment paging_in_progress on any object
1188 	 *		for which we have a busy page before dropping
1189 	 *		the object lock
1190 	 *
1191 	 *	4)	We leave busy pages on the pageout queues.
1192 	 *		If the pageout daemon comes across a busy page,
1193 	 *		it will remove the page from the pageout queues.
1194 	 */
1195 
1196 	object = first_object;
1197 	offset = first_offset;
1198 	first_m = VM_PAGE_NULL;
1199 	access_required = fault_type;
1200 
1201 	/*
1202 	 * default type of fault
1203 	 */
1204 	my_fault = DBG_CACHE_HIT_FAULT;
1205 	thread_pri_floor_t token;
1206 	bool    drop_floor = false;
1207 
1208 	while (TRUE) {
1209 #if TRACEFAULTPAGE
1210 		dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1211 #endif
1212 
1213 		grab_options = vm_page_grab_options_for_object(object);
1214 #if HAS_MTE
1215 		if (!(grab_options & VM_PAGE_GRAB_MTE) &&
1216 		    mteinfo_vm_tag_can_use_tag_storage((vm_tag_t)fault_info->user_tag)) {
1217 			grab_options |= VM_PAGE_GRAB_ALLOW_TAG_STORAGE;
1218 		}
1219 #endif /* HAS_MTE */
1220 
1221 		if (!object->alive) {
1222 			/*
1223 			 * object is no longer valid
1224 			 * clean up and return error
1225 			 */
1226 #if DEVELOPMENT || DEBUG
1227 			printf("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->vo_copy, object->shadow, object->alive, object->terminating, object->named, os_ref_get_count_raw(&object->ref_count), object->shadow_severed);
1228 			if (panic_object_not_alive) {
1229 				panic("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->vo_copy, object->shadow, object->alive, object->terminating, object->named, os_ref_get_count_raw(&object->ref_count), object->shadow_severed);
1230 			}
1231 #endif /* DEVELOPMENT || DEBUG */
1232 			vm_fault_cleanup(object, first_m);
1233 			thread_interrupt_level(interruptible_state);
1234 
1235 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NOT_ALIVE), 0 /* arg */);
1236 			return VM_FAULT_MEMORY_ERROR;
1237 		}
1238 
1239 		if (!object->pager_created && object->phys_contiguous) {
1240 			/*
1241 			 * A physically-contiguous object without a pager:
1242 			 * must be a "large page" object.  We do not deal
1243 			 * with VM pages for this object.
1244 			 */
1245 			caller_lookup = FALSE;
1246 			m = VM_PAGE_NULL;
1247 			goto phys_contig_object;
1248 		}
1249 
1250 		if (object->blocked_access) {
1251 			/*
1252 			 * Access to this VM object has been blocked.
1253 			 * Replace our "paging_in_progress" reference with
1254 			 * a "activity_in_progress" reference and wait for
1255 			 * access to be unblocked.
1256 			 */
1257 			caller_lookup = FALSE; /* no longer valid after sleep */
1258 			vm_object_activity_begin(object);
1259 			vm_object_paging_end(object);
1260 			while (object->blocked_access) {
1261 				vm_object_sleep(object,
1262 				    VM_OBJECT_EVENT_UNBLOCKED,
1263 				    THREAD_UNINT, LCK_SLEEP_EXCLUSIVE);
1264 			}
1265 			vm_fault_page_blocked_access++;
1266 			vm_object_paging_begin(object);
1267 			vm_object_activity_end(object);
1268 		}
1269 
1270 		/*
1271 		 * See whether the page at 'offset' is resident
1272 		 */
1273 		if (caller_lookup == TRUE) {
1274 			/*
1275 			 * The caller has already looked up the page
1276 			 * and gave us the result in "result_page".
1277 			 * We can use this for the first lookup but
1278 			 * it loses its validity as soon as we unlock
1279 			 * the object.
1280 			 */
1281 			m = *result_page;
1282 			caller_lookup = FALSE; /* no longer valid after that */
1283 		} else {
1284 			m = vm_page_lookup(object, vm_object_trunc_page(offset));
1285 		}
1286 #if TRACEFAULTPAGE
1287 		dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1288 #endif
1289 		if (m != VM_PAGE_NULL) {
1290 			if (m->vmp_busy) {
1291 				/*
1292 				 * The page is being brought in,
1293 				 * wait for it and then retry.
1294 				 */
1295 #if TRACEFAULTPAGE
1296 				dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1297 #endif
1298 				if (fault_info->fi_no_sleep) {
1299 					/* Caller has requested not to sleep on busy pages */
1300 					vm_fault_cleanup(object, first_m);
1301 					thread_interrupt_level(interruptible_state);
1302 					return VM_FAULT_BUSY;
1303 				}
1304 
1305 				wait_result = vm_page_sleep(object, m, interruptible, LCK_SLEEP_DEFAULT);
1306 
1307 				if (wait_result != THREAD_AWAKENED) {
1308 					vm_fault_cleanup(object, first_m);
1309 					thread_interrupt_level(interruptible_state);
1310 
1311 					if (wait_result == THREAD_RESTART) {
1312 						return VM_FAULT_RETRY;
1313 					} else {
1314 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
1315 						return VM_FAULT_INTERRUPTED;
1316 					}
1317 				}
1318 				continue;
1319 			}
1320 			if (m->vmp_laundry) {
1321 				m->vmp_free_when_done = FALSE;
1322 
1323 				if (!m->vmp_cleaning) {
1324 					vm_pageout_steal_laundry(m, FALSE);
1325 				}
1326 			}
1327 			vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
1328 			if (vm_page_is_guard(m)) {
1329 				/*
1330 				 * Guard page: off limits !
1331 				 */
1332 				if (fault_type == VM_PROT_NONE) {
1333 					/*
1334 					 * The fault is not requesting any
1335 					 * access to the guard page, so it must
1336 					 * be just to wire or unwire it.
1337 					 * Let's pretend it succeeded...
1338 					 */
1339 					m->vmp_busy = TRUE;
1340 					*result_page = m;
1341 					assert(first_m == VM_PAGE_NULL);
1342 					*top_page = first_m;
1343 					if (type_of_fault) {
1344 						*type_of_fault = DBG_GUARD_FAULT;
1345 					}
1346 					thread_interrupt_level(interruptible_state);
1347 					return VM_FAULT_SUCCESS;
1348 				} else {
1349 					/*
1350 					 * The fault requests access to the
1351 					 * guard page: let's deny that !
1352 					 */
1353 					vm_fault_cleanup(object, first_m);
1354 					thread_interrupt_level(interruptible_state);
1355 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_GUARDPAGE_FAULT), 0 /* arg */);
1356 					return VM_FAULT_MEMORY_ERROR;
1357 				}
1358 			}
1359 
1360 
1361 			if (m->vmp_error) {
1362 				/*
1363 				 * The page is in error, give up now.
1364 				 */
1365 #if TRACEFAULTPAGE
1366 				dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1367 #endif
1368 				if (error_code) {
1369 					*error_code = KERN_MEMORY_ERROR;
1370 				}
1371 				VM_PAGE_FREE(m);
1372 
1373 				vm_fault_cleanup(object, first_m);
1374 				thread_interrupt_level(interruptible_state);
1375 
1376 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_ERROR), 0 /* arg */);
1377 				return VM_FAULT_MEMORY_ERROR;
1378 			}
1379 			if (m->vmp_restart) {
1380 				/*
1381 				 * The pager wants us to restart
1382 				 * at the top of the chain,
1383 				 * typically because it has moved the
1384 				 * page to another pager, then do so.
1385 				 */
1386 #if TRACEFAULTPAGE
1387 				dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1388 #endif
1389 				VM_PAGE_FREE(m);
1390 
1391 				vm_fault_cleanup(object, first_m);
1392 				thread_interrupt_level(interruptible_state);
1393 
1394 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_RESTART), 0 /* arg */);
1395 				return VM_FAULT_RETRY;
1396 			}
1397 			if (m->vmp_absent) {
1398 				/*
1399 				 * The page isn't busy, but is absent,
1400 				 * therefore it's deemed "unavailable".
1401 				 *
1402 				 * Remove the non-existent page (unless it's
1403 				 * in the top object) and move on down to the
1404 				 * next object (if there is one).
1405 				 */
1406 #if TRACEFAULTPAGE
1407 				dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1408 #endif
1409 				next_object = object->shadow;
1410 
1411 				if (next_object == VM_OBJECT_NULL) {
1412 					/*
1413 					 * Absent page at bottom of shadow
1414 					 * chain; zero fill the page we left
1415 					 * busy in the first object, and free
1416 					 * the absent page.
1417 					 */
1418 					assert(!must_be_resident);
1419 
1420 					/*
1421 					 * check for any conditions that prevent
1422 					 * us from creating a new zero-fill page
1423 					 * vm_fault_check will do all of the
1424 					 * fault cleanup in the case of an error condition
1425 					 * including resetting the thread_interrupt_level
1426 					 */
1427 					error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1428 
1429 					if (error != VM_FAULT_SUCCESS) {
1430 						return error;
1431 					}
1432 
1433 					if (object != first_object) {
1434 						/*
1435 						 * free the absent page we just found
1436 						 */
1437 						VM_PAGE_FREE(m);
1438 
1439 						/*
1440 						 * drop reference and lock on current object
1441 						 */
1442 						vm_object_paging_end(object);
1443 						vm_object_unlock(object);
1444 
1445 						/*
1446 						 * grab the original page we
1447 						 * 'soldered' in place and
1448 						 * retake lock on 'first_object'
1449 						 */
1450 						m = first_m;
1451 						first_m = VM_PAGE_NULL;
1452 
1453 						object = first_object;
1454 						offset = first_offset;
1455 
1456 						vm_object_lock(object);
1457 					} else {
1458 						/*
1459 						 * we're going to use the absent page we just found
1460 						 * so convert it to a 'busy' page
1461 						 */
1462 						m->vmp_absent = FALSE;
1463 						m->vmp_busy = TRUE;
1464 					}
1465 					if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1466 						m->vmp_absent = TRUE;
1467 						clear_absent_on_error = true;
1468 					}
1469 					/*
1470 					 * zero-fill the page and put it on
1471 					 * the correct paging queue
1472 					 */
1473 					my_fault = vm_fault_zero_page(m, no_zero_fill);
1474 
1475 					break;
1476 				} else {
1477 					if (must_be_resident) {
1478 						vm_object_paging_end(object);
1479 					} else if (object != first_object) {
1480 						vm_object_paging_end(object);
1481 						VM_PAGE_FREE(m);
1482 					} else {
1483 						first_m = m;
1484 						m->vmp_absent = FALSE;
1485 						m->vmp_busy = TRUE;
1486 
1487 						vm_page_lockspin_queues();
1488 						vm_page_queues_remove(m, FALSE);
1489 						vm_page_unlock_queues();
1490 					}
1491 
1492 					offset += object->vo_shadow_offset;
1493 					fault_info->lo_offset += object->vo_shadow_offset;
1494 					fault_info->hi_offset += object->vo_shadow_offset;
1495 					access_required = VM_PROT_READ;
1496 
1497 					vm_object_lock(next_object);
1498 					vm_object_unlock(object);
1499 					object = next_object;
1500 					vm_object_paging_begin(object);
1501 
1502 					/*
1503 					 * reset to default type of fault
1504 					 */
1505 					my_fault = DBG_CACHE_HIT_FAULT;
1506 
1507 					continue;
1508 				}
1509 			}
1510 			if ((m->vmp_cleaning)
1511 			    && ((object != first_object) || (object->vo_copy != VM_OBJECT_NULL))
1512 			    && (fault_type & VM_PROT_WRITE)) {
1513 				/*
1514 				 * This is a copy-on-write fault that will
1515 				 * cause us to revoke access to this page, but
1516 				 * this page is in the process of being cleaned
1517 				 * in a clustered pageout. We must wait until
1518 				 * the cleaning operation completes before
1519 				 * revoking access to the original page,
1520 				 * otherwise we might attempt to remove a
1521 				 * wired mapping.
1522 				 */
1523 #if TRACEFAULTPAGE
1524 				dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1525 #endif
1526 				/*
1527 				 * take an extra ref so that object won't die
1528 				 */
1529 				vm_object_reference_locked(object);
1530 
1531 				vm_fault_cleanup(object, first_m);
1532 
1533 				vm_object_lock(object);
1534 				assert(os_ref_get_count_raw(&object->ref_count) > 0);
1535 
1536 				m = vm_page_lookup(object, vm_object_trunc_page(offset));
1537 
1538 				if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1539 					wait_result = vm_page_sleep(object, m, interruptible, LCK_SLEEP_UNLOCK);
1540 					vm_object_deallocate(object);
1541 					goto backoff;
1542 				} else {
1543 					vm_object_unlock(object);
1544 
1545 					vm_object_deallocate(object);
1546 					thread_interrupt_level(interruptible_state);
1547 
1548 					return VM_FAULT_RETRY;
1549 				}
1550 			}
1551 			if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1552 			    !(fault_info != NULL && fault_info->stealth)) {
1553 				/*
1554 				 * If we were passed a non-NULL pointer for
1555 				 * "type_of_fault", then we came from
1556 				 * vm_fault... we'll let it deal with
1557 				 * this condition, since it
1558 				 * needs to see m->vmp_speculative to correctly
1559 				 * account the pageins, otherwise...
1560 				 * take it off the speculative queue, we'll
1561 				 * let the caller of vm_fault_page deal
1562 				 * with getting it onto the correct queue
1563 				 *
1564 				 * If the caller specified in fault_info that
1565 				 * it wants a "stealth" fault, we also leave
1566 				 * the page in the speculative queue.
1567 				 */
1568 				vm_page_lockspin_queues();
1569 				if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1570 					vm_page_queues_remove(m, FALSE);
1571 				}
1572 				vm_page_unlock_queues();
1573 			}
1574 			assert(object == VM_PAGE_OBJECT(m));
1575 
1576 			if (object->code_signed) {
1577 				/*
1578 				 * CODE SIGNING:
1579 				 * We just paged in a page from a signed
1580 				 * memory object but we don't need to
1581 				 * validate it now.  We'll validate it if and
1582 				 * when it gets mapped into a user address
1583 				 * space for the first time or when the page
1584 				 * gets copied to another object as a result
1585 				 * of a copy-on-write.
1586 				 */
1587 			}
1588 
1589 			/*
1590 			 * We mark the page busy and leave it on
1591 			 * the pageout queues.  If the pageout
1592 			 * daemon comes across it, then it will
1593 			 * remove the page from the queue, but not the object
1594 			 */
1595 #if TRACEFAULTPAGE
1596 			dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1597 #endif
1598 			assert(!m->vmp_busy);
1599 			assert(!m->vmp_absent);
1600 
1601 			m->vmp_busy = TRUE;
1602 			break;
1603 		}
1604 
1605 		/*
1606 		 * we get here when there is no page present in the object at
1607 		 * the offset we're interested in... we'll allocate a page
1608 		 * at this point if the pager associated with
1609 		 * this object can provide the data or we're the top object...
1610 		 * object is locked;  m == NULL
1611 		 */
1612 
1613 		if (must_be_resident) {
1614 			if (fault_type == VM_PROT_NONE &&
1615 			    is_kernel_object(object)) {
1616 				/*
1617 				 * We've been called from vm_fault_unwire()
1618 				 * while removing a map entry that was allocated
1619 				 * with KMA_KOBJECT and KMA_VAONLY.  This page
1620 				 * is not present and there's nothing more to
1621 				 * do here (nothing to unwire).
1622 				 */
1623 				vm_fault_cleanup(object, first_m);
1624 				thread_interrupt_level(interruptible_state);
1625 
1626 				return VM_FAULT_MEMORY_ERROR;
1627 			}
1628 
1629 			goto dont_look_for_page;
1630 		}
1631 
1632 		/* Don't expect to fault pages into the kernel object. */
1633 		assert(!is_kernel_object(object));
1634 
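		/*
		 * only bother asking the pager for the data if this object
		 * actually has a pager and the compressor/external state
		 * doesn't already tell us the page is absent from backing store
		 */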
1635 		look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE));
1636 
1637 #if TRACEFAULTPAGE
1638 		dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1639 #endif
1640 		if (!look_for_page && object == first_object && !object->phys_contiguous) {
1641 			/*
1642 			 * Allocate a new page for this object/offset pair as a placeholder
1643 			 */
1644 			m = vm_page_grab_options(grab_options);
1645 #if TRACEFAULTPAGE
1646 			dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1647 #endif
1648 			if (m == VM_PAGE_NULL) {
1649 				vm_fault_cleanup(object, first_m);
1650 				thread_interrupt_level(interruptible_state);
1651 
1652 				return VM_FAULT_MEMORY_SHORTAGE;
1653 			}
1654 
1655 			if (fault_info && fault_info->batch_pmap_op == TRUE) {
1656 				vm_page_insert_internal(m, object,
1657 				    vm_object_trunc_page(offset),
1658 				    VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1659 			} else {
1660 				vm_page_insert(m, object, vm_object_trunc_page(offset));
1661 			}
1662 		}
1663 		if (look_for_page) {
1664 			kern_return_t   rc;
1665 			int             my_fault_type;
1666 
1667 			/*
1668 			 *	If the memory manager is not ready, we
1669 			 *	cannot make requests.
1670 			 */
1671 			if (!object->pager_ready) {
1672 #if TRACEFAULTPAGE
1673 				dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1674 #endif
1675 				if (m != VM_PAGE_NULL) {
1676 					VM_PAGE_FREE(m);
1677 				}
1678 
1679 				/*
1680 				 * take an extra ref so object won't die
1681 				 */
1682 				vm_object_reference_locked(object);
1683 				vm_fault_cleanup(object, first_m);
1684 
1685 				vm_object_lock(object);
1686 				assert(os_ref_get_count_raw(&object->ref_count) > 0);
1687 
1688 				if (!object->pager_ready) {
1689 					wait_result = vm_object_sleep(object, VM_OBJECT_EVENT_PAGER_READY, interruptible, LCK_SLEEP_UNLOCK);
1690 					vm_object_deallocate(object);
1691 
1692 					goto backoff;
1693 				} else {
1694 					vm_object_unlock(object);
1695 					vm_object_deallocate(object);
1696 					thread_interrupt_level(interruptible_state);
1697 
1698 					return VM_FAULT_RETRY;
1699 				}
1700 			}
1701 			if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1702 				/*
1703 				 * If there are too many outstanding page
1704 				 * requests pending on this external object, we
1705 				 * wait for them to be resolved now.
1706 				 */
1707 #if TRACEFAULTPAGE
1708 				dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1709 #endif
1710 				if (m != VM_PAGE_NULL) {
1711 					VM_PAGE_FREE(m);
1712 				}
1713 				/*
1714 				 * take an extra ref so object won't die
1715 				 */
1716 				vm_object_reference_locked(object);
1717 
1718 				vm_fault_cleanup(object, first_m);
1719 
1720 				vm_object_lock(object);
1721 				assert(os_ref_get_count_raw(&object->ref_count) > 0);
1722 
1723 				if (object->paging_in_progress >= vm_object_pagein_throttle) {
1724 					wait_result = vm_object_paging_throttle_wait(object, interruptible);
1725 					vm_object_unlock(object);
1726 					vm_object_deallocate(object);
1727 					goto backoff;
1728 				} else {
1729 					vm_object_unlock(object);
1730 					vm_object_deallocate(object);
1731 					thread_interrupt_level(interruptible_state);
1732 
1733 					return VM_FAULT_RETRY;
1734 				}
1735 			}
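			/*
			 * internal (anonymous) objects are backed by the
			 * compressor... decompress the page in place rather
			 * than going through memory_object_data_request
			 */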
1736 			if (object->internal) {
1737 				int compressed_count_delta;
1738 				vm_compressor_options_t c_flags = 0;
1739 
1740 				assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1741 
1742 				if (m == VM_PAGE_NULL) {
1743 					/*
1744 					 * Allocate a new page for this object/offset pair as a placeholder
1745 					 */
1746 					m = vm_page_grab_options(grab_options);
1747 #if TRACEFAULTPAGE
1748 					dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1749 #endif
1750 					if (m == VM_PAGE_NULL) {
1751 						vm_fault_cleanup(object, first_m);
1752 						thread_interrupt_level(interruptible_state);
1753 
1754 						return VM_FAULT_MEMORY_SHORTAGE;
1755 					}
1756 
1757 					m->vmp_absent = TRUE;
1758 					if (fault_info && fault_info->batch_pmap_op == TRUE) {
1759 						vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1760 					} else {
1761 						vm_page_insert(m, object, vm_object_trunc_page(offset));
1762 					}
1763 				}
1764 				assert(m->vmp_busy);
1765 
1766 				m->vmp_absent = TRUE;
1767 				pager = object->pager;
1768 
1769 				assert(object->paging_in_progress > 0);
1770 
1771 				page_worker_token_t pw_token;
1772 #if PAGE_SLEEP_WITH_INHERITOR
1773 				page_worker_register_worker((event_t)m, &pw_token);
1774 #endif /* PAGE_SLEEP_WITH_INHERITOR */
1775 
1776 				vm_object_unlock(object);
1777 #if HAS_MTE
1778 				if (vm_object_is_mte_mappable(object)) {
1779 					c_flags |= C_MTE;
1780 				}
1781 #endif /* HAS_MTE */
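				/*
				 * decompress the data directly into the physical
				 * page we just allocated... the object lock has been
				 * dropped for the duration of the decompression
				 */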
1782 				rc = vm_compressor_pager_get(
1783 					pager,
1784 					offset + object->paging_offset,
1785 					VM_PAGE_GET_PHYS_PAGE(m),
1786 					&my_fault_type,
1787 					c_flags,
1788 					&compressed_count_delta);
1789 
1790 				if (type_of_fault == NULL) {
1791 					int     throttle_delay;
1792 
1793 					/*
1794 					 * we weren't called from vm_fault, so we
1795 					 * need to apply page creation throttling...
1796 					 * do it before we re-acquire any locks
1797 					 */
1798 					if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1799 						if ((throttle_delay = vm_page_throttled(TRUE))) {
1800 							VM_DEBUG_EVENT(vmf_compressordelay, DBG_VM_FAULT_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1801 							__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
1802 						}
1803 					}
1804 				}
1805 				vm_object_lock(object);
1806 				assert(object->paging_in_progress > 0);
1807 
1808 				vm_compressor_pager_count(
1809 					pager,
1810 					compressed_count_delta,
1811 					FALSE, /* shared_lock */
1812 					object);
1813 
1814 				switch (rc) {
1815 				case KERN_SUCCESS:
1816 					m->vmp_absent = FALSE;
1817 					m->vmp_dirty = TRUE;
1818 					if (!HAS_DEFAULT_CACHEABILITY(object->wimg_bits &
1819 					    VM_WIMG_MASK)) {
1820 						/*
1821 						 * If the page is not cacheable,
1822 						 * we can't let its contents
1823 						 * linger in the data cache
1824 						 * after the decompression.
1825 						 */
1826 						pmap_sync_page_attributes_phys(
1827 							VM_PAGE_GET_PHYS_PAGE(m));
1828 					} else {
1829 						m->vmp_written_by_kernel = TRUE;
1830 					}
1831 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
1832 					if ((fault_type & VM_PROT_WRITE) == 0) {
1833 						vm_object_lock_assert_exclusive(object);
1834 						vm_page_lockspin_queues();
1835 						m->vmp_unmodified_ro = true;
1836 						vm_page_unlock_queues();
1837 						os_atomic_inc(&compressor_ro_uncompressed, relaxed);
1838 						*protection &= ~VM_PROT_WRITE;
1839 					}
1840 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
1841 
1842 					/*
1843 					 * If the object is purgeable, its
1844 					 * owner's purgeable ledgers have been
1845 					 * updated in vm_page_insert() but the
1846 					 * page was also accounted for in a
1847 					 * "compressed purgeable" ledger, so
1848 					 * update that now.
1849 					 */
1850 					if (((object->purgable !=
1851 					    VM_PURGABLE_DENY) ||
1852 					    object->vo_ledger_tag) &&
1853 					    (object->vo_owner !=
1854 					    NULL)) {
1855 						/*
1856 						 * One less compressed
1857 						 * purgeable/tagged page.
1858 						 */
1859 						if (compressed_count_delta) {
1860 							vm_object_owner_compressed_update(
1861 								object,
1862 								-1);
1863 						}
1864 					}
1865 
1866 					break;
1867 				case KERN_MEMORY_FAILURE:
1868 					m->vmp_unusual = TRUE;
1869 					m->vmp_error = TRUE;
1870 					m->vmp_absent = FALSE;
1871 					break;
1872 				case KERN_MEMORY_ERROR:
1873 					assert(m->vmp_absent);
1874 					break;
1875 				default:
1876 					panic("vm_fault_page(): unexpected "
1877 					    "error %d from "
1878 					    "vm_compressor_pager_get()\n",
1879 					    rc);
1880 				}
1881 				vm_page_wakeup_done_with_inheritor(object, m, &pw_token);
1882 
1883 				rc = KERN_SUCCESS;
1884 				goto data_requested;
1885 			}
1886 			my_fault_type = DBG_PAGEIN_FAULT;
1887 
1888 			if (m != VM_PAGE_NULL) {
1889 				VM_PAGE_FREE(m);
1890 				m = VM_PAGE_NULL;
1891 			}
1892 
1893 #if TRACEFAULTPAGE
1894 			dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1895 #endif
1896 
1897 			/*
1898 			 * It's possible someone called vm_object_destroy while we weren't
1899 			 * holding the object lock.  If that has happened, then bail out
1900 			 * here.
1901 			 */
1902 
1903 			pager = object->pager;
1904 
1905 			if (pager == MEMORY_OBJECT_NULL) {
1906 				vm_fault_cleanup(object, first_m);
1907 				thread_interrupt_level(interruptible_state);
1908 
1909 				static const enum vm_subsys_error_codes object_destroy_errors[VM_OBJECT_DESTROY_MAX + 1] = {
1910 					[VM_OBJECT_DESTROY_UNKNOWN_REASON] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER,
1911 					[VM_OBJECT_DESTROY_UNMOUNT] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_UNMOUNT,
1912 					[VM_OBJECT_DESTROY_FORCED_UNMOUNT] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_FORCED_UNMOUNT,
1913 					[VM_OBJECT_DESTROY_UNGRAFT] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_UNGRAFT,
1914 					[VM_OBJECT_DESTROY_PAGER] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_DEALLOC_PAGER,
1915 					[VM_OBJECT_DESTROY_RECLAIM] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_RECLAIM,
1916 				};
1917 				enum vm_subsys_error_codes kdbg_code = object_destroy_errors[(vm_object_destroy_reason_t)object->no_pager_reason];
1918 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, kdbg_code), 0 /* arg */);
1919 				return VM_FAULT_MEMORY_ERROR;
1920 			}
1921 
1922 			/*
1923 			 * We have an absent page in place for the faulting offset,
1924 			 * so we can release the object lock.
1925 			 */
1926 
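			/*
			 * take a priority floor while we issue the data request
			 * on behalf of the shared cache / dyld pagers... we drop
			 * it again once we've re-acquired the object lock
			 */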
1927 			if (object->object_is_shared_cache || pager->mo_pager_ops == &dyld_pager_ops) {
1928 				token = thread_priority_floor_start();
1929 				/*
1930 				 * A non-native shared cache object might
1931 				 * be getting set up in parallel with this
1932 				 * fault and so we can't assume that this
1933 				 * check will be valid after we drop the
1934 				 * object lock below.
1935 				 *
1936 				 * FIXME: This should utilize @c page_worker_register_worker()
1937 				 * (rdar://153586539)
1938 				 */
1939 				drop_floor = true;
1940 			}
1941 
1942 			vm_object_unlock(object);
1943 
1944 			/*
1945 			 * If this object uses a copy_call strategy,
1946 			 * and we are interested in a copy of this object
1947 			 * (having gotten here only by following a
1948 			 * shadow chain), then tell the memory manager
1949 			 * via a flag added to the desired_access
1950 			 * parameter, so that it can detect a race
1951 			 * between our walking down the shadow chain
1952 			 * and its pushing pages up into a copy of
1953 			 * the object that it manages.
1954 			 */
1955 			if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1956 				wants_copy_flag = VM_PROT_WANTS_COPY;
1957 			} else {
1958 				wants_copy_flag = VM_PROT_NONE;
1959 			}
1960 
1961 			if (object->vo_copy == first_object) {
1962 				/*
1963 				 * if we issue the memory_object_data_request in
1964 				 * this state, we are subject to a deadlock with
1965 				 * the underlying filesystem if it is trying to
1966 				 * shrink the file resulting in a push of pages
1967 				 * into the copy object...  that push will stall
1968 				 * on the placeholder page, and if the pushing thread
1969 				 * is holding a lock that is required on the pagein
1970 				 * path (such as a truncate lock), we'll deadlock...
1971 				 * to avoid this potential deadlock, we throw away
1972 				 * our placeholder page before calling memory_object_data_request
1973 				 * and force this thread to retry the vm_fault_page after
1974 				 * we have issued the I/O.  the second time through this path
1975 				 * we will find the page already in the cache (presumably still
1976 				 * busy waiting for the I/O to complete) and then complete
1977 				 * the fault w/o having to go through memory_object_data_request again
1978 				 */
1979 				assert(first_m != VM_PAGE_NULL);
1980 				assert(VM_PAGE_OBJECT(first_m) == first_object);
1981 
1982 				vm_object_lock(first_object);
1983 				VM_PAGE_FREE(first_m);
1984 				vm_object_paging_end(first_object);
1985 				vm_object_unlock(first_object);
1986 
1987 				first_m = VM_PAGE_NULL;
1988 				force_fault_retry = TRUE;
1989 
1990 				vm_fault_page_forced_retry++;
1991 			}
1992 
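			/*
			 * if we already issued an I/O for this page on an earlier
			 * pass, don't let the pager do any more read-ahead...
			 * temporarily force a single page, random-access request
			 */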
1993 			if (data_already_requested == TRUE) {
1994 				orig_behavior = fault_info->behavior;
1995 				orig_cluster_size = fault_info->cluster_size;
1996 
1997 				fault_info->behavior = VM_BEHAVIOR_RANDOM;
1998 				fault_info->cluster_size = PAGE_SIZE;
1999 			}
2000 			/*
2001 			 * Call the memory manager to retrieve the data.
2002 			 */
2003 			rc = memory_object_data_request(
2004 				pager,
2005 				vm_object_trunc_page(offset) + object->paging_offset,
2006 				PAGE_SIZE,
2007 				access_required | wants_copy_flag,
2008 				(memory_object_fault_info_t)fault_info);
2009 
2010 			if (data_already_requested == TRUE) {
2011 				fault_info->behavior = orig_behavior;
2012 				fault_info->cluster_size = orig_cluster_size;
2013 			} else {
2014 				data_already_requested = TRUE;
2015 			}
2016 
2017 			DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2018 #if TRACEFAULTPAGE
2019 			dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
2020 #endif
2021 			vm_object_lock(object);
2022 
2023 			if (drop_floor) {
2024 				thread_priority_floor_end(&token);
2025 				drop_floor = false;
2026 			}
2027 
2028 data_requested:
2029 			if (rc != ERR_SUCCESS) {
2030 				vm_fault_cleanup(object, first_m);
2031 				thread_interrupt_level(interruptible_state);
2032 
2033 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NO_DATA), 0 /* arg */);
2034 
2035 				if (rc == MACH_SEND_INTERRUPTED) {
2036 					return VM_FAULT_INTERRUPTED;
2037 				} else if (rc == KERN_ALREADY_WAITING) {
2038 					return VM_FAULT_BUSY;
2039 				} else {
2040 					return VM_FAULT_MEMORY_ERROR;
2041 				}
2042 			} else {
2043 				clock_sec_t     tv_sec;
2044 				clock_usec_t    tv_usec;
2045 
2046 				if (my_fault_type == DBG_PAGEIN_FAULT) {
2047 					clock_get_system_microtime(&tv_sec, &tv_usec);
2048 					current_thread()->t_page_creation_time = tv_sec;
2049 					current_thread()->t_page_creation_count = 0;
2050 				}
2051 			}
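			/*
			 * if the fault is interruptible and this thread has been
			 * asked to abort, give up now rather than waiting for
			 * the pager to supply the data
			 */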
2052 			if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
2053 				vm_fault_cleanup(object, first_m);
2054 				thread_interrupt_level(interruptible_state);
2055 
2056 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
2057 				return VM_FAULT_INTERRUPTED;
2058 			}
2059 			if (force_fault_retry == TRUE) {
2060 				vm_fault_cleanup(object, first_m);
2061 				thread_interrupt_level(interruptible_state);
2062 
2063 				return VM_FAULT_RETRY;
2064 			}
2065 			if (m == VM_PAGE_NULL && object->phys_contiguous) {
2066 				/*
2067 				 * No page here means that the object we
2068 				 * initially looked up was "physically
2069 				 * contiguous" (i.e. device memory).  However,
2070 				 * with Virtual VRAM, the object might not
2071 				 * be backed by that device memory anymore,
2072 				 * so we're done here only if the object is
2073 				 * still "phys_contiguous".
2074 				 * Otherwise, if the object is no longer
2075 				 * "phys_contiguous", we need to retry the
2076 				 * page fault against the object's new backing
2077 				 * store (different memory object).
2078 				 */
2079 phys_contig_object:
2080 				assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
2081 				assert(object == first_object);
2082 				goto done;
2083 			}
2084 			/*
2085 			 * potentially a pagein fault
2086 			 * if we make it through the state checks
2087 			 * above, then we'll count it as such
2088 			 */
2089 			my_fault = my_fault_type;
2090 
2091 			/*
2092 			 * Retry with same object/offset, since new data may
2093 			 * be in a different page (i.e., m is meaningless at
2094 			 * this point).
2095 			 */
2096 			continue;
2097 		}
2098 dont_look_for_page:
2099 		/*
2100 		 * We get here if the object has no pager, or an existence map
2101 		 * exists and indicates the page isn't present on the pager
2102 		 * or we're unwiring a page.  If a pager exists, but there
2103 		 * is no existence map, then the m->vmp_absent case above handles
2104 		 * the ZF case when the pager can't provide the page
2105 		 */
2106 #if TRACEFAULTPAGE
2107 		dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
2108 #endif
2109 		if (object == first_object) {
2110 			first_m = m;
2111 		} else {
2112 			assert(m == VM_PAGE_NULL);
2113 		}
2114 
2115 		next_object = object->shadow;
2116 
2117 		if (next_object == VM_OBJECT_NULL) {
2118 			/*
2119 			 * we've hit the bottom of the shadow chain,
2120 			 * fill the page in the top object with zeros.
2121 			 */
2122 			assert(!must_be_resident);
2123 
2124 			if (object != first_object) {
2125 				vm_object_paging_end(object);
2126 				vm_object_unlock(object);
2127 
2128 				object = first_object;
2129 				offset = first_offset;
2130 				vm_object_lock(object);
2131 			}
2132 			m = first_m;
2133 			assert(VM_PAGE_OBJECT(m) == object);
2134 			first_m = VM_PAGE_NULL;
2135 
2136 			/*
2137 			 * check for any conditions that prevent
2138 			 * us from creating a new zero-fill page...
2139 			 * vm_fault_check will do all of the
2140 			 * fault cleanup in the case of an error condition,
2141 			 * including resetting the thread_interrupt_level
2142 			 */
2143 			error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
2144 
2145 			if (error != VM_FAULT_SUCCESS) {
2146 				return error;
2147 			}
2148 
2149 			if (m == VM_PAGE_NULL) {
2150 				m = vm_page_grab_options(grab_options);
2151 
2152 				if (m == VM_PAGE_NULL) {
2153 					vm_fault_cleanup(object, VM_PAGE_NULL);
2154 					thread_interrupt_level(interruptible_state);
2155 
2156 					return VM_FAULT_MEMORY_SHORTAGE;
2157 				}
2158 				vm_page_insert(m, object, vm_object_trunc_page(offset));
2159 			}
2160 			if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
2161 				m->vmp_absent = TRUE;
2162 				clear_absent_on_error = true;
2163 			}
2164 
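			/*
			 * zero-fill the page... vm_fault_zero_page also places it
			 * on the appropriate paging queue and returns the fault
			 * type to report
			 */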
2165 			my_fault = vm_fault_zero_page(m, no_zero_fill);
2166 
2167 			break;
2168 		} else {
2169 			/*
2170 			 * Move on to the next object.  Lock the next
2171 			 * object before unlocking the current one.
2172 			 */
2173 			if ((object != first_object) || must_be_resident) {
2174 				vm_object_paging_end(object);
2175 			}
2176 
2177 			offset += object->vo_shadow_offset;
2178 			fault_info->lo_offset += object->vo_shadow_offset;
2179 			fault_info->hi_offset += object->vo_shadow_offset;
2180 			access_required = VM_PROT_READ;
2181 
2182 			vm_object_lock(next_object);
2183 			vm_object_unlock(object);
2184 
2185 			object = next_object;
2186 			vm_object_paging_begin(object);
2187 		}
2188 	}
2189 
2190 	/*
2191 	 *	PAGE HAS BEEN FOUND.
2192 	 *
2193 	 *	This page (m) is:
2194 	 *		busy, so that we can play with it;
2195 	 *		not absent, so that nobody else will fill it;
2196 	 *		possibly eligible for pageout;
2197 	 *
2198 	 *	The top-level page (first_m) is:
2199 	 *		VM_PAGE_NULL if the page was found in the
2200 	 *		 top-level object;
2201 	 *		busy, not absent, and ineligible for pageout.
2202 	 *
2203 	 *	The current object (object) is locked.  A paging
2204 	 *	reference is held for the current and top-level
2205 	 *	objects.
2206 	 */
2207 
2208 #if TRACEFAULTPAGE
2209 	dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
2210 #endif
2211 #if     EXTRA_ASSERTIONS
2212 	assert(m->vmp_busy && !m->vmp_absent);
2213 	assert((first_m == VM_PAGE_NULL) ||
2214 	    (first_m->vmp_busy && !first_m->vmp_absent &&
2215 	    !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
2216 #endif  /* EXTRA_ASSERTIONS */
2217 
2218 	/*
2219 	 * If the page is being written, but isn't
2220 	 * already owned by the top-level object,
2221 	 * we have to copy it into a new page owned
2222 	 * by the top-level object.
2223 	 */
2224 	if (object != first_object) {
2225 #if TRACEFAULTPAGE
2226 		dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2227 #endif
2228 		if (fault_type & VM_PROT_WRITE) {
2229 			vm_page_t copy_m;
2230 
2231 			/*
2232 			 * We only really need to copy if we
2233 			 * want to write it.
2234 			 */
2235 			assert(!must_be_resident);
2236 
2237 			/*
2238 			 * If we try to collapse first_object at this
2239 			 * point, we may deadlock when we try to get
2240 			 * the lock on an intermediate object (since we
2241 			 * have the bottom object locked).  We can't
2242 			 * unlock the bottom object, because the page
2243 			 * we found may move (by collapse) if we do.
2244 			 *
2245 			 * Instead, we first copy the page.  Then, when
2246 			 * we have no more use for the bottom object,
2247 			 * we unlock it and try to collapse.
2248 			 *
2249 			 * Note that we copy the page even if we didn't
2250 			 * need to... that's the breaks.
2251 			 */
2252 
2253 			/*
2254 			 * Allocate a page for the copy
2255 			 */
2256 			copy_m = vm_page_grab_options(grab_options);
2257 
2258 			if (copy_m == VM_PAGE_NULL) {
2259 				vm_fault_page_release_page(m, &clear_absent_on_error);
2260 
2261 				vm_fault_cleanup(object, first_m);
2262 				thread_interrupt_level(interruptible_state);
2263 
2264 				return VM_FAULT_MEMORY_SHORTAGE;
2265 			}
2266 
2267 			vm_page_copy(m, copy_m);
2268 
2269 			/*
2270 			 * If another map is truly sharing this
2271 			 * page with us, we have to flush all
2272 			 * uses of the original page, since we
2273 			 * can't distinguish those which want the
2274 			 * original from those which need the
2275 			 * new copy.
2276 			 *
2277 			 * XXXO If we know that only one map has
2278 			 * access to this page, then we could
2279 			 * avoid the pmap_disconnect() call.
2280 			 */
2281 			if (m->vmp_pmapped) {
2282 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2283 			}
2284 
2285 			if (m->vmp_clustered) {
2286 				VM_PAGE_COUNT_AS_PAGEIN(m);
2287 				VM_PAGE_CONSUME_CLUSTERED(m);
2288 			}
2289 			assert(!m->vmp_cleaning);
2290 
2291 			/*
2292 			 * We no longer need the old page or object.
2293 			 */
2294 			vm_fault_page_release_page(m, &clear_absent_on_error);
2295 
2296 			/*
2297 			 * This check helps with marking the object as having a sequential pattern
2298 			 * Normally we'll miss doing this below because this fault is about COW to
2299 			 * the first_object i.e. bring page in from disk, push to object above but
2300 			 * don't update the file object's sequential pattern.
2301 			 */
2302 			if (object->internal == FALSE) {
2303 				vm_fault_is_sequential(object, offset, fault_info->behavior);
2304 			}
2305 
2306 			vm_object_paging_end(object);
2307 			vm_object_unlock(object);
2308 
2309 			my_fault = DBG_COW_FAULT;
2310 			counter_inc(&vm_statistics_cow_faults);
2311 			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2312 			counter_inc(&current_task()->cow_faults);
2313 
2314 			object = first_object;
2315 			offset = first_offset;
2316 
2317 			vm_object_lock(object);
2318 			/*
2319 			 * get rid of the place holder
2320 			 * page that we soldered in earlier
2321 			 */
2322 			VM_PAGE_FREE(first_m);
2323 			first_m = VM_PAGE_NULL;
2324 
2325 			/*
2326 			 * and replace it with the
2327 			 * page we just copied into
2328 			 */
2329 			assert(copy_m->vmp_busy);
2330 			vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
2331 			SET_PAGE_DIRTY(copy_m, TRUE);
2332 
2333 			m = copy_m;
2334 			/*
2335 			 * Now that we've gotten the copy out of the
2336 			 * way, let's try to collapse the top object.
2337 			 * But we have to play ugly games with
2338 			 * paging_in_progress to do that...
2339 			 */
2340 			vm_object_paging_end(object);
2341 			vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2342 			vm_object_paging_begin(object);
2343 		} else {
2344 			*protection &= (~VM_PROT_WRITE);
2345 		}
2346 	}
2347 	/*
2348 	 * Now check whether the page needs to be pushed into the
2349 	 * copy object.  The use of asymmetric copy on write for
2350 	 * shared temporary objects means that we may do two copies to
2351 	 * satisfy the fault; one above to get the page from a
2352 	 * shadowed object, and one here to push it into the copy.
2353 	 */
2354 	try_failed_count = 0;
2355 
2356 	while ((copy_object = first_object->vo_copy) != VM_OBJECT_NULL) {
2357 		vm_object_offset_t      copy_offset;
2358 		vm_page_t               copy_m;
2359 
2360 #if TRACEFAULTPAGE
2361 		dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2362 #endif
2363 		/*
2364 		 * If the page is being written, but hasn't been
2365 		 * copied to the copy-object, we have to copy it there.
2366 		 */
2367 		if ((fault_type & VM_PROT_WRITE) == 0) {
2368 			*protection &= ~VM_PROT_WRITE;
2369 			break;
2370 		}
2371 
2372 		/*
2373 		 * If the page was guaranteed to be resident,
2374 		 * we must have already performed the copy.
2375 		 */
2376 		if (must_be_resident) {
2377 			break;
2378 		}
2379 
2380 		/*
2381 		 * Try to get the lock on the copy_object.
2382 		 */
2383 		if (!vm_object_lock_try(copy_object)) {
2384 			vm_object_unlock(object);
2385 			try_failed_count++;
2386 
2387 			mutex_pause(try_failed_count);  /* wait a bit */
2388 			vm_object_lock(object);
2389 
2390 			continue;
2391 		}
2392 		try_failed_count = 0;
2393 
2394 		/*
2395 		 * Make another reference to the copy-object,
2396 		 * to keep it from disappearing during the
2397 		 * copy.
2398 		 */
2399 		vm_object_reference_locked(copy_object);
2400 
2401 		/*
2402 		 * Does the page exist in the copy?
2403 		 */
2404 		copy_offset = first_offset - copy_object->vo_shadow_offset;
2405 		copy_offset = vm_object_trunc_page(copy_offset);
2406 
2407 		if (copy_object->vo_size <= copy_offset) {
2408 			/*
2409 			 * Copy object doesn't cover this page -- do nothing.
2410 			 */
2411 			;
2412 		} else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2413 			/*
2414 			 * Page currently exists in the copy object
2415 			 */
2416 			if (copy_m->vmp_busy) {
2417 				/*
2418 				 * If the page is being brought
2419 				 * in, wait for it and then retry.
2420 				 */
2421 				vm_fault_page_release_page(m, &clear_absent_on_error);
2422 
2423 				/*
2424 				 * take an extra ref so object won't die
2425 				 */
2426 				vm_object_reference_locked(copy_object);
2427 				vm_object_unlock(copy_object);
2428 				vm_fault_cleanup(object, first_m);
2429 
2430 				vm_object_lock(copy_object);
2431 				vm_object_lock_assert_exclusive(copy_object);
2432 				os_ref_release_live_locked_raw(&copy_object->ref_count,
2433 				    &vm_object_refgrp);
2434 				copy_m = vm_page_lookup(copy_object, copy_offset);
2435 
2436 				if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2437 					wait_result = vm_page_sleep(copy_object, copy_m, interruptible, LCK_SLEEP_UNLOCK);
2438 					vm_object_deallocate(copy_object);
2439 
2440 					goto backoff;
2441 				} else {
2442 					vm_object_unlock(copy_object);
2443 					vm_object_deallocate(copy_object);
2444 					thread_interrupt_level(interruptible_state);
2445 
2446 					return VM_FAULT_RETRY;
2447 				}
2448 			}
2449 		} else if (!PAGED_OUT(copy_object, copy_offset)) {
2450 			/*
2451 			 * If PAGED_OUT is TRUE, then the page used to exist
2452 			 * in the copy-object, and has already been paged out.
2453 			 * We don't need to repeat this. If PAGED_OUT is
2454 			 * FALSE, then either we don't know (!pager_created,
2455 			 * for example) or it hasn't been paged out.
2456 			 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2457 			 * We must copy the page to the copy object.
2458 			 *
2459 			 * Allocate a page for the copy
2460 			 */
2461 			copy_m = vm_page_grab_options(grab_options);
2462 
2463 			if (copy_m == VM_PAGE_NULL) {
2464 				vm_fault_page_release_page(m, &clear_absent_on_error);
2465 
2466 				vm_object_lock_assert_exclusive(copy_object);
2467 				os_ref_release_live_locked_raw(&copy_object->ref_count,
2468 				    &vm_object_refgrp);
2469 
2470 				vm_object_unlock(copy_object);
2471 				vm_fault_cleanup(object, first_m);
2472 				thread_interrupt_level(interruptible_state);
2473 
2474 				return VM_FAULT_MEMORY_SHORTAGE;
2475 			}
2476 
2477 			/*
2478 			 * Must copy page into copy-object.
2479 			 */
2480 			vm_page_insert(copy_m, copy_object, copy_offset);
2481 			vm_page_copy(m, copy_m);
2482 
2483 			/*
2484 			 * If the old page was in use by any users
2485 			 * of the copy-object, it must be removed
2486 			 * from all pmaps.  (We can't know which
2487 			 * pmaps use it.)
2488 			 */
2489 			if (m->vmp_pmapped) {
2490 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2491 			}
2492 
2493 			if (m->vmp_clustered) {
2494 				VM_PAGE_COUNT_AS_PAGEIN(m);
2495 				VM_PAGE_CONSUME_CLUSTERED(m);
2496 			}
2497 			/*
2498 			 * If there's a pager, then immediately
2499 			 * page out this page, using the "initialize"
2500 			 * option.  Else, we use the copy.
2501 			 */
2502 			if ((!copy_object->pager_ready)
2503 			    || vm_object_compressor_pager_state_get(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2504 			    ) {
2505 				vm_page_lockspin_queues();
2506 				assert(!m->vmp_cleaning);
2507 				vm_page_activate(copy_m);
2508 				vm_page_unlock_queues();
2509 
2510 				SET_PAGE_DIRTY(copy_m, TRUE);
2511 				vm_page_wakeup_done(copy_object, copy_m);
2512 			} else {
2513 				assert(copy_m->vmp_busy == TRUE);
2514 				assert(!m->vmp_cleaning);
2515 
2516 				/*
2517 				 * dirty is protected by the object lock
2518 				 */
2519 				SET_PAGE_DIRTY(copy_m, TRUE);
2520 
2521 				/*
2522 				 * The page is already ready for pageout:
2523 				 * not on pageout queues and busy.
2524 				 * Unlock everything except the
2525 				 * copy_object itself.
2526 				 */
2527 				vm_object_unlock(object);
2528 
2529 				/*
2530 				 * Write the page to the copy-object,
2531 				 * flushing it from the kernel.
2532 				 */
2533 				vm_pageout_initialize_page(copy_m);
2534 
2535 				/*
2536 				 * Since the pageout may have
2537 				 * temporarily dropped the
2538 				 * copy_object's lock, we
2539 				 * check whether we'll have
2540 				 * to deallocate the hard way.
2541 				 */
2542 				if ((copy_object->shadow != object) ||
2543 				    (os_ref_get_count_raw(&copy_object->ref_count) == 1)) {
2544 					vm_object_unlock(copy_object);
2545 					vm_object_deallocate(copy_object);
2546 					vm_object_lock(object);
2547 
2548 					continue;
2549 				}
2550 				/*
2551 				 * Pick back up the old object's
2552 				 * lock.  [It is safe to do so,
2553 				 * since it must be deeper in the
2554 				 * object tree.]
2555 				 */
2556 				vm_object_lock(object);
2557 			}
2558 
2559 			/*
2560 			 * Because we're pushing a page upward
2561 			 * in the object tree, we must restart
2562 			 * any faults that are waiting here.
2563 			 * [Note that this is an expansion of
2564 			 * vm_page_wakeup() that uses the THREAD_RESTART
2565 			 * wait result].  Can't turn off the page's
2566 			 * busy bit because we're not done with it.
2567 			 */
2568 			if (m->vmp_wanted) {
2569 				m->vmp_wanted = FALSE;
2570 				thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2571 			}
2572 		}
2573 		/*
2574 		 * The reference count on copy_object must be
2575 		 * at least 2: one for our extra reference,
2576 		 * and at least one from the outside world
2577 		 * (we checked that when we last locked
2578 		 * copy_object).
2579 		 */
2580 		vm_object_lock_assert_exclusive(copy_object);
2581 		os_ref_release_live_locked_raw(&copy_object->ref_count,
2582 		    &vm_object_refgrp);
2583 
2584 		vm_object_unlock(copy_object);
2585 
2586 		break;
2587 	}
2588 
2589 done:
2590 	*result_page = m;
2591 	*top_page = first_m;
2592 
2593 	if (m != VM_PAGE_NULL) {
2594 		assert(VM_PAGE_OBJECT(m) == object);
2595 
2596 		retval = VM_FAULT_SUCCESS;
2597 
2598 		if (my_fault == DBG_PAGEIN_FAULT) {
2599 			VM_PAGE_COUNT_AS_PAGEIN(m);
2600 
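			/*
			 * refine the generic pagein fault type... internal
			 * (anonymous) objects vs external (vnode backed) objects
			 * are accounted separately for tracing
			 */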
2601 			if (object->internal) {
2602 				my_fault = DBG_PAGEIND_FAULT;
2603 			} else {
2604 				my_fault = DBG_PAGEINV_FAULT;
2605 			}
2606 
2607 			/*
2608 			 * evaluate access pattern and update state
2609 			 * vm_fault_deactivate_behind depends on the
2610 			 * state being up to date
2611 			 */
2612 			vm_fault_is_sequential(object, offset, fault_info->behavior);
2613 			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2614 		} else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2615 			/*
2616 			 * we weren't called from vm_fault, so handle the
2617 			 * accounting here for hits in the cache
2618 			 */
2619 			if (m->vmp_clustered) {
2620 				VM_PAGE_COUNT_AS_PAGEIN(m);
2621 				VM_PAGE_CONSUME_CLUSTERED(m);
2622 			}
2623 			vm_fault_is_sequential(object, offset, fault_info->behavior);
2624 			vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2625 		} else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2626 			VM_STAT_DECOMPRESSIONS();
2627 		}
2628 		if (type_of_fault) {
2629 			*type_of_fault = my_fault;
2630 		}
2631 	} else {
2632 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUCCESS_NO_PAGE), 0 /* arg */);
2633 		retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2634 		assert(first_m == VM_PAGE_NULL);
2635 		assert(object == first_object);
2636 	}
2637 
2638 	thread_interrupt_level(interruptible_state);
2639 
2640 #if TRACEFAULTPAGE
2641 	dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2642 #endif
2643 	return retval;
2644 
2645 backoff:
2646 	thread_interrupt_level(interruptible_state);
2647 
2648 	if (wait_result == THREAD_INTERRUPTED) {
2649 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
2650 		return VM_FAULT_INTERRUPTED;
2651 	}
2652 	return VM_FAULT_RETRY;
2653 }
2654 
2655 #if MACH_ASSERT && (XNU_PLATFORM_WatchOS || __x86_64__)
2656 #define PANIC_ON_CS_KILLED_DEFAULT true
2657 #else
2658 #define PANIC_ON_CS_KILLED_DEFAULT false
2659 #endif
2660 static TUNABLE(bool, panic_on_cs_killed, "panic_on_cs_killed",
2661     PANIC_ON_CS_KILLED_DEFAULT);
2662 
2663 extern int proc_selfpid(void);
2664 extern char *proc_name_address(struct proc *p);
2665 extern const char *proc_best_name(struct proc *);
2666 unsigned long cs_enter_tainted_rejected = 0;
2667 unsigned long cs_enter_tainted_accepted = 0;
2668 
2669 /*
2670  * CODE SIGNING:
2671  * When soft faulting a page, we have to validate the page if:
2672  * 1. the page is being mapped in user space
2673  * 2. the page hasn't already been found to be "tainted"
2674  * 3. the page belongs to a code-signed object
2675  * 4. the page has not been validated yet or has been mapped for write.
2676  */
2677 static bool
2678 vm_fault_cs_need_validation(
2679 	pmap_t pmap,
2680 	vm_page_t page,
2681 	vm_object_t page_obj,
2682 	vm_map_size_t fault_page_size,
2683 	vm_map_offset_t fault_phys_offset)
2684 {
2685 	if (pmap == kernel_pmap) {
2686 		/* 1 - not user space */
2687 		return false;
2688 	}
2689 	if (!page_obj->code_signed) {
2690 		/* 3 - page does not belong to a code-signed object */
2691 		return false;
2692 	}
2693 	if (fault_page_size == PAGE_SIZE) {
2694 		/* looking at the whole page */
2695 		assertf(fault_phys_offset == 0,
2696 		    "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2697 		    (uint64_t)fault_page_size,
2698 		    (uint64_t)fault_phys_offset);
2699 		if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2700 			/* 2 - page is all tainted */
2701 			return false;
2702 		}
2703 		if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2704 		    !page->vmp_wpmapped) {
2705 			/* 4 - already fully validated and never mapped writable */
2706 			return false;
2707 		}
2708 	} else {
2709 		/* looking at a specific sub-page */
2710 		if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
2711 			/* 2 - sub-page was already marked as tainted */
2712 			return false;
2713 		}
2714 		if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
2715 		    !page->vmp_wpmapped) {
2716 			/* 4 - already validated and never mapped writable */
2717 			return false;
2718 		}
2719 	}
2720 	/* page needs to be validated */
2721 	return true;
2722 }
2723 
2724 
2725 static bool
2726 vm_fault_cs_page_immutable(
2727 	vm_page_t m,
2728 	vm_map_size_t fault_page_size,
2729 	vm_map_offset_t fault_phys_offset,
2730 	vm_prot_t prot __unused)
2731 {
2732 	if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
2733 	    /*&& ((prot) & VM_PROT_EXECUTE)*/) {
2734 		return true;
2735 	}
2736 	return false;
2737 }
2738 
2739 static bool
2740 vm_fault_cs_page_nx(
2741 	vm_page_t m,
2742 	vm_map_size_t fault_page_size,
2743 	vm_map_offset_t fault_phys_offset)
2744 {
2745 	return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2746 }
2747 
2748 /*
2749  * Check if the page being entered into the pmap violates code signing.
2750  */
2751 static kern_return_t
2752 vm_fault_cs_check_violation(
2753 	bool cs_bypass,
2754 	vm_object_t object,
2755 	vm_page_t m,
2756 	pmap_t pmap,
2757 	vm_prot_t prot,
2758 	vm_prot_t caller_prot,
2759 	vm_map_size_t fault_page_size,
2760 	vm_map_offset_t fault_phys_offset,
2761 	vm_object_fault_info_t fault_info,
2762 	bool map_is_switched,
2763 	bool map_is_switch_protected,
2764 	bool *cs_violation)
2765 {
2766 #if !CODE_SIGNING_MONITOR
2767 #pragma unused(caller_prot)
2768 #pragma unused(fault_info)
2769 #endif /* !CODE_SIGNING_MONITOR */
2770 
2771 	int             cs_enforcement_enabled;
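	/*
	 * validate the page's code signature now if this mapping requires it,
	 * i.e. a user space mapping of a code-signed object whose page hasn't
	 * been fully validated yet or has been mapped writable since validation
	 */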
2772 	if (!cs_bypass &&
2773 	    vm_fault_cs_need_validation(pmap, m, object,
2774 	    fault_page_size, fault_phys_offset)) {
2775 		vm_object_lock_assert_exclusive(object);
2776 
2777 		if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
2778 			vm_cs_revalidates++;
2779 		}
2780 
2781 		/* VM map is locked, so 1 ref will remain on VM object -
2782 		 * so no harm if vm_page_validate_cs drops the object lock */
2783 
2784 #if CODE_SIGNING_MONITOR
2785 		if (fault_info->csm_associated &&
2786 		    csm_enabled() &&
2787 		    !VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2788 		    !VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) &&
2789 		    !VMP_CS_NX(m, fault_page_size, fault_phys_offset) &&
2790 		    (prot & VM_PROT_EXECUTE) &&
2791 		    (caller_prot & VM_PROT_EXECUTE)) {
2792 			/*
2793 			 * When we have a code signing monitor, the monitor will evaluate the code signature
2794 			 * for any executable page mapping. No need for the VM to also validate the page.
2795 			 * In the code signing monitor we trust :)
2796 			 */
2797 			vm_cs_defer_to_csm++;
2798 		} else {
2799 			vm_cs_defer_to_csm_not++;
2800 			vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2801 		}
2802 #else /* CODE_SIGNING_MONITOR */
2803 		vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2804 #endif /* CODE_SIGNING_MONITOR */
2805 	}
2806 
2807 	/* If the map is switched, and is switch-protected, we must protect
2808 	 * some pages from being write-faulted: immutable pages because by
2809 	 * definition they may not be written, and executable pages because that
2810 	 * would provide a way to inject unsigned code.
2811 	 * If the page is immutable, we can simply return. However, we can't
2812 	 * immediately determine whether a page is executable anywhere. But,
2813 	 * we can disconnect it everywhere and remove the executable protection
2814 	 * from the current map. We do that below right before we do the
2815 	 * PMAP_ENTER.
2816 	 */
2817 	if (pmap == kernel_pmap) {
2818 		/* kernel fault: cs_enforcement does not apply */
2819 		cs_enforcement_enabled = 0;
2820 	} else {
2821 		cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
2822 	}
2823 
2824 	if (cs_enforcement_enabled && map_is_switched &&
2825 	    map_is_switch_protected &&
2826 	    vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2827 	    (prot & VM_PROT_WRITE)) {
2828 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_IMMUTABLE_PAGE_WRITE), 0 /* arg */);
2829 		return KERN_CODESIGN_ERROR;
2830 	}
2831 
2832 	if (cs_enforcement_enabled &&
2833 	    vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
2834 	    (prot & VM_PROT_EXECUTE)) {
2835 		if (cs_debug) {
2836 			printf("page marked to be NX, not letting it be mapped EXEC\n");
2837 		}
2838 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_NX_PAGE_EXEC_MAPPING), 0 /* arg */);
2839 		return KERN_CODESIGN_ERROR;
2840 	}
2841 
2842 	/* A page could be tainted, or pose a risk of being tainted later.
2843 	 * Check whether the receiving process wants it, and make it feel
2844 	 * the consequences (that happens in cs_invalid_page()).
2845 	 * For CS Enforcement, two other conditions will
2846 	 * cause that page to be tainted as well:
2847 	 * - pmapping an unsigned page executable - this means unsigned code;
2848 	 * - writeable mapping of a validated page - the content of that page
2849 	 *   can be changed without the kernel noticing, therefore unsigned
2850 	 *   code can be created
2851 	 */
2852 	if (cs_bypass) {
2853 		/* code-signing is bypassed */
2854 		*cs_violation = FALSE;
2855 	} else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2856 		/* tainted page */
2857 		*cs_violation = TRUE;
2858 	} else if (!cs_enforcement_enabled) {
2859 		/* no further code-signing enforcement */
2860 		*cs_violation = FALSE;
2861 	} else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2862 	    ((prot & VM_PROT_WRITE) ||
2863 	    m->vmp_wpmapped)) {
2864 		/*
2865 		 * The page should be immutable, but is in danger of being
2866 		 * modified.
2867 		 * This is the case where we want policy from the code
2868 		 * directory - is the page immutable or not? For now we have
2869 		 * to assume that code pages will be immutable, data pages not.
2870 		 * We'll assume a page is a code page if it has a code directory
2871 		 * and we fault for execution.
2872 		 * That is good enough since if we faulted the code page for
2873 		 * writing in another map before, it is wpmapped; if we fault
2874 		 * it for writing in this map later it will also be faulted for
2875 		 * executing at the same time; and if we fault for writing in
2876 		 * another map later, we will disconnect it from this pmap so
2877 		 * we'll notice the change.
2878 		 */
2879 		*cs_violation = TRUE;
2880 	} else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2881 	    (prot & VM_PROT_EXECUTE)
2882 #if CODE_SIGNING_MONITOR
2883 	    /*
2884 	     * Executable pages will be validated by the code signing monitor. If the
2885 	     * code signing monitor is turned off, then this is a code-signing violation.
2886 	     */
2887 	    && !csm_enabled()
2888 #endif /* CODE_SIGNING_MONITOR */
2889 	    ) {
2890 		*cs_violation = TRUE;
2891 	} else {
2892 		*cs_violation = FALSE;
2893 	}
2894 	return KERN_SUCCESS;
2895 }
2896 
2897 /*
2898  * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2899  * @param must_disconnect This value will be set to true if the caller must disconnect
2900  * this page.
2901  * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2902  */
2903 static kern_return_t
2904 vm_fault_cs_handle_violation(
2905 	vm_object_t object,
2906 	vm_page_t m,
2907 	pmap_t pmap,
2908 	vm_prot_t prot,
2909 	vm_map_offset_t vaddr,
2910 	vm_map_size_t fault_page_size,
2911 	vm_map_offset_t fault_phys_offset,
2912 	bool map_is_switched,
2913 	bool map_is_switch_protected,
2914 	bool *must_disconnect)
2915 {
2916 #if !MACH_ASSERT
2917 #pragma unused(pmap)
2918 #pragma unused(map_is_switch_protected)
2919 #endif /* !MACH_ASSERT */
2920 	/*
2921 	 * We will have a tainted page. Have to handle the special case
2922 	 * of a switched map now. If the map is not switched, standard
2923 	 * procedure applies - call cs_invalid_page().
2924 	 * If the map is switched, the real owner is invalid already.
2925 	 * There is no point in invalidating the switching process since
2926 	 * it will not be executing from the map. So we don't call
2927 	 * cs_invalid_page() in that case.
2928 	 */
2929 	boolean_t reject_page, cs_killed;
2930 	kern_return_t kr;
2931 	if (map_is_switched) {
2932 		assert(pmap == vm_map_pmap(current_thread()->map));
2933 		assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2934 		reject_page = FALSE;
2935 	} else {
2936 		if (cs_debug > 5) {
2937 			printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2938 			    object->code_signed ? "yes" : "no",
2939 			    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2940 			    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2941 			    m->vmp_wpmapped ? "yes" : "no",
2942 			    (int)prot);
2943 		}
2944 		reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2945 	}
2946 
2947 	if (reject_page) {
2948 		/* reject the invalid page: abort the page fault */
2949 		int                     pid;
2950 		const char              *procname;
2951 		task_t                  task;
2952 		vm_object_t             file_object, shadow;
2953 		vm_object_offset_t      file_offset;
2954 		char                    *pathname, *filename;
2955 		vm_size_t               pathname_len, filename_len;
2956 		boolean_t               truncated_path;
2957 #define __PATH_MAX 1024
2958 		struct timespec         mtime, cs_mtime;
2959 		int                     shadow_depth;
2960 		os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2961 
2962 		kr = KERN_CODESIGN_ERROR;
2963 		cs_enter_tainted_rejected++;
2964 
2965 		/* get process name and pid */
2966 		procname = "?";
2967 		task = current_task();
2968 		pid = proc_selfpid();
2969 		if (get_bsdtask_info(task) != NULL) {
2970 			procname = proc_name_address(get_bsdtask_info(task));
2971 		}
2972 
2973 		/* get file's VM object */
2974 		file_object = object;
2975 		file_offset = m->vmp_offset;
2976 		for (shadow = file_object->shadow,
2977 		    shadow_depth = 0;
2978 		    shadow != VM_OBJECT_NULL;
2979 		    shadow = file_object->shadow,
2980 		    shadow_depth++) {
2981 			vm_object_lock_shared(shadow);
2982 			if (file_object != object) {
2983 				vm_object_unlock(file_object);
2984 			}
2985 			file_offset += file_object->vo_shadow_offset;
2986 			file_object = shadow;
2987 		}
2988 
2989 		mtime.tv_sec = 0;
2990 		mtime.tv_nsec = 0;
2991 		cs_mtime.tv_sec = 0;
2992 		cs_mtime.tv_nsec = 0;
2993 
2994 		/* get file's pathname and/or filename */
2995 		pathname = NULL;
2996 		filename = NULL;
2997 		pathname_len = 0;
2998 		filename_len = 0;
2999 		truncated_path = FALSE;
3000 		/* no pager -> no file -> no pathname, use "<nil>" in that case */
3001 		if (file_object->pager != NULL) {
3002 			pathname = kalloc_data(__PATH_MAX * 2, Z_WAITOK);
3003 			if (pathname) {
3004 				pathname[0] = '\0';
3005 				pathname_len = __PATH_MAX;
3006 				filename = pathname + pathname_len;
3007 				filename_len = __PATH_MAX;
3008 
3009 				if (vnode_pager_get_object_name(file_object->pager,
3010 				    pathname,
3011 				    pathname_len,
3012 				    filename,
3013 				    filename_len,
3014 				    &truncated_path) == KERN_SUCCESS) {
3015 					/* safety first... */
3016 					pathname[__PATH_MAX - 1] = '\0';
3017 					filename[__PATH_MAX - 1] = '\0';
3018 
3019 					vnode_pager_get_object_mtime(file_object->pager,
3020 					    &mtime,
3021 					    &cs_mtime);
3022 				} else {
3023 					kfree_data(pathname, __PATH_MAX * 2);
3024 					pathname = NULL;
3025 					filename = NULL;
3026 					pathname_len = 0;
3027 					filename_len = 0;
3028 					truncated_path = FALSE;
3029 				}
3030 			}
3031 		}
3032 		printf("CODE SIGNING: process %d[%s]: "
3033 		    "rejecting invalid page at address 0x%llx "
3034 		    "from offset 0x%llx in file \"%s%s%s\" "
3035 		    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
3036 		    "(signed:%d validated:%d tainted:%d nx:%d "
3037 		    "wpmapped:%d dirty:%d depth:%d)\n",
3038 		    pid, procname, (addr64_t) vaddr,
3039 		    file_offset,
3040 		    (pathname ? pathname : "<nil>"),
3041 		    (truncated_path ? "/.../" : ""),
3042 		    (truncated_path ? filename : ""),
3043 		    cs_mtime.tv_sec, cs_mtime.tv_nsec,
3044 		    ((cs_mtime.tv_sec == mtime.tv_sec &&
3045 		    cs_mtime.tv_nsec == mtime.tv_nsec)
3046 		    ? "=="
3047 		    : "!="),
3048 		    mtime.tv_sec, mtime.tv_nsec,
3049 		    object->code_signed,
3050 		    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
3051 		    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
3052 		    VMP_CS_NX(m, fault_page_size, fault_phys_offset),
3053 		    m->vmp_wpmapped,
3054 		    m->vmp_dirty,
3055 		    shadow_depth);
3056 
3057 		/*
3058 		 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
3059 		 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
3060 		 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
3061 		 * will deal with the segmentation fault.
3062 		 */
3063 		if (cs_killed) {
3064 			KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
3065 			    pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
3066 
3067 			codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
3068 			if (codesigning_exit_reason == NULL) {
3069 				printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
3070 			} else {
3071 				mach_vm_address_t data_addr = 0;
3072 				struct codesigning_exit_reason_info *ceri = NULL;
3073 				uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
3074 
3075 				if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
3076 					printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
3077 				} else {
3078 					if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
3079 					    EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
3080 						ceri = (struct codesigning_exit_reason_info *)data_addr;
3081 						static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
3082 
3083 						ceri->ceri_virt_addr = vaddr;
3084 						ceri->ceri_file_offset = file_offset;
3085 						if (pathname) {
3086 							strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
3087 						} else {
3088 							ceri->ceri_pathname[0] = '\0';
3089 						}
3090 						if (filename) {
3091 							strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
3092 						} else {
3093 							ceri->ceri_filename[0] = '\0';
3094 						}
3095 						ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
3096 						ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
3097 						ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
3098 						ceri->ceri_page_modtime_secs = mtime.tv_sec;
3099 						ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
3100 						ceri->ceri_object_codesigned = (object->code_signed);
3101 						ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
3102 						ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
3103 						ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
3104 						ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
3105 						ceri->ceri_page_slid = 0;
3106 						ceri->ceri_page_dirty = (m->vmp_dirty);
3107 						ceri->ceri_page_shadow_depth = shadow_depth;
3108 					} else {
3109 #if DEBUG || DEVELOPMENT
3110 						panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
3111 #else
3112 						printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
3113 #endif /* DEBUG || DEVELOPMENT */
3114 						/* Free the buffer */
3115 						os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
3116 					}
3117 				}
3118 			}
3119 
3120 			set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
3121 		}
3122 		if (panic_on_cs_killed &&
3123 		    object->object_is_shared_cache) {
3124 			char *tainted_contents;
3125 			vm_map_offset_t src_vaddr;
3126 			src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
3127 			tainted_contents = kalloc_data(PAGE_SIZE, Z_WAITOK);
3128 			bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
3129 			printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
3130 			panic("CODE SIGNING: process %d[%s]: "
3131 			    "rejecting invalid page (phys#0x%x) at address 0x%llx "
3132 			    "from offset 0x%llx in file \"%s%s%s\" "
3133 			    "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
3134 			    "(signed:%d validated:%d tainted:%d nx:%d "
3135 			    "wpmapped:%d dirty:%d depth:%d)\n",
3136 			    pid, procname,
3137 			    VM_PAGE_GET_PHYS_PAGE(m),
3138 			    (addr64_t) vaddr,
3139 			    file_offset,
3140 			    (pathname ? pathname : "<nil>"),
3141 			    (truncated_path ? "/.../" : ""),
3142 			    (truncated_path ? filename : ""),
3143 			    cs_mtime.tv_sec, cs_mtime.tv_nsec,
3144 			    ((cs_mtime.tv_sec == mtime.tv_sec &&
3145 			    cs_mtime.tv_nsec == mtime.tv_nsec)
3146 			    ? "=="
3147 			    : "!="),
3148 			    mtime.tv_sec, mtime.tv_nsec,
3149 			    object->code_signed,
3150 			    VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
3151 			    VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
3152 			    VMP_CS_NX(m, fault_page_size, fault_phys_offset),
3153 			    m->vmp_wpmapped,
3154 			    m->vmp_dirty,
3155 			    shadow_depth);
3156 		}
3157 
3158 		if (file_object != object) {
3159 			vm_object_unlock(file_object);
3160 		}
3161 		if (pathname_len != 0) {
3162 			kfree_data(pathname, __PATH_MAX * 2);
3163 			pathname = NULL;
3164 			filename = NULL;
3165 		}
3166 	} else {
3167 		/* proceed with the invalid page */
3168 		kr = KERN_SUCCESS;
3169 		if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
3170 		    !object->code_signed) {
3171 			/*
3172 			 * This page has not been (fully) validated but
3173 			 * does not belong to a code-signed object
3174 			 * so it should not be forcefully considered
3175 			 * as tainted.
3176 			 * We're just concerned about it here because
3177 			 * we've been asked to "execute" it but that
3178 			 * does not mean that it should cause other
3179 			 * accesses to fail.
3180 			 * This happens when a debugger sets a
3181 			 * breakpoint and we then execute code in
3182 			 * that page.  Marking the page as "tainted"
3183 			 * would cause any inspection tool ("leaks",
3184 			 * "vmmap", "CrashReporter", ...) to get killed
3185 			 * due to code-signing violation on that page,
3186 			 * even though they're just reading it and not
3187 			 * executing from it.
3188 			 */
3189 		} else {
3190 			/*
3191 			 * Page might have been tainted before or not;
3192 			 * now it definitively is. If the page wasn't
3193 			 * tainted, we must disconnect it from all
3194 			 * pmaps later, to force existing mappings
3195 			 * through that code path for re-consideration
3196 			 * of the validity of that page.
3197 			 */
3198 			if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
3199 				*must_disconnect = TRUE;
3200 				VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
3201 			}
3202 		}
3203 		cs_enter_tainted_accepted++;
3204 	}
3205 	if (kr != KERN_SUCCESS) {
3206 		if (cs_debug) {
3207 			printf("CODESIGNING: vm_fault_enter(0x%llx): "
3208 			    "*** INVALID PAGE ***\n",
3209 			    (long long)vaddr);
3210 		}
3211 #if !SECURE_KERNEL
3212 		if (cs_enforcement_panic) {
3213 			panic("CODESIGNING: panicking on invalid page");
3214 		}
3215 #endif
3216 	}
3217 	return kr;
3218 }
3219 
3220 /*
3221  * Check that the code signature is valid for the given page being inserted into
3222  * the pmap.
3223  *
3224  * @param must_disconnect This value will be set to true if the caller must disconnect
3225  * this page.
3226  * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
3227  */
3228 static kern_return_t
3229 vm_fault_validate_cs(
3230 	bool cs_bypass,
3231 	vm_object_t object,
3232 	vm_page_t m,
3233 	pmap_t pmap,
3234 	vm_map_offset_t vaddr,
3235 	vm_prot_t prot,
3236 	vm_prot_t caller_prot,
3237 	vm_map_size_t fault_page_size,
3238 	vm_map_offset_t fault_phys_offset,
3239 	vm_object_fault_info_t fault_info,
3240 	bool *must_disconnect)
3241 {
3242 	bool map_is_switched, map_is_switch_protected, cs_violation;
3243 	kern_return_t kr;
3244 	/* Validate code signature if necessary. */
3245 	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
3246 	    (pmap == vm_map_pmap(current_thread()->map)));
3247 	map_is_switch_protected = current_thread()->map->switch_protect;
3248 	kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
3249 	    prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
3250 	    map_is_switched, map_is_switch_protected, &cs_violation);
3251 	if (kr != KERN_SUCCESS) {
3252 		return kr;
3253 	}
3254 	if (cs_violation) {
3255 		kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
3256 		    fault_page_size, fault_phys_offset,
3257 		    map_is_switched, map_is_switch_protected, must_disconnect);
3258 	}
3259 	return kr;
3260 }
3261 
3262 /*
3263  * Enqueue the page on the appropriate paging queue.
3264  */
3265 static void
3266 vm_fault_enqueue_page(
3267 	vm_object_t object,
3268 	vm_page_t m,
3269 	bool wired,
3270 	bool change_wiring,
3271 	vm_tag_t wire_tag,
3272 	bool no_cache,
3273 	int *type_of_fault,
3274 	kern_return_t kr)
3275 {
3276 	assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
3277 	boolean_t       page_queues_locked = FALSE;
3278 	boolean_t       previously_pmapped = m->vmp_pmapped;
3279 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
3280 MACRO_BEGIN                                     \
3281 	if (! page_queues_locked) {             \
3282 	        page_queues_locked = TRUE;      \
3283 	        vm_page_lockspin_queues();      \
3284 	}                                       \
3285 MACRO_END
3286 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
3287 MACRO_BEGIN                                     \
3288 	if (page_queues_locked) {               \
3289 	        page_queues_locked = FALSE;     \
3290 	        vm_page_unlock_queues();        \
3291 	}                                       \
3292 MACRO_END
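	/*
	 * These helpers let the rest of this function take the page queues
	 * lock lazily: the lock is only acquired on the paths that actually
	 * touch a paging queue, and the single unlock at the end of the
	 * function covers whichever path ran.
	 */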
3293 
3294 	vm_page_update_special_state(m);
3295 	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3296 		/*
3297 		 * Compressor pages are neither wired
3298 		 * nor pageable and should never change.
3299 		 */
3300 		assert(object == compressor_object);
3301 	} else if (change_wiring) {
3302 		__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3303 
3304 		if (wired) {
3305 			if (kr == KERN_SUCCESS) {
3306 				vm_page_wire(m, wire_tag, TRUE);
3307 			}
3308 		} else {
3309 			vm_page_unwire(m, TRUE);
3310 		}
3311 		/* we keep the page queues lock, if we need it later */
3312 	} else {
3313 		if (object->internal == TRUE) {
3314 			/*
3315 			 * don't allow anonymous pages on
3316 			 * the speculative queues
3317 			 */
3318 			no_cache = FALSE;
3319 		}
3320 		if (kr != KERN_SUCCESS) {
3321 			__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3322 			vm_page_deactivate(m);
3323 			/* we keep the page queues lock, if we need it later */
3324 		} else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3325 		    (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3326 		    (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3327 		    ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3328 		    !VM_PAGE_WIRED(m)) {
3329 			if (vm_page_local_q &&
3330 			    (*type_of_fault == DBG_COW_FAULT ||
3331 			    *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3332 				struct vpl      *lq;
3333 				uint32_t        lid;
3334 
3335 				assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3336 
3337 				__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3338 				vm_object_lock_assert_exclusive(object);
3339 
3340 				/*
3341 				 * we got a local queue to stuff this
3342 				 * new page on...
3343 				 * it's safe to manipulate local and
3344 				 * local_id at this point since we're
3345 				 * behind an exclusive object lock and
3346 				 * the page is not on any global queue.
3347 				 *
3348 				 * we'll use the current cpu number to
3349 				 * select the queue... note that we don't
3350 				 * need to disable preemption... we're
3351 				 * going to be behind the local queue's
3352 				 * lock to do the real work
3353 				 */
3354 				lid = cpu_number();
3355 
3356 				lq = zpercpu_get_cpu(vm_page_local_q, lid);
3357 
3358 				VPL_LOCK(&lq->vpl_lock);
3359 
3360 				vm_page_check_pageable_safe(m);
3361 				vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3362 				m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3363 				m->vmp_local_id = (uint16_t)lid;
3364 				lq->vpl_count++;
3365 
3366 				if (object->internal) {
3367 					lq->vpl_internal_count++;
3368 				} else {
3369 					lq->vpl_external_count++;
3370 				}
3371 
3372 				VPL_UNLOCK(&lq->vpl_lock);
3373 
3374 				if (lq->vpl_count > vm_page_local_q_soft_limit) {
3375 					/*
3376 					 * we're beyond the soft limit
3377 					 * for the local queue...
3378 					 * vm_page_reactivate_local will
3379 					 * 'try' to take the global page
3380 					 * queue lock... if it can't
3381 					 * that's ok... we'll let the
3382 					 * queue continue to grow up
3383 					 * to the hard limit... at that
3384 					 * point we'll wait for the
3385 					 * lock... once we've got the
3386 					 * lock, we'll transfer all of
3387 					 * the pages from the local
3388 					 * queue to the global active
3389 					 * queue
3390 					 */
3391 					vm_page_reactivate_local(lid, FALSE, FALSE);
3392 				}
3393 			} else {
3394 				__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3395 
3396 				/*
3397 				 * test again now that we hold the
3398 				 * page queue lock
3399 				 */
3400 				if (!VM_PAGE_WIRED(m)) {
3401 					if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3402 						vm_page_queues_remove(m, FALSE);
3403 
3404 						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3405 						VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3406 					}
3407 
3408 					if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3409 					    no_cache) {
3410 						/*
3411 						 * If this is a no_cache mapping
3412 						 * and the page has never been
3413 						 * mapped before or was
3414 						 * previously a no_cache page,
3415 						 * then we want to leave pages
3416 						 * in the speculative state so
3417 						 * that they can be readily
3418 						 * recycled if free memory runs
3419 						 * low.  Otherwise the page is
3420 						 * activated as normal.
3421 						 */
3422 
3423 						if (no_cache &&
3424 						    (!previously_pmapped ||
3425 						    m->vmp_no_cache)) {
3426 							m->vmp_no_cache = TRUE;
3427 
3428 							if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3429 								vm_page_speculate(m, FALSE);
3430 							}
3431 						} else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3432 							vm_page_activate(m);
3433 						}
3434 					}
3435 				}
3436 				/* we keep the page queues lock, if we need it later */
3437 			}
3438 		}
3439 	}
3440 	/* we're done with the page queues lock, if we ever took it */
3441 	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3442 }
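
/*
 * To summarize the placement policy above: compressor pages are left alone;
 * wiring-change faults wire or unwire the page; a failed fault deactivates
 * the page; COW and zero-fill faults go onto a per-cpu local active queue
 * (drained to the global active queue by vm_page_reactivate_local() once the
 * soft limit is exceeded); everything else is either kept speculative
 * (no_cache mappings) or activated, after re-checking the wired state under
 * the page queues lock.
 */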
3443 
3444 /*
3445  * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3446  * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys
3447  * before being inserted into the pmap.
3448  */
3449 static bool
3450 vm_fault_enter_set_mapped(
3451 	vm_object_t object,
3452 	vm_page_t m,
3453 	vm_prot_t prot,
3454 	vm_prot_t fault_type)
3455 {
3456 	bool page_needs_sync = false;
3457 	/*
3458 	 * NOTE: we may only hold the vm_object lock SHARED
3459 	 * at this point, so we need the phys_page lock to
3460 	 * properly serialize updating the pmapped and
3461 	 * xpmapped bits
3462 	 */
3463 	if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3464 		ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3465 
3466 		pmap_lock_phys_page(phys_page);
3467 		m->vmp_pmapped = TRUE;
3468 
3469 		if (!m->vmp_xpmapped) {
3470 			m->vmp_xpmapped = TRUE;
3471 
3472 			pmap_unlock_phys_page(phys_page);
3473 
3474 			if (!object->internal) {
3475 				OSAddAtomic(1, &vm_page_xpmapped_external_count);
3476 			}
3477 
3478 #if defined(__arm64__)
3479 			page_needs_sync = true;
3480 #else
3481 			if (object->internal &&
3482 			    object->pager != NULL) {
3483 				/*
3484 				 * This page could have been
3485 				 * uncompressed by the
3486 				 * compressor pager and its
3487 				 * contents might be only in
3488 				 * the data cache.
3489 				 * Since it's being mapped for
3490 				 * "execute" for the first time,
3491 				 * make sure the icache is in
3492 				 * sync.
3493 				 */
3494 				assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3495 				page_needs_sync = true;
3496 			}
3497 #endif
3498 		} else {
3499 			pmap_unlock_phys_page(phys_page);
3500 		}
3501 	} else {
3502 		if (m->vmp_pmapped == FALSE) {
3503 			ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3504 
3505 			pmap_lock_phys_page(phys_page);
3506 			m->vmp_pmapped = TRUE;
3507 			pmap_unlock_phys_page(phys_page);
3508 		}
3509 	}
3510 
3511 	if (fault_type & VM_PROT_WRITE) {
3512 		if (m->vmp_wpmapped == FALSE) {
3513 			vm_object_lock_assert_exclusive(object);
3514 			if (!object->internal && object->pager) {
3515 				task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3516 			}
3517 			m->vmp_wpmapped = TRUE;
3518 		}
3519 	}
3520 	return page_needs_sync;
3521 }
3522 
3523 #if HAS_MTE
3524 static bool
3525 vm_should_override_mte_cacheattr(
3526 	pmap_t pmap,
3527 	vm_object_t obj,
3528 	__unused vm_map_address_t va,
3529 	pmap_paddr_t pa)
3530 {
3531 	/*
3532 	 * We need to ask whether _any_ tagged mapping exists for this frame,
3533 	 * rather than asking whether the object we're holding _now_ is tagged.
3534 	 * This is how we ensure that if an MTE mapping escapes into a non-MTE
3535 	 * context, shuffles around a bit, then comes back around to the originating
3536 	 * context, we'll enter it as MTE.
3537 	 */
3538 	if (obj != VM_OBJECT_NULL
3539 	    && pmap_is_tagged_page((ppnum_t)atop(pa))
3540 	    && pmap->associated_vm_map_serial_id != obj->vmo_provenance) {
3541 		return true;
3542 	}
3543 
3544 	return false;
3545 }
3546 #endif
3547 
3548 static inline kern_return_t
3549 vm_fault_pmap_validate_page(
3550 	pmap_t pmap __unused,
3551 	vm_page_t m __unused,
3552 	vm_map_offset_t vaddr __unused,
3553 	vm_prot_t prot __unused,
3554 	vm_object_fault_info_t fault_info __unused,
3555 	bool *page_sleep_needed)
3556 {
3557 	assert(page_sleep_needed != NULL);
3558 	*page_sleep_needed = false;
3559 #if CONFIG_SPTM
3560 	/*
3561 	 * Reject the executable or debug mapping if the page is already wired for I/O.  The SPTM's security
3562 	 * model doesn't allow us to reliably use executable pages for I/O due to both CS integrity
3563 	 * protections and the possibility that the pages may be dynamically retyped while wired for I/O.
3564 	 * This check is required to happen under the VM object lock in order to synchronize with the
3565 	 * complementary check on the I/O wiring path in vm_page_do_delayed_work().
3566 	 */
3567 	if (__improbable((m->vmp_cleaning || m->vmp_iopl_wired) &&
3568 	    pmap_will_retype(pmap, vaddr, VM_PAGE_GET_PHYS_PAGE(m), prot, fault_info->pmap_options |
3569 	    ((fault_info->fi_xnu_user_debug && !VM_PAGE_OBJECT(m)->code_signed) ? PMAP_OPTIONS_XNU_USER_DEBUG : 0),
3570 	    PMAP_MAPPING_TYPE_INFER))) {
3571 		if (__improbable(m->vmp_iopl_wired)) {
3572 			vm_map_guard_exception(vaddr, kGUARD_EXC_SEC_EXEC_ON_IOPL_PAGE);
3573 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
3574 			    KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_EXEC_ON_IOPL_PAGE), (uintptr_t)vaddr);
3575 			return KERN_PROTECTION_FAILURE;
3576 		}
3577 		*page_sleep_needed = m->vmp_cleaning;
3578 	}
3579 #endif /* CONFIG_SPTM */
3580 	return KERN_SUCCESS;
3581 }
3582 
3583 /*
3584  * wrappers for pmap_enter_options()
3585  */
3586 kern_return_t
3587 pmap_enter_object_options_check(
3588 	pmap_t           pmap,
3589 	vm_map_address_t virtual_address,
3590 	vm_map_offset_t  fault_phys_offset,
3591 	vm_object_t      obj,
3592 	ppnum_t          pn,
3593 	vm_prot_t        protection,
3594 	vm_prot_t        fault_type,
3595 	boolean_t        wired,
3596 	unsigned int     options)
3597 {
3598 	unsigned int flags = 0;
3599 	unsigned int extra_options = 0;
3600 
3601 	if (obj->internal) {
3602 		extra_options |= PMAP_OPTIONS_INTERNAL;
3603 	}
3604 	pmap_paddr_t physical_address = (pmap_paddr_t)ptoa(pn) + fault_phys_offset;
3605 
3606 #if HAS_MTE
3607 	/*
3608 	 * By policy we sometimes decide to enter an MTE-capable object
3609 	 *  as non-MTE in a particular map.
3610 	 *
3611 	 * Most notably, we enact a general policy that MTE memory which escapes its
3612 	 * original context will be aliased in other maps as non-MTE (aliasing back
3613 	 *  into the originating map will result in an MTE-enabled mapping.)
3614 	 *
3615 	 * Using VM_WIMG_DEFAULT for this pmap_enter only sets the PTE values
3616 	 * correctly *for this mapping only* without changing the MTE-ness
3617 	 * of the underlying page.
3618 	 */
3619 	if (vm_should_override_mte_cacheattr(pmap, obj, virtual_address, physical_address)) {
3620 		/*
3621 		 * Certain first-party actors (such as WCP and BlastDoor) are modeled untrustworthy, and should never
3622 		 * be allowed to receive untagged aliases to tagged memory from other actors.
3623 		 * If we make it this far on a pmap that should never receive untagged aliases, throw a fatal guard.
3624 		 */
3625 		if (pmap->restrict_receiving_aliases_to_tagged_memory) {
3626 			/* Immediately send a fatal guard */
3627 			uint64_t address_to_report = 0;
3628 #if DEBUG || DEVELOPMENT
3629 			/* On internal variants, report the PA we tried to alias */
3630 			address_to_report = physical_address;
3631 #endif /* DEBUG || DEVELOPMENT */
3632 			mach_exception_code_t code = 0;
3633 			EXC_GUARD_ENCODE_TYPE(code, GUARD_TYPE_VIRT_MEMORY);
3634 			EXC_GUARD_ENCODE_FLAVOR(code, kGUARD_EXC_SEC_SHARING_DENIED);
3635 			thread_guard_violation(
3636 				current_thread(),
3637 				code,
3638 				address_to_report,
3639 				/* Fatal */
3640 				true);
3641 			/* And indicate that something went wrong */
3642 			return VM_FAULT_MEMORY_ERROR;
3643 		} else {
3644 			assert(!(flags & VM_WIMG_MASK));
3645 			flags |= VM_WIMG_USE_DEFAULT;
3646 		}
3647 	}
3648 #endif /* HAS_MTE */
3649 
3650 	return pmap_enter_options_addr(pmap,
3651 	           virtual_address,
3652 	           physical_address,
3653 	           protection,
3654 	           fault_type,
3655 	           flags,
3656 	           wired,
3657 	           options | extra_options,
3658 	           NULL,
3659 	           PMAP_MAPPING_TYPE_INFER);
3660 }
3661 
3662 kern_return_t
3663 pmap_enter_options_check(
3664 	pmap_t           pmap,
3665 	vm_map_address_t virtual_address,
3666 	vm_map_offset_t  fault_phys_offset,
3667 	vm_page_t        page,
3668 	vm_prot_t        protection,
3669 	vm_prot_t        fault_type,
3670 	boolean_t        wired,
3671 	unsigned int     options)
3672 {
3673 	if (page->vmp_error) {
3674 		return KERN_MEMORY_FAILURE;
3675 	}
3676 	vm_object_t obj = VM_PAGE_OBJECT(page);
3677 	if (page->vmp_reusable || obj->all_reusable) {
3678 		options |= PMAP_OPTIONS_REUSABLE;
3679 	}
3680 	assert(page->vmp_pmapped);
3681 	if (fault_type & VM_PROT_WRITE) {
3682 		if (pmap == kernel_pmap) {
3683 			/*
3684 			 * The kernel sometimes needs to map a page to provide its
3685 			 * initial contents but that does not mean that the page is
3686 			 * actually dirty/modified, so let's not assert that it's been
3687 			 * "wpmapped".
3688 			 */
3689 		} else {
3690 			assert(page->vmp_wpmapped);
3691 		}
3692 	}
3693 	return pmap_enter_object_options_check(
3694 		pmap,
3695 		virtual_address,
3696 		fault_phys_offset,
3697 		obj,
3698 		VM_PAGE_GET_PHYS_PAGE(page),
3699 		protection,
3700 		fault_type,
3701 		wired,
3702 		options);
3703 }
3704 
3705 kern_return_t
3706 pmap_enter_check(
3707 	pmap_t           pmap,
3708 	vm_map_address_t virtual_address,
3709 	vm_page_t        page,
3710 	vm_prot_t        protection,
3711 	vm_prot_t        fault_type,
3712 	boolean_t        wired)
3713 {
3714 	return pmap_enter_options_check(pmap,
3715 	           virtual_address,
3716 	           0 /* fault_phys_offset */,
3717 	           page,
3718 	           protection,
3719 	           fault_type,
3720 	           wired,
3721 	           0 /* options */);
3722 }
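
/*
 * The wrappers above layer onto each other roughly as follows:
 *
 *	pmap_enter_check(pmap, va, page, prot, fault_type, wired)
 *	  -> pmap_enter_options_check(pmap, va, 0, page, ..., 0)
 *	       -> pmap_enter_object_options_check(pmap, va, 0,
 *	                 VM_PAGE_OBJECT(page), VM_PAGE_GET_PHYS_PAGE(page), ...)
 *	            -> pmap_enter_options_addr(...)
 *
 * Each level adds its own checks (vmp_error, reusable accounting, the
 * wpmapped assertion, the MTE aliasing policy) before the final call into
 * the pmap layer.
 */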
3723 
3724 /*
3725  * Try to enter the given page into the pmap.
3726  * Will retry without execute permission if the code signing monitor is enabled and
3727  * we encounter a codesigning failure on a non-execute fault.
3728  */
3729 static kern_return_t
3730 vm_fault_attempt_pmap_enter(
3731 	pmap_t pmap,
3732 	vm_map_offset_t vaddr,
3733 	vm_map_size_t fault_page_size,
3734 	vm_map_offset_t fault_phys_offset,
3735 	vm_page_t m,
3736 	vm_prot_t *prot,
3737 	vm_prot_t caller_prot,
3738 	vm_prot_t fault_type,
3739 	bool wired,
3740 	int pmap_options)
3741 {
3742 #if !CODE_SIGNING_MONITOR
3743 #pragma unused(caller_prot)
3744 #endif /* !CODE_SIGNING_MONITOR */
3745 
3746 	kern_return_t kr;
3747 	if (fault_page_size != PAGE_SIZE) {
3748 		DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3749 		assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3750 		    fault_phys_offset < PAGE_SIZE),
3751 		    "0x%llx\n", (uint64_t)fault_phys_offset);
3752 	} else {
3753 		assertf(fault_phys_offset == 0,
3754 		    "0x%llx\n", (uint64_t)fault_phys_offset);
3755 	}
3756 
3757 	kr = pmap_enter_options_check(pmap, vaddr,
3758 	    fault_phys_offset,
3759 	    m, *prot, fault_type,
3760 	    wired, pmap_options);
3761 
3762 #if CODE_SIGNING_MONITOR
3763 	/*
3764 	 * Retry without execute permission if we encountered a codesigning
3765 	 * failure on a non-execute fault.  This allows applications which
3766 	 * don't actually need to execute code to still map it for read access.
3767 	 */
3768 	if (kr == KERN_CODESIGN_ERROR &&
3769 	    csm_enabled() &&
3770 	    (*prot & VM_PROT_EXECUTE) &&
3771 	    !(caller_prot & VM_PROT_EXECUTE)) {
3772 		*prot &= ~VM_PROT_EXECUTE;
3773 		kr = pmap_enter_options_check(pmap, vaddr,
3774 		    fault_phys_offset,
3775 		    m, *prot, fault_type,
3776 		    wired, pmap_options);
3777 	}
3778 #endif /* CODE_SIGNING_MONITOR */
3779 
3780 	return kr;
3781 }
3782 
3783 /*
3784  * Enter the given page into the pmap.
3785  * The map must be locked shared.
3786  * The vm object must NOT be locked.
3787  *
3788  * @param need_retry if not null, avoid making a (potentially) blocking call into
3789  * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3790  */
3791 static kern_return_t
3792 vm_fault_pmap_enter(
3793 	pmap_t pmap,
3794 	vm_map_offset_t vaddr,
3795 	vm_map_size_t fault_page_size,
3796 	vm_map_offset_t fault_phys_offset,
3797 	vm_page_t m,
3798 	vm_prot_t *prot,
3799 	vm_prot_t caller_prot,
3800 	vm_prot_t fault_type,
3801 	bool wired,
3802 	int pmap_options,
3803 	bool *need_retry)
3804 {
3805 	kern_return_t kr;
3806 	if (need_retry != NULL) {
3807 		/*
3808 		 * Although we don't hold a lock on this object, we hold a lock
3809 		 * on the top object in the chain. To prevent a deadlock, we
3810 		 * can't allow the pmap layer to block.
3811 		 */
3812 		pmap_options |= PMAP_OPTIONS_NOWAIT;
3813 	}
3814 	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3815 	    fault_page_size, fault_phys_offset,
3816 	    m, prot, caller_prot, fault_type, wired, pmap_options);
3817 	if (kr == KERN_RESOURCE_SHORTAGE) {
3818 		if (need_retry) {
3819 			/*
3820 			 * There's nothing we can do here since we hold the
3821 			 * lock on the top object in the chain. The caller
3822 			 * will need to deal with this by dropping that lock and retrying.
3823 			 */
3824 			*need_retry = true;
3825 			vm_pmap_enter_retried++;
3826 		}
3827 	}
3828 	return kr;
3829 }
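
/*
 * A rough sketch of the contract a caller follows when it passes a non-NULL
 * need_retry (the variable names below are illustrative, not taken from a
 * caller in this file):
 *
 *	bool retry = false;
 *	kr = vm_fault_pmap_enter(pmap, vaddr, fault_page_size,
 *	        fault_phys_offset, m, &prot, caller_prot, fault_type,
 *	        wired, pmap_options, &retry);
 *	if (retry) {
 *	        // drop the object lock(s), then redrive the fault so that
 *	        // pmap_enter() is free to block and expand the page table
 *	}
 *
 * Passing a non-NULL need_retry is what adds PMAP_OPTIONS_NOWAIT, turning a
 * would-block pmap_enter() into the KERN_RESOURCE_SHORTAGE that sets
 * *need_retry.
 */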
3830 
3831 /*
3832  * Enter the given page into the pmap.
3833  * The vm map must be locked shared.
3834  * The vm object must be locked exclusive, unless this is a soft fault.
3835  * For a soft fault, the object must be locked shared or exclusive.
3836  *
3837  * @param need_retry if not null, avoid making a (potentially) blocking call into
3838  * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3839  */
3840 static kern_return_t
3841 vm_fault_pmap_enter_with_object_lock(
3842 	vm_object_t object,
3843 	pmap_t pmap,
3844 	vm_map_offset_t vaddr,
3845 	vm_map_size_t fault_page_size,
3846 	vm_map_offset_t fault_phys_offset,
3847 	vm_page_t m,
3848 	vm_prot_t *prot,
3849 	vm_prot_t caller_prot,
3850 	vm_prot_t fault_type,
3851 	bool wired,
3852 	int pmap_options,
3853 	bool *need_retry,
3854 	uint8_t *object_lock_type)
3855 {
3856 	kern_return_t kr;
3857 
3858 	assert(need_retry != NULL);
3859 	*need_retry = false;
3860 
3861 	/*
3862 	 * Prevent a deadlock by not
3863 	 * holding the object lock if we need to wait for a page in
3864 	 * pmap_enter() - <rdar://problem/7138958>
3865 	 */
3866 	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3867 	    fault_page_size, fault_phys_offset,
3868 	    m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
3869 #if __x86_64__
3870 	if (kr == KERN_INVALID_ARGUMENT &&
3871 	    pmap == PMAP_NULL &&
3872 	    wired) {
3873 		/*
3874 		 * Wiring a page in a pmap-less VM map:
3875 		 * VMware's "vmmon" kernel extension does this
3876 		 * to grab pages.
3877 		 * Let it proceed even though the PMAP_ENTER() failed.
3878 		 */
3879 		kr = KERN_SUCCESS;
3880 	}
3881 #endif /* __x86_64__ */
3882 
3883 	if (kr == KERN_RESOURCE_SHORTAGE) {
3884 		/*
3885 		 * We can't drop the object lock(s) here to retry the pmap_enter()
3886 		 * in a blocking way so that it can expand the page table as needed.
3887 		 * That would allow vm_object_copy_delayed() to create a new copy object
3888 		 * and change the copy-on-write obligations.
3889 		 * Our only recourse is to deal with it at a higher level where we can
3890 		 * drop both locks, expand the page table and retry the fault.
3891 		 */
3892 		*need_retry = true;
3893 		vm_pmap_enter_retried++;
3894 		goto done;
3895 	}
3896 
3897 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
3898 	if ((*prot & VM_PROT_WRITE) && m->vmp_unmodified_ro) {
3899 		if (*object_lock_type == OBJECT_LOCK_SHARED) {
3900 			boolean_t was_busy = m->vmp_busy;
3901 			m->vmp_busy = TRUE;
3902 
3903 			*object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3904 
3905 			if (vm_object_lock_upgrade(object) == FALSE) {
3906 				vm_object_lock(object);
3907 			}
3908 
3909 			if (!was_busy) {
3910 				vm_page_wakeup_done(object, m);
3911 			}
3912 		}
3913 		vm_object_lock_assert_exclusive(object);
3914 		vm_page_lockspin_queues();
3915 		m->vmp_unmodified_ro = false;
3916 		vm_page_unlock_queues();
3917 		os_atomic_dec(&compressor_ro_uncompressed, relaxed);
3918 
3919 		vm_object_compressor_pager_state_clr(VM_PAGE_OBJECT(m), m->vmp_offset);
3920 	}
3921 #else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
3922 #pragma unused(object)
3923 #pragma unused(object_lock_type)
3924 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
3925 
3926 done:
3927 	return kr;
3928 }
3929 
3930 /*
3931  * Prepare to enter a page into the pmap by checking CS, protection bits,
3932  * and setting mapped bits on the page_t.
3933  * Does not modify the page's paging queue.
3934  *
3935  * page queue lock must NOT be held
3936  * m->vmp_object must be locked
3937  *
3938  * NOTE: m->vmp_object could be locked "shared" only if we are called
3939  * from vm_fault() as part of a soft fault.
3940  */
3941 static kern_return_t
3942 vm_fault_enter_prepare(
3943 	vm_page_t m,
3944 	pmap_t pmap,
3945 	vm_map_offset_t vaddr,
3946 	vm_prot_t *prot,
3947 	vm_prot_t caller_prot,
3948 	vm_map_size_t fault_page_size,
3949 	vm_map_offset_t fault_phys_offset,
3950 	vm_prot_t fault_type,
3951 	vm_object_fault_info_t fault_info,
3952 	int *type_of_fault,
3953 	bool *page_needs_data_sync,
3954 	bool *page_needs_sleep)
3955 {
3956 	kern_return_t   kr;
3957 	bool            is_tainted = false;
3958 	vm_object_t     object;
3959 	boolean_t       cs_bypass = fault_info->cs_bypass;
3960 
3961 	object = VM_PAGE_OBJECT(m);
3962 
3963 	vm_object_lock_assert_held(object);
3964 
3965 #if KASAN
3966 	if (pmap == kernel_pmap) {
3967 		kasan_notify_address(vaddr, PAGE_SIZE);
3968 	}
3969 #endif
3970 
3971 #if CODE_SIGNING_MONITOR
3972 	if (csm_address_space_exempt(pmap) == KERN_SUCCESS) {
3973 		cs_bypass = TRUE;
3974 	}
3975 #endif
3976 
3977 	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3978 
3979 	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
3980 		vm_object_lock_assert_exclusive(object);
3981 	} else if ((fault_type & VM_PROT_WRITE) == 0 &&
3982 	    !fault_info->fi_change_wiring &&
3983 	    (!m->vmp_wpmapped
3984 #if VM_OBJECT_ACCESS_TRACKING
3985 	    || object->access_tracking
3986 #endif /* VM_OBJECT_ACCESS_TRACKING */
3987 	    )) {
3988 		/*
3989 		 * This is not a "write" fault, so we
3990 		 * might not have taken the object lock
3991 		 * exclusively and we might not be able
3992 		 * to update the "wpmapped" bit in
3993 		 * vm_fault_enter().
3994 		 * Let's just grant read access to
3995 		 * the page for now and we'll
3996 		 * soft-fault again if we need write
3997 		 * access later...
3998 		 */
3999 
4000 		/* This had better not be a JIT page. */
4001 		if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
4002 			/*
4003 			 * This pmap enforces extra constraints for this set of
4004 			 * protections, so we can't modify them.
4005 			 */
4006 			if (!cs_bypass) {
4007 				panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x !cs_bypass",
4008 				    __FUNCTION__, pmap, (uint64_t)vaddr,
4009 				    *prot, fault_info->pmap_options);
4010 			}
4011 		} else {
4012 			*prot &= ~VM_PROT_WRITE;
4013 		}
4014 	}
4015 	if (m->vmp_pmapped == FALSE) {
4016 		if (m->vmp_clustered) {
4017 			if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
4018 				/*
4019 				 * found it in the cache, but this
4020 				 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
4021 				 * so it must have come in as part of
4022 				 * a cluster... account 1 pagein against it
4023 				 */
4024 				if (object->internal) {
4025 					*type_of_fault = DBG_PAGEIND_FAULT;
4026 				} else {
4027 					*type_of_fault = DBG_PAGEINV_FAULT;
4028 				}
4029 
4030 				VM_PAGE_COUNT_AS_PAGEIN(m);
4031 			}
4032 			VM_PAGE_CONSUME_CLUSTERED(m);
4033 		}
4034 	}
4035 
4036 	if (*type_of_fault != DBG_COW_FAULT) {
4037 		DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
4038 
4039 		if (pmap == kernel_pmap) {
4040 			DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
4041 		}
4042 	}
4043 
4044 	kr = vm_fault_pmap_validate_page(pmap, m, vaddr, *prot, fault_info, page_needs_sleep);
4045 	if (__improbable((kr != KERN_SUCCESS) || *page_needs_sleep)) {
4046 		return kr;
4047 	}
4048 	kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
4049 	    *prot, caller_prot, fault_page_size, fault_phys_offset,
4050 	    fault_info, &is_tainted);
4051 	if (kr == KERN_SUCCESS) {
4052 		/*
4053 		 * We either have a good page, or a tainted page that has been accepted by the process.
4054 		 * In both cases the page will be entered into the pmap.
4055 		 */
4056 		*page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
4057 		if ((fault_type & VM_PROT_WRITE) && is_tainted) {
4058 			/*
4059 			 * This page is tainted but we're inserting it anyways.
4060 			 * Since it's writeable, we need to disconnect it from other pmaps
4061 			 * now so those processes can take note.
4062 			 */
4063 
4064 			/*
4065 			 * We can only get here
4066 			 * because of the CSE logic
4067 			 */
4068 			assert(pmap_get_vm_map_cs_enforced(pmap));
4069 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
4070 			/*
4071 			 * If we are faulting for a write, we can clear
4072 			 * the execute bit - that will ensure the page is
4073 			 * checked again before being executable, which
4074 			 * protects against a map switch.
4075 			 * This only happens the first time the page
4076 			 * gets tainted, so we won't get stuck here
4077 			 * to make an already writeable page executable.
4078 			 */
4079 			if (!cs_bypass) {
4080 				if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
4081 					/*
4082 					 * This pmap enforces extra constraints
4083 					 * for this set of protections, so we
4084 					 * can't change the protections.
4085 					 */
4086 					panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
4087 					    __FUNCTION__, pmap,
4088 					    (uint64_t)vaddr, *prot,
4089 					    fault_info->pmap_options);
4090 				}
4091 				*prot &= ~VM_PROT_EXECUTE;
4092 			}
4093 		}
4094 		assert(VM_PAGE_OBJECT(m) == object);
4095 
4096 #if VM_OBJECT_ACCESS_TRACKING
4097 		if (object->access_tracking) {
4098 			DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
4099 			if (fault_type & VM_PROT_WRITE) {
4100 				object->access_tracking_writes++;
4101 				vm_object_access_tracking_writes++;
4102 			} else {
4103 				object->access_tracking_reads++;
4104 				vm_object_access_tracking_reads++;
4105 			}
4106 		}
4107 #endif /* VM_OBJECT_ACCESS_TRACKING */
4108 	}
4109 
4110 	return kr;
4111 }
4112 
4113 /*
4114  * page queue lock must NOT be held
4115  * m->vmp_object must be locked
4116  *
4117  * NOTE: m->vmp_object could be locked "shared" only if we are called
4118  * from vm_fault() as part of a soft fault.  If so, we must be
4119  * careful not to modify the VM object in any way that is not
4120  * legal under a shared lock...
4121  */
4122 kern_return_t
4123 vm_fault_enter(
4124 	vm_page_t m,
4125 	pmap_t pmap,
4126 	vm_map_offset_t vaddr,
4127 	vm_map_size_t fault_page_size,
4128 	vm_map_offset_t fault_phys_offset,
4129 	vm_prot_t prot,
4130 	vm_prot_t caller_prot,
4131 	boolean_t wired,
4132 	vm_tag_t  wire_tag,
4133 	vm_object_fault_info_t fault_info,
4134 	bool *need_retry,
4135 	int *type_of_fault,
4136 	uint8_t *object_lock_type,
4137 	bool *page_needs_sleep)
4138 {
4139 	kern_return_t   kr;
4140 	vm_object_t     object;
4141 	bool            page_needs_data_sync;
4142 	vm_prot_t       fault_type;
4143 	int             pmap_options = fault_info->pmap_options;
4144 
4145 	assert(need_retry != NULL);
4146 
4147 	if (vm_page_is_guard(m)) {
4148 		return KERN_SUCCESS;
4149 	}
4150 
4151 	fault_type = fault_info->fi_change_wiring ? VM_PROT_NONE : caller_prot;
4152 
4153 	assertf(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL, "m=%p", m);
4154 	kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
4155 	    fault_page_size, fault_phys_offset, fault_type,
4156 	    fault_info, type_of_fault, &page_needs_data_sync, page_needs_sleep);
4157 	object = VM_PAGE_OBJECT(m);
4158 
4159 	vm_fault_enqueue_page(object, m, wired, fault_info->fi_change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
4160 
4161 	if (__probable((kr == KERN_SUCCESS) && !(*page_needs_sleep))) {
4162 		if (page_needs_data_sync) {
4163 			pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
4164 		}
4165 
4166 		if (fault_info->fi_xnu_user_debug && !object->code_signed) {
4167 			pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
4168 		}
4169 
4170 
4171 		kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
4172 		    fault_page_size, fault_phys_offset, m,
4173 		    &prot, caller_prot, fault_type, wired, pmap_options, need_retry, object_lock_type);
4174 	}
4175 
4176 	return kr;
4177 }
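
/*
 * vm_fault_enter() is thus a three-step sequence:
 *   1. vm_fault_enter_prepare(): code-signing validation, protection
 *      adjustments and the pmapped/xpmapped/wpmapped bookkeeping;
 *   2. vm_fault_enqueue_page(): placement on the appropriate paging queue;
 *   3. vm_fault_pmap_enter_with_object_lock(): the actual, non-blocking
 *      pmap_enter(), with *need_retry reporting any resource shortage back
 *      to the caller.
 * Guard pages skip the whole sequence and simply return KERN_SUCCESS.
 */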
4178 
4179 kern_return_t
4180 vm_pre_fault_with_info(
4181 	vm_map_t                map,
4182 	vm_map_offset_t         vaddr,
4183 	vm_prot_t               prot,
4184 	vm_object_fault_info_t  fault_info)
4185 {
4186 	assert(fault_info != NULL);
4187 	if (pmap_find_phys(map->pmap, vaddr) == 0) {
4188 		return vm_fault_internal(map,
4189 		           vaddr,               /* vaddr */
4190 		           prot,                /* fault_type */
4191 		           VM_KERN_MEMORY_NONE, /* tag - not wiring */
4192 		           NULL,                /* caller_pmap */
4193 		           0,                   /* caller_pmap_addr */
4194 		           NULL,
4195 		           fault_info);
4196 	}
4197 	return KERN_SUCCESS;
4198 }
4199 
4200 /*
4201  * Fault on the given vaddr iff the page is not already entered in the pmap.
4202  */
4203 kern_return_t
4204 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
4205 {
4206 	struct vm_object_fault_info fault_info = {
4207 		.interruptible = THREAD_UNINT,
4208 	};
4209 	return vm_pre_fault_with_info(current_map(), vaddr, prot, &fault_info);
4210 }
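
/*
 * Illustrative use only (not a caller in this file; "start" and "end" are
 * assumed names): code that is about to touch a range on a latency-sensitive
 * path can warm the pmap first, e.g.
 *
 *	for (vm_map_offset_t va = start; va < end; va += PAGE_SIZE) {
 *	        (void) vm_pre_fault(va, VM_PROT_READ);
 *	}
 *
 * Each call is a no-op when pmap_find_phys() already has a translation, so
 * only pages not yet resident in the pmap pay the cost of vm_fault_internal().
 */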
4211 
4212 /*
4213  *	Routine:	vm_fault
4214  *	Purpose:
4215  *		Handle page faults, including pseudo-faults
4216  *		used to change the wiring status of pages.
4217  *	Returns:
4218  *		Explicit continuations have been removed.
4219  *	Implementation:
4220  *		vm_fault and vm_fault_page save mucho state
4221  *		in the moral equivalent of a closure.  The state
4222  *		structure is allocated when first entering vm_fault
4223  *		and deallocated when leaving vm_fault.
4224  */
4225 
4226 extern uint64_t get_current_unique_pid(void);
4227 
4228 unsigned long vm_fault_collapse_total = 0;
4229 unsigned long vm_fault_collapse_skipped = 0;
4230 
4231 
4232 kern_return_t
4233 vm_fault_external(
4234 	vm_map_t        map,
4235 	vm_map_offset_t vaddr,
4236 	vm_prot_t       fault_type,
4237 	boolean_t       change_wiring,
4238 	int             interruptible,
4239 	pmap_t          caller_pmap,
4240 	vm_map_offset_t caller_pmap_addr)
4241 {
4242 	struct vm_object_fault_info fault_info = {
4243 		.interruptible = interruptible,
4244 		.fi_change_wiring = change_wiring,
4245 	};
4246 
4247 	return vm_fault_internal(map, vaddr, fault_type,
4248 	           change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
4249 	           caller_pmap, caller_pmap_addr,
4250 	           NULL, &fault_info);
4251 }
4252 
4253 kern_return_t
4254 vm_fault(
4255 	vm_map_t        map,
4256 	vm_map_offset_t vaddr,
4257 	vm_prot_t       fault_type,
4258 	boolean_t       change_wiring,
4259 	vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
4260 	int             interruptible,
4261 	pmap_t          caller_pmap,
4262 	vm_map_offset_t caller_pmap_addr)
4263 {
4264 	struct vm_object_fault_info fault_info = {
4265 		.interruptible = interruptible,
4266 		.fi_change_wiring = change_wiring,
4267 	};
4268 
4269 	return vm_fault_internal(map, vaddr, fault_type, wire_tag,
4270 	           caller_pmap, caller_pmap_addr,
4271 	           NULL, &fault_info);
4272 }
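
/*
 * vm_fault() differs from vm_fault_external() only in that the caller supplies
 * the wire tag explicitly instead of having it derived with vm_tag_bt() when
 * change_wiring is set; both funnel into vm_fault_internal().
 */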
4273 
4274 static boolean_t
4275 current_proc_is_privileged(void)
4276 {
4277 	return csproc_get_platform_binary(current_proc());
4278 }
4279 
4280 uint64_t vm_copied_on_read = 0;
4281 uint64_t vm_copied_on_read_kernel_map = 0;
4282 uint64_t vm_copied_on_read_platform_map = 0;
4283 
4284 /*
4285  * Cleanup after a vm_fault_enter.
4286  * At this point, the fault should either have failed (kr != KERN_SUCCESS)
4287  * or the page should be in the pmap and on the correct paging queue.
4288  *
4289  * Precondition:
4290  * map must be locked shared.
4291  * m_object must be locked.
4292  * If top_object != VM_OBJECT_NULL, it must be locked.
4293  * real_map must be locked.
4294  *
4295  * Postcondition:
4296  * map will be unlocked
4297  * m_object will be unlocked
4298  * top_object will be unlocked
4299  * If real_map != map, it will be unlocked
4300  */
4301 static void
4302 vm_fault_complete(
4303 	vm_map_t map,
4304 	vm_map_t real_map,
4305 	vm_object_t object,
4306 	vm_object_t m_object,
4307 	vm_page_t m,
4308 	vm_map_offset_t offset,
4309 	vm_map_offset_t trace_real_vaddr,
4310 	vm_object_fault_info_t fault_info,
4311 	vm_prot_t caller_prot,
4312 #if CONFIG_DTRACE
4313 	vm_map_offset_t real_vaddr,
4314 #else
4315 	__unused vm_map_offset_t real_vaddr,
4316 #endif /* CONFIG_DTRACE */
4317 	int type_of_fault,
4318 	bool need_retry,
4319 	kern_return_t kr,
4320 	ppnum_t *physpage_p,
4321 	vm_prot_t prot,
4322 	vm_object_t top_object,
4323 	boolean_t need_collapse,
4324 	vm_map_offset_t cur_offset,
4325 	vm_prot_t fault_type,
4326 	vm_object_t *written_on_object,
4327 	memory_object_t *written_on_pager,
4328 	vm_object_offset_t *written_on_offset)
4329 {
4330 	int     event_code = 0;
4331 
4332 	vm_map_lock_assert_shared(map);
4333 	vm_object_lock_assert_held(m_object);
4334 	if (top_object != VM_OBJECT_NULL) {
4335 		vm_object_lock_assert_held(top_object);
4336 	}
4337 	vm_map_lock_assert_held(real_map);
4338 
4339 	if (m_object->internal) {
4340 		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4341 	} else if (m_object->object_is_shared_cache) {
4342 		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4343 	} else {
4344 		event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4345 	}
4346 	KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid());
4347 	if (!need_retry) {
4348 		KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid());
4349 	}
4350 	DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
4351 	if (kr == KERN_SUCCESS &&
4352 	    physpage_p != NULL) {
4353 		/* for vm_map_wire_and_extract() */
4354 		*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4355 		if (prot & VM_PROT_WRITE) {
4356 			vm_object_lock_assert_exclusive(m_object);
4357 			m->vmp_dirty = TRUE;
4358 		}
4359 	}
4360 
4361 	if (top_object != VM_OBJECT_NULL) {
4362 		/*
4363 		 * It's safe to drop the top object
4364 		 * now that we've done our
4365 		 * vm_fault_enter().  Any other fault
4366 		 * in progress for that virtual
4367 		 * address will either find our page
4368 		 * and translation or put in a new page
4369 		 * and translation.
4370 		 */
4371 		vm_object_unlock(top_object);
4372 		top_object = VM_OBJECT_NULL;
4373 	}
4374 
4375 	if (need_collapse == TRUE) {
4376 		vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
4377 	}
4378 
4379 	if (!need_retry &&
4380 	    (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
4381 		/*
4382 		 * evaluate access pattern and update state
4383 		 * vm_fault_deactivate_behind depends on the
4384 		 * state being up to date
4385 		 */
4386 		vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
4387 
4388 		vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
4389 	}
4390 	/*
4391 	 * That's it, clean up and return.
4392 	 */
4393 	if (m->vmp_busy) {
4394 		vm_object_lock_assert_exclusive(m_object);
4395 		vm_page_wakeup_done(m_object, m);
4396 	}
4397 
4398 	if (!need_retry && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
4399 		vm_object_paging_begin(m_object);
4400 
4401 		assert3p(*written_on_object, ==, VM_OBJECT_NULL);
4402 		*written_on_object = m_object;
4403 		*written_on_pager = m_object->pager;
4404 		*written_on_offset = m_object->paging_offset + m->vmp_offset;
4405 	}
4406 	vm_object_unlock(object);
4407 
4408 	vm_map_unlock_read(map);
4409 	if (real_map != map) {
4410 		vm_map_unlock(real_map);
4411 	}
4412 }
4413 
4414 static inline int
4415 vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
4416 {
4417 	if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
4418 		return DBG_COR_FAULT;
4419 	}
4420 	return type_of_fault;
4421 }
4422 
4423 uint64_t vm_fault_resilient_media_initiate = 0;
4424 uint64_t vm_fault_resilient_media_retry = 0;
4425 uint64_t vm_fault_resilient_media_proceed = 0;
4426 uint64_t vm_fault_resilient_media_release = 0;
4427 uint64_t vm_fault_resilient_media_abort1 = 0;
4428 uint64_t vm_fault_resilient_media_abort2 = 0;
4429 
4430 #if MACH_ASSERT
4431 int vm_fault_resilient_media_inject_error1_rate = 0;
4432 int vm_fault_resilient_media_inject_error1 = 0;
4433 int vm_fault_resilient_media_inject_error2_rate = 0;
4434 int vm_fault_resilient_media_inject_error2 = 0;
4435 int vm_fault_resilient_media_inject_error3_rate = 0;
4436 int vm_fault_resilient_media_inject_error3 = 0;
4437 #endif /* MACH_ASSERT */
4438 
4439 kern_return_t
4440 vm_fault_internal(
4441 	vm_map_t           map,
4442 	vm_map_offset_t    vaddr,
4443 	vm_prot_t          caller_prot,
4444 	vm_tag_t           wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
4445 	pmap_t             caller_pmap,
4446 	vm_map_offset_t    caller_pmap_addr,
4447 	ppnum_t            *physpage_p,
4448 	vm_object_fault_info_t fault_info)
4449 {
4450 	vm_map_version_t        version;        /* Map version for verification */
4451 	boolean_t               wired;          /* Should mapping be wired down? */
4452 	vm_object_t             object;         /* Top-level object */
4453 	vm_object_offset_t      offset;         /* Top-level offset */
4454 	vm_prot_t               prot;           /* Protection for mapping */
4455 	vm_object_t             old_copy_object; /* Saved copy object */
4456 	uint64_t                old_copy_version;
4457 	vm_page_t               result_page;    /* Result of vm_fault_page */
4458 	vm_page_t               top_page;       /* Placeholder page */
4459 	kern_return_t           kr;
4460 
4461 	vm_page_t               m;      /* Fast access to result_page */
4462 	kern_return_t           error_code;
4463 	vm_object_t             cur_object;
4464 	vm_object_t             m_object = NULL;
4465 	vm_object_offset_t      cur_offset;
4466 	vm_page_t               cur_m;
4467 	vm_object_t             new_object;
4468 	int                     type_of_fault;
4469 	pmap_t                  pmap;
4470 	wait_interrupt_t        interruptible_state;
4471 	vm_map_t                real_map = map;
4472 	vm_map_t                original_map = map;
4473 	bool                    object_locks_dropped = FALSE;
4474 	vm_prot_t               fault_type;
4475 	vm_prot_t               original_fault_type;
4476 	bool                    need_collapse = FALSE;
4477 	bool                    need_retry = false;
4478 	uint8_t                 object_lock_type = 0;
4479 	uint8_t                 cur_object_lock_type;
4480 	vm_object_t             top_object = VM_OBJECT_NULL;
4481 	vm_object_t             written_on_object = VM_OBJECT_NULL;
4482 	memory_object_t         written_on_pager = NULL;
4483 	vm_object_offset_t      written_on_offset = 0;
4484 	int                     throttle_delay;
4485 	int                     compressed_count_delta;
4486 	vm_grab_options_t       grab_options;
4487 	bool                    need_copy;
4488 	bool                    need_copy_on_read;
4489 	vm_map_offset_t         trace_vaddr;
4490 	vm_map_offset_t         trace_real_vaddr;
4491 	vm_map_size_t           fault_page_size;
4492 	vm_map_size_t           fault_page_mask;
4493 	int                     fault_page_shift;
4494 	vm_map_offset_t         fault_phys_offset;
4495 	vm_map_offset_t         real_vaddr;
4496 	bool                    resilient_media_retry = false;
4497 	bool                    resilient_media_ref_transfer = false;
4498 	vm_object_t             resilient_media_object = VM_OBJECT_NULL;
4499 	vm_object_offset_t      resilient_media_offset = (vm_object_offset_t)-1;
4500 	bool                    page_needs_data_sync = false;
4501 	/*
4502 	 * Was the VM object contended when vm_map_lookup_and_lock_object locked it?
4503 	 * If so, the zero fill path will drop the lock.
4504 	 * NB: Ideally we would always drop the lock rather than rely on
4505 	 * this heuristic, but vm_object_unlock currently takes > 30 cycles.
4506 	 */
4507 	bool                    object_is_contended = false;
4508 
4509 	vmlp_api_start(VM_FAULT_INTERNAL);
4510 
4511 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
4512 	/*
4513 	 * We may be faulting on a tagged address. Canonicalize it here so we have
4514 	 * a chance to find it in the VM map.
4515 	 */
4516 	if (current_task_has_sec_enabled()) {
4517 		vaddr = vm_memtag_canonicalize(map, vaddr);
4518 	}
4519 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
4520 
4521 	real_vaddr = vaddr;
4522 	trace_real_vaddr = vaddr;
4523 
4524 	/*
4525 	 * Some (kernel) submaps are marked with "should never fault", so that
4526 	 * guard pages in such submaps do not need to use fictitious
4527 	 * placeholders at all, while not causing ZFOD pages to be made
4528 	 * (which is the default behavior otherwise).
4529 	 *
4530 	 * We also want to capture the fault address easily so that the zone
4531 	 * allocator might present an enhanced panic log.
4532 	 */
4533 	if (map->never_faults) {
4534 		assert(map->pmap == kernel_pmap);
4535 		vmlp_api_end(VM_FAULT_INTERNAL, KERN_INVALID_ADDRESS);
4536 		return KERN_INVALID_ADDRESS;
4537 	}
4538 
4539 	if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
4540 		fault_phys_offset = (vm_map_offset_t)-1;
4541 		fault_page_size = VM_MAP_PAGE_SIZE(original_map);
4542 		fault_page_mask = VM_MAP_PAGE_MASK(original_map);
4543 		fault_page_shift = VM_MAP_PAGE_SHIFT(original_map);
4544 		if (fault_page_size < PAGE_SIZE) {
4545 			DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
4546 			vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
4547 		}
4548 	} else {
4549 		fault_phys_offset = 0;
4550 		fault_page_size = PAGE_SIZE;
4551 		fault_page_mask = PAGE_MASK;
4552 		fault_page_shift = PAGE_SHIFT;
4553 		vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
4554 	}
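	/*
	 * Illustrative numbers (assumed, not tied to any particular device):
	 * for a 4K submap on a 16K kernel page size,
	 *	fault_page_size  = 0x1000
	 *	fault_page_mask  = 0x0fff
	 *	fault_page_shift = 12
	 * and a fault at vaddr 0x100c230 is truncated to 0x100c000, while
	 * fault_phys_offset starts out as (vm_map_offset_t)-1 and is only
	 * resolved later in the fault path.  On native-page-size maps the
	 * address is simply truncated to a PAGE_SIZE boundary.
	 */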
4555 
4556 	if (map == kernel_map) {
4557 		trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
4558 		trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
4559 	} else {
4560 		trace_vaddr = vaddr;
4561 	}
4562 
4563 	KDBG_RELEASE(
4564 		(VMDBG_CODE(DBG_VM_FAULT_INTERNAL)) | DBG_FUNC_START,
4565 		((uint64_t)trace_vaddr >> 32),
4566 		trace_vaddr,
4567 		(map == kernel_map));
4568 
4569 	if (get_preemption_level() != 0) {
4570 		KDBG_RELEASE(
4571 			(VMDBG_CODE(DBG_VM_FAULT_INTERNAL)) | DBG_FUNC_END,
4572 			((uint64_t)trace_vaddr >> 32),
4573 			trace_vaddr,
4574 			KERN_FAILURE);
4575 
4576 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NONZERO_PREEMPTION_LEVEL), 0 /* arg */);
4577 		vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE);
4578 		return KERN_FAILURE;
4579 	}
4580 
4581 	thread_t cthread = current_thread();
4582 
4583 	if (cthread->th_vm_faults_disabled) {
4584 		KDBG_RELEASE(
4585 			(MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4586 			((uint64_t)trace_vaddr >> 32),
4587 			trace_vaddr,
4588 			KERN_FAILURE);
4589 		ktriage_record(thread_tid(cthread),
4590 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
4591 		    KDBG_TRIAGE_RESERVED,
4592 		    KDBG_TRIAGE_VM_FAULTS_DISABLED),
4593 		    0 /* arg */);
4594 		vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE);
4595 		return KERN_FAILURE;
4596 	}
4597 
4598 	bool     rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
4599 	bool     page_sleep_needed = false;
4600 	uint64_t fstart = 0;
4601 
4602 	if (rtfault) {
4603 		fstart = mach_continuous_time();
4604 	}
4605 
4606 	assert(fault_info != NULL);
4607 	interruptible_state = thread_interrupt_level(fault_info->interruptible);
4608 
4609 	fault_type = (fault_info->fi_change_wiring ? VM_PROT_NONE : caller_prot);
4610 
4611 	counter_inc(&vm_statistics_faults);
4612 	counter_inc(&current_task()->faults);
4613 	original_fault_type = fault_type;
4614 
4615 	need_copy = FALSE;
4616 	if (fault_type & VM_PROT_WRITE) {
4617 		need_copy = TRUE;
4618 	}
4619 
4620 	if (need_copy || fault_info->fi_change_wiring) {
4621 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4622 	} else {
4623 		object_lock_type = OBJECT_LOCK_SHARED;
4624 	}
4625 
4626 	cur_object_lock_type = OBJECT_LOCK_SHARED;
4627 
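	/*
	 * A kernel write fault landing inside the compressor submap's range is
	 * never expected; treat it as fatal rather than trying to satisfy it.
	 */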
4628 	if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4629 		if (compressor_map) {
4630 			if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4631 				panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
4632 			}
4633 		}
4634 	}
4635 RetryFault:
4636 	assert3p(written_on_object, ==, VM_OBJECT_NULL);
4637 
4638 	/*
4639 	 * assume we will hit a page in the cache;
4640 	 * otherwise, explicitly override with
4641 	 * the real fault type once we determine it
4642 	 */
4643 	type_of_fault = DBG_CACHE_HIT_FAULT;
4644 
4645 	/*
4646 	 *	Find the backing store object and offset into
4647 	 *	it to begin the search.
4648 	 */
4649 	fault_type = original_fault_type;
4650 	map = original_map;
4651 	vm_map_lock_read(map);
4652 
4653 	if (resilient_media_retry) {
4654 		/*
4655 		 * If we have to insert a fake zero-filled page to hide
4656 		 * a media failure to provide the real page, we need to
4657 		 * resolve any pending copy-on-write on this mapping.
4658 		 * VM_PROT_COPY tells vm_map_lookup_and_lock_object() to deal
4659 		 * with that even if this is not a "write" fault.
4660 		 */
4661 		need_copy = TRUE;
4662 		/*
4663 		 * If the top object is COPY_DELAYED and has a "copy" object,
4664 		 * we would have to push our zero-filled page to this copy
4665 		 * object before allowing it to be modified, so let's consider
4666 		 * this as a read-only fault for now.  If this was a write
4667 		 * fault, we'll fault again on the read-only zero-filled page
4668 		 * and fulfill our copy-on-write obligations then.
4669 		 */
4670 		fault_type = VM_PROT_READ;
4671 		/*
4672 		 * We need the object's exclusive lock to insert the
4673 		 * zero-filled page.
4674 		 */
4675 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4676 		vm_fault_resilient_media_retry++;
4677 	}
4678 
4679 	kr = vm_map_lookup_and_lock_object(&map, vaddr,
4680 	    (fault_type | (need_copy ? VM_PROT_COPY : 0)),
4681 	    object_lock_type, &version,
4682 	    &object, &offset, &prot, &wired,
4683 	    fault_info,
4684 	    &real_map,
4685 	    &object_is_contended);
4686 	object_is_contended = false; /* avoid unsafe optimization */
4687 
4688 	if (kr != KERN_SUCCESS) {
4689 		vm_map_unlock_read(map);
4690 		/*
4691 		 * This can be seen in a crash report if indeed the
4692 		 * thread is crashing due to an invalid access in a non-existent
4693 		 * range.
4694 		 * Turning this OFF for now because it is noisy and not always fatal,
4695 		 * e.g. prefaulting.
4696 		 *
4697 		 * if (kr == KERN_INVALID_ADDRESS) {
4698 		 *	ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0);
4699 		 * }
4700 		 */
4701 		goto done;
4702 	}
4703 
4704 	pmap = real_map->pmap;
4705 	fault_info->io_sync = FALSE;
4706 	fault_info->mark_zf_absent = FALSE;
4707 	fault_info->batch_pmap_op = FALSE;
4708 
4709 
4710 	if (resilient_media_retry) {
4711 		/*
4712 		 * We're retrying this fault after having detected a media
4713 		 * failure from a "resilient_media" mapping.
4714 		 * Check that the mapping is still pointing at the object
4715 		 * that just failed to provide a page.
4716 		 */
4717 		assert(resilient_media_object != VM_OBJECT_NULL);
4718 		assert(resilient_media_offset != (vm_object_offset_t)-1);
4719 		if ((object != VM_OBJECT_NULL &&
4720 		    object == resilient_media_object &&
4721 		    offset == resilient_media_offset &&
4722 		    fault_info->resilient_media)
4723 #if MACH_ASSERT
4724 		    && (vm_fault_resilient_media_inject_error1_rate == 0 ||
4725 		    (++vm_fault_resilient_media_inject_error1 % vm_fault_resilient_media_inject_error1_rate) != 0)
4726 #endif /* MACH_ASSERT */
4727 		    ) {
4728 			/*
4729 			 * This mapping still points at the same object
4730 			 * and is still "resilient_media": proceed in
4731 			 * "recovery-from-media-failure" mode, where we'll
4732 			 * insert a zero-filled page in the top object.
4733 			 */
4734 //                     printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4735 			vm_fault_resilient_media_proceed++;
4736 		} else {
4737 			/* not recovering: reset state and retry fault */
4738 //                     printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info->resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4739 			vm_object_unlock(object);
4740 			if (real_map != map) {
4741 				vm_map_unlock(real_map);
4742 			}
4743 			vm_map_unlock_read(map);
4744 			/* release our extra reference on failed object */
4745 //                     printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4746 			vm_object_deallocate(resilient_media_object);
4747 			resilient_media_object = VM_OBJECT_NULL;
4748 			resilient_media_offset = (vm_object_offset_t)-1;
4749 			resilient_media_retry = false;
4750 			vm_fault_resilient_media_abort1++;
4751 			goto RetryFault;
4752 		}
4753 	} else {
4754 		assert(resilient_media_object == VM_OBJECT_NULL);
4755 		resilient_media_offset = (vm_object_offset_t)-1;
4756 	}
4757 
4758 	/*
4759 	 * If the page is wired, we must fault for the current protection
4760 	 * value, to avoid further faults.
4761 	 */
4762 	if (wired) {
4763 		fault_type = prot | VM_PROT_WRITE;
4764 	}
4765 	if (wired || need_copy) {
4766 		/*
4767 		 * since we're treating this fault as a 'write'
4768 		 * we must hold the top object lock exclusively
4769 		 */
4770 		if (object_lock_type == OBJECT_LOCK_SHARED) {
4771 			object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4772 
4773 			if (vm_object_lock_upgrade(object) == FALSE) {
4774 				/*
4775 				 * couldn't upgrade, so explicitly
4776 				 * take the lock exclusively
4777 				 */
4778 				vm_object_lock(object);
4779 			}
4780 		}
4781 	}
4782 
4783 #if     VM_FAULT_CLASSIFY
4784 	/*
4785 	 *	Temporary data gathering code
4786 	 */
4787 	vm_fault_classify(object, offset, fault_type);
4788 #endif
4789 	/*
4790 	 *	Fast fault code.  The basic idea is to do as much as
4791 	 *	possible while holding the map lock and object locks.
4792 	 *      Busy pages are not used until the object lock has to
4793 	 *	be dropped to do something (copy, zero fill, pmap enter).
4794 	 *	Similarly, paging references aren't acquired until that
4795 	 *	point, and object references aren't used.
4796 	 *
4797 	 *	If we can figure out what to do
4798 	 *	(zero fill, copy on write, pmap enter) while holding
4799 	 *	the locks, then it gets done.  Otherwise, we give up,
4800 	 *	and use the original fault path (which doesn't hold
4801 	 *	the map lock, and relies on busy pages).
4802 	 *	The give up cases include:
4803 	 *              - Have to talk to pager.
4804 	 *		- Page is busy, absent or in error.
4805 	 *		- Pager has locked out desired access.
4806 	 *		- Fault needs to be restarted.
4807 	 *		- Have to push page into copy object.
4808 	 *
4809 	 *	The code is an infinite loop that moves one level down
4810 	 *	the shadow chain each time.  cur_object and cur_offset
4811 	 *      refer to the current object being examined. object and offset
4812 	 *	are the original object from the map.  The loop is at the
4813 	 *	top level if and only if object and cur_object are the same.
4814 	 *
4815 	 *	Invariants:  Map lock is held throughout.  Lock is held on
4816 	 *		original object and cur_object (if different) when
4817 	 *		continuing or exiting loop.
4818 	 *
4819 	 */
4820 
4821 #if defined(__arm64__)
4822 	/*
4823 	 * Fail if reading an execute-only page in a
4824 	 * pmap that enforces execute-only protection.
4825 	 */
4826 	if (fault_type == VM_PROT_READ &&
4827 	    (prot & VM_PROT_EXECUTE) &&
4828 	    !(prot & VM_PROT_READ) &&
4829 	    pmap_enforces_execute_only(pmap)) {
4830 		vm_object_unlock(object);
4831 		vm_map_unlock_read(map);
4832 		if (real_map != map) {
4833 			vm_map_unlock(real_map);
4834 		}
4835 		kr = KERN_PROTECTION_FAILURE;
4836 		goto done;
4837 	}
4838 #endif
4839 
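	/*
	 * fault_phys_offset is the object offset's remainder within the native
	 * PAGE_SIZE page; it is non-zero only for sub-page-size mappings.
	 */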
4840 	fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4841 
4842 	/*
4843 	 * If this page is to be inserted in a copy delay object
4844 	 * for writing, and if the object has a copy, then the
4845 	 * copy delay strategy is implemented in the slow fault path.
4846 	 */
4847 	if ((object->copy_strategy == MEMORY_OBJECT_COPY_DELAY ||
4848 	    object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK) &&
4849 	    object->vo_copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4850 		assert(!resilient_media_retry); /* should be read-only fault */
4851 		goto handle_copy_delay;
4852 	}
4853 
4854 	cur_object = object;
4855 	cur_offset = offset;
4856 
4857 	grab_options = vm_page_grab_options_for_object(object);
4858 #if HAS_MTE
4859 	if (!(grab_options & VM_PAGE_GRAB_MTE) &&
4860 	    mteinfo_vm_tag_can_use_tag_storage((vm_tag_t)fault_info->user_tag)) {
4861 		grab_options |= VM_PAGE_GRAB_ALLOW_TAG_STORAGE;
4862 	}
4863 #endif /* HAS_MTE */
4864 
4865 	while (TRUE) {
4866 		if (!cur_object->pager_created &&
4867 		    cur_object->phys_contiguous) { /* superpage */
4868 			break;
4869 		}
4870 
4871 		if (cur_object->blocked_access) {
4872 			/*
4873 			 * Access to this VM object has been blocked.
4874 			 * Let the slow path handle it.
4875 			 */
4876 			break;
4877 		}
4878 
4879 		m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
4880 		m_object = NULL;
4881 
4882 		if (m != VM_PAGE_NULL) {
4883 			m_object = cur_object;
4884 
4885 			if (__improbable(page_sleep_needed)) {
4886 				/*
4887 				 * If a prior iteration of the loop requested vm_page_sleep(), re-validate the page
4888 				 * to see if it's still needed.
4889 				 */
4890 				kr = vm_fault_pmap_validate_page(pmap, m, vaddr, prot, fault_info, &page_sleep_needed);
4891 				if (__improbable(kr != KERN_SUCCESS)) {
4892 					vm_object_unlock(object);
4893 					if (object != cur_object) {
4894 						vm_object_unlock(cur_object);
4895 					}
4896 					vm_map_unlock_read(map);
4897 					if (real_map != map) {
4898 						vm_map_unlock(real_map);
4899 					}
4900 					goto done;
4901 				}
4902 			}
4903 			if (m->vmp_busy || page_sleep_needed) {
4904 				page_sleep_needed = false;
4905 				wait_result_t   result;
4906 
4907 				/*
4908 				 * in order to vm_page_sleep(), we must
4909 				 * in order to vm_page_sleep(), we must have the
4910 				 * object that 'm' belongs to locked exclusively
4911 				if (object != cur_object) {
4912 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4913 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4914 
4915 						if (vm_object_lock_upgrade(cur_object) == FALSE) {
4916 							/*
4917 							 * couldn't upgrade so go do a full retry
4918 							 * immediately since we can no longer be
4919 							 * certain about cur_object (since we
4920 							 * don't hold a reference on it)...
4921 							 * first drop the top object lock
4922 							 */
4923 							vm_object_unlock(object);
4924 
4925 							vm_map_unlock_read(map);
4926 							if (real_map != map) {
4927 								vm_map_unlock(real_map);
4928 							}
4929 
4930 							goto RetryFault;
4931 						}
4932 					}
4933 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4934 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4935 
4936 					if (vm_object_lock_upgrade(object) == FALSE) {
4937 						/*
4938 						 * couldn't upgrade, so explicitly take the lock
4939 						 * exclusively and go relookup the page since we
4940 						 * will have dropped the object lock and
4941 						 * a different thread could have inserted
4942 						 * a page at this offset
4943 						 * no need for a full retry since we're
4944 						 * at the top level of the object chain
4945 						 */
4946 						vm_object_lock(object);
4947 
4948 						continue;
4949 					}
4950 				}
4951 				if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4952 					/*
4953 					 * m->vmp_busy == TRUE and the object is locked exclusively
4954 					 * if m->vmp_q_state is still VM_PAGE_ON_PAGEOUT_Q after we
4955 					 * acquire the queues lock, we are guaranteed that it is stable
4956 					 * on the pageout queue and therefore reclaimable
4957 					 *
4958 					 * NOTE: this is only true for the internal pageout queue
4959 					 * in the compressor world
4960 					 */
4961 					assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4962 
4963 					vm_page_lock_queues();
4964 
4965 					if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4966 						vm_pageout_throttle_up(m);
4967 						vm_page_unlock_queues();
4968 
4969 						vm_page_wakeup_done(m_object, m);
4970 						goto reclaimed_from_pageout;
4971 					}
4972 					vm_page_unlock_queues();
4973 				}
4974 				if (object != cur_object) {
4975 					vm_object_unlock(object);
4976 				}
4977 
4978 				vm_map_unlock_read(map);
4979 				if (real_map != map) {
4980 					vm_map_unlock(real_map);
4981 				}
4982 
4983 				result = vm_page_sleep(cur_object, m, fault_info->interruptible, LCK_SLEEP_UNLOCK);
4984 				if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
4985 					goto RetryFault;
4986 				}
4987 
4988 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), 0 /* arg */);
4989 				kr = KERN_ABORTED;
4990 				goto done;
4991 			}
4992 reclaimed_from_pageout:
4993 			if (m->vmp_laundry) {
4994 				if (object != cur_object) {
4995 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4996 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4997 
4998 						vm_object_unlock(object);
4999 						vm_object_unlock(cur_object);
5000 
5001 						vm_map_unlock_read(map);
5002 						if (real_map != map) {
5003 							vm_map_unlock(real_map);
5004 						}
5005 
5006 						goto RetryFault;
5007 					}
5008 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
5009 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5010 
5011 					if (vm_object_lock_upgrade(object) == FALSE) {
5012 						/*
5013 						 * couldn't upgrade, so explicitly take the lock
5014 						 * exclusively and go relookup the page since we
5015 						 * will have dropped the object lock and
5016 						 * a different thread could have inserted
5017 						 * a page at this offset
5018 						 * no need for a full retry since we're
5019 						 * at the top level of the object chain
5020 						 */
5021 						vm_object_lock(object);
5022 
5023 						continue;
5024 					}
5025 				}
5026 				vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
5027 				vm_pageout_steal_laundry(m, FALSE);
5028 			}
5029 
5030 
5031 			if (vm_page_is_guard(m)) {
5032 				/*
5033 				 * Guard page: let the slow path deal with it
5034 				 */
5035 				break;
5036 			}
5037 			if (m->vmp_unusual && (m->vmp_error || m->vmp_restart ||
5038 			    vm_page_is_private(m) || m->vmp_absent)) {
5039 				/*
5040 				 * Unusual case... let the slow path deal with it
5041 				 */
5042 				break;
5043 			}
5044 			if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
5045 				if (object != cur_object) {
5046 					vm_object_unlock(object);
5047 				}
5048 				vm_map_unlock_read(map);
5049 				if (real_map != map) {
5050 					vm_map_unlock(real_map);
5051 				}
5052 				vm_object_unlock(cur_object);
5053 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
5054 				kr = KERN_MEMORY_ERROR;
5055 				goto done;
5056 			}
5057 			assert(m_object == VM_PAGE_OBJECT(m));
5058 
5059 			if (vm_fault_cs_need_validation(map->pmap, m, m_object,
5060 			    PAGE_SIZE, 0) ||
5061 			    (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
5062 upgrade_lock_and_retry:
5063 				/*
5064 				 * We might need to validate this page
5065 				 * against its code signature, so we
5066 				 * want to hold the VM object exclusively.
5067 				 */
5068 				if (object != cur_object) {
5069 					if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5070 						vm_object_unlock(object);
5071 						vm_object_unlock(cur_object);
5072 
5073 						cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5074 
5075 						vm_map_unlock_read(map);
5076 						if (real_map != map) {
5077 							vm_map_unlock(real_map);
5078 						}
5079 
5080 						goto RetryFault;
5081 					}
5082 				} else if (object_lock_type == OBJECT_LOCK_SHARED) {
5083 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5084 
5085 					if (vm_object_lock_upgrade(object) == FALSE) {
5086 						/*
5087 						 * couldn't upgrade, so explicitly take the lock
5088 						 * exclusively and go relookup the page since we
5089 						 * will have dropped the object lock and
5090 						 * a different thread could have inserted
5091 						 * a page at this offset
5092 						 * no need for a full retry since we're
5093 						 * at the top level of the object chain
5094 						 */
5095 						vm_object_lock(object);
5096 
5097 						continue;
5098 					}
5099 				}
5100 			}
5101 			/*
5102 			 *	Two cases of map in faults:
5103 			 *	    - At top level w/o copy object.
5104 			 *	    - Read fault anywhere.
5105 			 *		--> must disallow write.
5106 			 */
5107 
5108 			if (object == cur_object && object->vo_copy == VM_OBJECT_NULL) {
5109 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5110 				if ((fault_type & VM_PROT_WRITE) && m->vmp_unmodified_ro) {
5111 					assert(cur_object == VM_PAGE_OBJECT(m));
5112 					assert(cur_object->internal);
5113 					vm_object_lock_assert_exclusive(cur_object);
5114 					vm_page_lockspin_queues();
5115 					m->vmp_unmodified_ro = false;
5116 					vm_page_unlock_queues();
5117 					os_atomic_dec(&compressor_ro_uncompressed, relaxed);
5118 					vm_object_compressor_pager_state_clr(cur_object, m->vmp_offset);
5119 				}
5120 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
5121 				goto FastPmapEnter;
5122 			}
5123 
5124 			if (!need_copy &&
5125 			    !fault_info->no_copy_on_read &&
5126 			    cur_object != object &&
5127 			    !cur_object->internal &&
5128 			    !cur_object->pager_trusted &&
5129 			    !cur_object->code_signed &&
5130 			    vm_protect_privileged_from_untrusted &&
5131 			    (current_proc_is_privileged() ||
5132 			    vm_kernel_map_is_kernel(map) ||
5133 			    vm_map_is_platform_binary(map))) {
5134 				/*
5135 				 * We're faulting on a page in "object" and
5136 				 * went down the shadow chain to "cur_object"
5137 				 * to find out that "cur_object"'s pager
5138 				 * is not "trusted", i.e. we cannot trust it
5139 				 * to always return the same contents.
5140 				 * Since the target is a "privileged" process,
5141 				 * let's treat this as a copy-on-read fault, as
5142 				 * if it was a copy-on-write fault.
5143 				 * Once "object" gets a copy of this page, it
5144 				 * won't have to rely on "cur_object" to
5145 				 * provide the contents again.
5146 				 *
5147 				 * This is done by setting "need_copy" and
5148 				 * retrying the fault from the top with the
5149 				 * appropriate locking.
5150 				 *
5151 				 * Special case: if the mapping is executable
5152 				 * and the untrusted object is code-signed and
5153 				 * the process is "cs_enforced", we do not
5154 				 * copy-on-read because that would break
5155 				 * code-signing enforcement expectations (an
5156 				 * executable page must belong to a code-signed
5157 				 * object) and we can rely on code-signing
5158 				 * to re-validate the page if it gets evicted
5159 				 * and paged back in.
5160 				 */
5161 //				printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5162 				vm_copied_on_read++;
5163 				if (!current_proc_is_privileged()) {
5164 					/* not a privileged proc but still copy-on-read... */
5165 					if (vm_kernel_map_is_kernel(map)) {
5166 						/* ... because target map is a kernel map */
5167 						vm_copied_on_read_kernel_map++;
5168 					} else {
5169 						/* ... because target map is "platform" */
5170 						vm_copied_on_read_platform_map++;
5171 					}
5172 				}
5173 				need_copy = TRUE;
5174 
5175 				vm_object_unlock(object);
5176 				vm_object_unlock(cur_object);
5177 				object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5178 				vm_map_unlock_read(map);
5179 				if (real_map != map) {
5180 					vm_map_unlock(real_map);
5181 				}
5182 				goto RetryFault;
5183 			}
5184 
5185 			if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
5186 				if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5187 					/*
5188 					 * For a protection that the pmap cares
5189 					 * about, we must hand over the full
5190 					 * set of protections (so that the pmap
5191 					 * layer can apply any desired policy).
5192 					 * This means that cs_bypass must be
5193 					 * set, as this can force us to pass
5194 					 * RWX.
5195 					 */
5196 					if (!fault_info->cs_bypass) {
5197 						panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
5198 						    __FUNCTION__, pmap,
5199 						    (uint64_t)vaddr, prot,
5200 						    fault_info->pmap_options);
5201 					}
5202 				} else {
5203 					prot &= ~VM_PROT_WRITE;
5204 				}
5205 
5206 				if (object != cur_object) {
5207 					/*
5208 					 * We still need to hold the top object
5209 					 * lock here to prevent a race between
5210 					 * a read fault (taking only "shared"
5211 					 * locks) and a write fault (taking
5212 					 * an "exclusive" lock on the top
5213 					 * object).
5214 					 * Otherwise, as soon as we release the
5215 					 * top lock, the write fault could
5216 					 * proceed and actually complete before
5217 					 * the read fault, and the copied page's
5218 					 * translation could then be overwritten
5219 					 * by the read fault's translation for
5220 					 * the original page.
5221 					 *
5222 					 * Let's just record what the top object
5223 					 * is and we'll release it later.
5224 					 */
5225 					top_object = object;
5226 
5227 					/*
5228 					 * switch to the object that has the new page
5229 					 */
5230 					object = cur_object;
5231 					object_lock_type = cur_object_lock_type;
5232 				}
5233 FastPmapEnter:
5234 				assert(m_object == VM_PAGE_OBJECT(m));
5235 
5236 				if (resilient_media_retry && (prot & VM_PROT_WRITE)) {
5237 					/*
5238 					 * We might have bypassed some copy-on-write
5239 					 * mechanism to get here (theoretically inserting
5240 					 * a zero-filled page in the top object to avoid
5241 					 * raising an exception on an unavailable page at
5242 					 * the bottom of the shadow chain.
5243 					 * the bottom of the shadow chain).
5244 					 * If write access is needed, the next fault should
5245 					 * handle any copy-on-write obligations.
5246 					 */
5247 					if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5248 						/*
5249 						 * For a protection that the pmap cares
5250 						 * about, we must hand over the full
5251 						 * set of protections (so that the pmap
5252 						 * layer can apply any desired policy).
5253 						 * This means that cs_bypass must be
5254 						 * set, as this can force us to pass
5255 						 * RWX.
5256 						 */
5257 						if (!fault_info->cs_bypass) {
5258 							panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
5259 							    __FUNCTION__, pmap,
5260 							    (uint64_t)vaddr, prot,
5261 							    fault_info->pmap_options);
5262 						}
5263 					} else {
5264 						prot &= ~VM_PROT_WRITE;
5265 					}
5266 				}
5267 
5268 				/*
5269 				 * prepare for the pmap_enter...
5270 				 * object and map are both locked
5271 				 * m contains valid data
5272 				 * object == m->vmp_object
5273 				 * cur_object == NULL or it's been unlocked
5274 				 * no paging references on either object or cur_object
5275 				 */
5276 
5277 				if (fault_page_size < PAGE_SIZE) {
5278 					DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
5279 					assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
5280 					    fault_phys_offset < PAGE_SIZE),
5281 					    "0x%llx\n", (uint64_t)fault_phys_offset);
5282 				} else {
5283 					assertf(fault_phys_offset == 0,
5284 					    "0x%llx\n", (uint64_t)fault_phys_offset);
5285 				}
5286 
5287 				if (__improbable(rtfault &&
5288 				    !m->vmp_realtime &&
5289 				    vm_pageout_protect_realtime)) {
5290 					vm_page_lock_queues();
5291 					if (!m->vmp_realtime) {
5292 						m->vmp_realtime = true;
5293 						VM_COUNTER_INC(&vm_page_realtime_count);
5294 					}
5295 					vm_page_unlock_queues();
5296 				}
5297 				assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p object=%p", m, m_object, object);
5298 				assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
5299 				need_retry = false;
5300 				if (caller_pmap) {
5301 					kr = vm_fault_enter(m,
5302 					    caller_pmap,
5303 					    caller_pmap_addr,
5304 					    fault_page_size,
5305 					    fault_phys_offset,
5306 					    prot,
5307 					    caller_prot,
5308 					    wired,
5309 					    wire_tag,
5310 					    fault_info,
5311 					    &need_retry,
5312 					    &type_of_fault,
5313 					    &object_lock_type,
5314 					    &page_sleep_needed);
5315 				} else {
5316 					kr = vm_fault_enter(m,
5317 					    pmap,
5318 					    vaddr,
5319 					    fault_page_size,
5320 					    fault_phys_offset,
5321 					    prot,
5322 					    caller_prot,
5323 					    wired,
5324 					    wire_tag,
5325 					    fault_info,
5326 					    &need_retry,
5327 					    &type_of_fault,
5328 					    &object_lock_type,
5329 					    &page_sleep_needed);
5330 				}
5331 
5332 				vm_fault_complete(
5333 					map,
5334 					real_map,
5335 					object,
5336 					m_object,
5337 					m,
5338 					offset,
5339 					trace_real_vaddr,
5340 					fault_info,
5341 					caller_prot,
5342 					real_vaddr,
5343 					vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
5344 					need_retry || page_sleep_needed,
5345 					kr,
5346 					physpage_p,
5347 					prot,
5348 					top_object,
5349 					need_collapse,
5350 					cur_offset,
5351 					fault_type,
5352 					&written_on_object,
5353 					&written_on_pager,
5354 					&written_on_offset);
5355 				top_object = VM_OBJECT_NULL;
5356 				if (need_retry) {
5357 					/*
5358 					 * vm_fault_enter couldn't complete the PMAP_ENTER...
5359 					 * at this point we don't hold any locks so it's safe
5360 					 * to ask the pmap layer to expand the page table to
5361 					 * accommodate this mapping... once expanded, we'll
5362 					 * re-drive the fault which should result in vm_fault_enter
5363 					 * being able to successfully enter the mapping this time around
5364 					 */
5365 					(void)pmap_enter_options(
5366 						pmap, vaddr, 0, 0, 0, 0, 0,
5367 						PMAP_OPTIONS_NOENTER, NULL, PMAP_MAPPING_TYPE_INFER);
5368 
5369 					need_retry = false;
5370 					goto RetryFault;
5371 				}
5372 				if (page_sleep_needed) {
5373 					goto RetryFault;
5374 				}
5375 				goto done;
5376 			}
5377 			/*
5378 			 * COPY ON WRITE FAULT
5379 			 */
5380 			assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
5381 
5382 			/*
5383 			 * If objects match, then
5384 			 * object->vo_copy must not be NULL (else control
5385 			 * would be in previous code block), and we
5386 			 * have a potential push into the copy object
5387 			 * which we can't cope with here.
5388 			 */
5389 			if (cur_object == object) {
5390 				/*
5391 				 * must take the slow path to
5392 				 * deal with the copy push
5393 				 */
5394 				break;
5395 			}
5396 
5397 			/*
5398 			 * This is now a shadow based copy on write
5399 			 * fault -- it requires a copy up the shadow
5400 			 * chain.
5401 			 */
5402 			assert(m_object == VM_PAGE_OBJECT(m));
5403 
5404 			if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
5405 			    vm_fault_cs_need_validation(NULL, m, m_object,
5406 			    PAGE_SIZE, 0)) {
5407 				goto upgrade_lock_and_retry;
5408 			}
5409 
5410 #if MACH_ASSERT
5411 			if (resilient_media_retry &&
5412 			    vm_fault_resilient_media_inject_error2_rate != 0 &&
5413 			    (++vm_fault_resilient_media_inject_error2 % vm_fault_resilient_media_inject_error2_rate) == 0) {
5414 				/* inject an error */
5415 				cur_m = m;
5416 				m = VM_PAGE_NULL;
5417 				m_object = VM_OBJECT_NULL;
5418 				break;
5419 			}
5420 #endif /* MACH_ASSERT */
5421 			/*
5422 			 * Allocate a page in the original top level
5423 			 * object. Give up if allocate fails.  Also
5424 			 * need to remember current page, as it's the
5425 			 * source of the copy.
5426 			 *
5427 			 * at this point we hold locks on both
5428 			 * object and cur_object... no need to take
5429 			 * paging refs or mark pages BUSY since
5430 			 * we don't drop either object lock until
5431 			 * the page has been copied and inserted
5432 			 */
5433 			cur_m = m;
5434 			m = vm_page_grab_options(grab_options);
5435 			m_object = NULL;
5436 
5437 			if (m == VM_PAGE_NULL) {
5438 				/*
5439 				 * no free page currently available...
5440 				 * must take the slow path
5441 				 */
5442 				break;
5443 			}
5444 
5445 			/*
5446 			 * Now do the copy.  Mark the source page busy...
5447 			 *
5448 			 *	NOTE: This code holds the map lock across
5449 			 *	the page copy.
5450 			 */
5451 			vm_page_copy(cur_m, m);
5452 			vm_page_insert(m, object, vm_object_trunc_page(offset));
5453 			if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
5454 				DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5455 			}
5456 			m_object = object;
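			/*
			 * Mark the copy dirty: its contents differ from any
			 * backing store and must not be discarded.
			 */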
5457 			SET_PAGE_DIRTY(m, FALSE);
5458 
5459 			/*
5460 			 * Now cope with the source page and object
5461 			 */
5462 			if (os_ref_get_count_raw(&object->ref_count) > 1 &&
5463 			    cur_m->vmp_pmapped) {
5464 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
5465 			} else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
5466 				/*
5467 				 * We've copied the full 16K page but we're
5468 				 * about to call vm_fault_enter() only for
5469 				 * the 4K chunk we're faulting on.  The other
5470 				 * three 4K chunks in that page could still
5471 				 * be pmapped in this pmap.
5472 				 * Since the VM object layer thinks that the
5473 				 * entire page has been dealt with and the
5474 				 * original page might no longer be needed,
5475 				 * it might collapse/bypass the original VM
5476 				 * object and free its pages, which would be
5477 				 * bad (and would trigger pmap_verify_free()
5478 				 * assertions) if the other 4K chunks are still
5479 				 * pmapped.
5480 				 */
5481 				/*
5482 				 * XXX FBDP TODO4K: to be revisited
5483 				 * Technically, we need to pmap_disconnect()
5484 				 * only the target pmap's mappings for the 4K
5485 				 * chunks of this 16K VM page.  If other pmaps
5486 				 * have PTEs on these chunks, that means that
5487 				 * the associated VM map must have a reference
5488 				 * on the VM object, so no need to worry about
5489 				 * those.
5490 				 * pmap_protect() for each 4K chunk would be
5491 				 * better but we'd have to check which chunks
5492 				 * are actually mapped before and after this
5493 				 * one.
5494 				 * A full-blown pmap_disconnect() is easier
5495 				 * for now but not efficient.
5496 				 */
5497 				DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
5498 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
5499 			}
5500 
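			/*
			 * The source page arrived as part of a speculative cluster:
			 * account for it as a pagein and update the sequential-access
			 * heuristics for this object.
			 */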
5501 			if (cur_m->vmp_clustered) {
5502 				VM_PAGE_COUNT_AS_PAGEIN(cur_m);
5503 				VM_PAGE_CONSUME_CLUSTERED(cur_m);
5504 				vm_fault_is_sequential(cur_object, cur_offset, fault_info->behavior);
5505 			}
5506 			need_collapse = TRUE;
5507 
5508 			if (!cur_object->internal &&
5509 			    cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
5510 				/*
5511 				 * The object from which we've just
5512 				 * copied a page is most probably backed
5513 				 * by a vnode.  We don't want to waste too
5514 				 * much time trying to collapse the VM objects
5515 				 * and create a bottleneck when several tasks
5516 				 * map the same file.
5517 				 */
5518 				if (cur_object->vo_copy == object) {
5519 					/*
5520 					 * Shared mapping or no COW yet.
5521 					 * We can never collapse a copy
5522 					 * object into its backing object.
5523 					 */
5524 					need_collapse = FALSE;
5525 				} else if (cur_object->vo_copy == object->shadow &&
5526 				    object->shadow->resident_page_count == 0) {
5527 					/*
5528 					 * Shared mapping after a COW occurred.
5529 					 */
5530 					need_collapse = FALSE;
5531 				}
5532 			}
5533 			vm_object_unlock(cur_object);
5534 
5535 			if (need_collapse == FALSE) {
5536 				vm_fault_collapse_skipped++;
5537 			}
5538 			vm_fault_collapse_total++;
5539 
5540 			type_of_fault = DBG_COW_FAULT;
5541 			counter_inc(&vm_statistics_cow_faults);
5542 			DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
5543 			counter_inc(&current_task()->cow_faults);
5544 
5545 			goto FastPmapEnter;
5546 		} else {
5547 			/*
5548 			 * No page at cur_object, cur_offset... m == NULL
5549 			 */
5550 			if (cur_object->pager_created) {
5551 				vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
5552 
5553 				if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
5554 					int             my_fault_type;
5555 					vm_compressor_options_t         c_flags = C_DONT_BLOCK;
5556 					bool            insert_cur_object = FALSE;
5557 
5558 					/*
5559 					 * May have to talk to a pager...
5560 					 * if so, take the slow path by
5561 					 * doing a 'break' from the while (TRUE) loop
5562 					 *
5563 					 * compressor_external_state will only be set to VM_EXTERNAL_STATE_EXISTS
5564 					 * if the compressor is active and the page exists there
5565 					 */
5566 					if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
5567 						break;
5568 					}
5569 
5570 					if (map == kernel_map || real_map == kernel_map) {
5571 						/*
5572 						 * can't call into the compressor with the kernel_map
5573 						 * lock held, since the compressor may try to operate
5574 						 * on the kernel map in order to return an empty c_segment
5575 						 */
5576 						break;
5577 					}
5578 					if (object != cur_object) {
5579 						if (fault_type & VM_PROT_WRITE) {
5580 							c_flags |= C_KEEP;
5581 						} else {
5582 							insert_cur_object = TRUE;
5583 						}
5584 					}
5585 					if (insert_cur_object == TRUE) {
5586 						if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5587 							cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5588 
5589 							if (vm_object_lock_upgrade(cur_object) == FALSE) {
5590 								/*
5591 								 * couldn't upgrade so go do a full retry
5592 								 * immediately since we can no longer be
5593 								 * certain about cur_object (since we
5594 								 * don't hold a reference on it)...
5595 								 * first drop the top object lock
5596 								 */
5597 								vm_object_unlock(object);
5598 
5599 								vm_map_unlock_read(map);
5600 								if (real_map != map) {
5601 									vm_map_unlock(real_map);
5602 								}
5603 
5604 								goto RetryFault;
5605 							}
5606 						}
5607 					} else if (object_lock_type == OBJECT_LOCK_SHARED) {
5608 						object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5609 
5610 						if (object != cur_object) {
5611 							/*
5612 							 * we can't go for the upgrade on the top
5613 							 * lock since the upgrade may block waiting
5614 							 * for readers to drain... since we hold
5615 							 * cur_object locked at this point, waiting
5616 							 * for the readers to drain would represent
5617 							 * a lock order inversion since the lock order
5618 							 * for objects is the reference order in the
5619 							 * shadow chain
5620 							 */
5621 							vm_object_unlock(object);
5622 							vm_object_unlock(cur_object);
5623 
5624 							vm_map_unlock_read(map);
5625 							if (real_map != map) {
5626 								vm_map_unlock(real_map);
5627 							}
5628 
5629 							goto RetryFault;
5630 						}
5631 						if (vm_object_lock_upgrade(object) == FALSE) {
5632 							/*
5633 							 * couldn't upgrade, so explicitly take the lock
5634 							 * exclusively and go relookup the page since we
5635 							 * will have dropped the object lock and
5636 							 * a different thread could have inserted
5637 							 * a page at this offset
5638 							 * no need for a full retry since we're
5639 							 * at the top level of the object chain
5640 							 */
5641 							vm_object_lock(object);
5642 
5643 							continue;
5644 						}
5645 					}
5646 
5647 #if HAS_MTE
5648 					if (vm_object_is_mte_mappable(object)) {
5649 						c_flags |= C_MTE;
5650 					}
5651 #endif /* HAS_MTE */
5652 					m = vm_page_grab_options(grab_options);
5653 					m_object = NULL;
5654 
5655 					if (m == VM_PAGE_NULL) {
5656 						/*
5657 						 * no free page currently available...
5658 						 * must take the slow path
5659 						 */
5660 						break;
5661 					}
5662 
5663 					/*
5664 					 * The object is and remains locked
5665 					 * so no need to take a
5666 					 * "paging_in_progress" reference.
5667 					 */
5668 					bool      shared_lock;
5669 					if ((object == cur_object &&
5670 					    object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
5671 					    (object != cur_object &&
5672 					    cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
5673 						shared_lock = FALSE;
5674 					} else {
5675 						shared_lock = TRUE;
5676 					}
5677 
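					/*
					 * Have the compressor pager decompress the data for
					 * this offset directly into the newly grabbed
					 * physical page.
					 */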
5678 					kr = vm_compressor_pager_get(
5679 						cur_object->pager,
5680 						(vm_object_trunc_page(cur_offset)
5681 						+ cur_object->paging_offset),
5682 						VM_PAGE_GET_PHYS_PAGE(m),
5683 						&my_fault_type,
5684 						c_flags,
5685 						&compressed_count_delta);
5686 
5687 					vm_compressor_pager_count(
5688 						cur_object->pager,
5689 						compressed_count_delta,
5690 						shared_lock,
5691 						cur_object);
5692 
5693 					if (kr != KERN_SUCCESS) {
5694 						vm_page_release(m,
5695 						    VMP_RELEASE_NONE);
5696 						m = VM_PAGE_NULL;
5697 					}
5698 					/*
5699 					 * If vm_compressor_pager_get() returns
5700 					 * KERN_MEMORY_FAILURE, then the
5701 					 * compressed data is permanently lost,
5702 					 * so return this error immediately.
5703 					 */
5704 					if (kr == KERN_MEMORY_FAILURE) {
5705 						if (object != cur_object) {
5706 							vm_object_unlock(cur_object);
5707 						}
5708 						vm_object_unlock(object);
5709 						vm_map_unlock_read(map);
5710 						if (real_map != map) {
5711 							vm_map_unlock(real_map);
5712 						}
5713 
5714 						goto done;
5715 					} else if (kr != KERN_SUCCESS) {
5716 						break;
5717 					}
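					/*
					 * The page now holds decompressed data that must not
					 * be lost, so mark it dirty.
					 */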
5718 					m->vmp_dirty = TRUE;
5719 #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5720 					if ((fault_type & VM_PROT_WRITE) == 0) {
5721 						prot &= ~VM_PROT_WRITE;
5722 						/*
5723 						 * The page, m, has yet to be inserted
5724 						 * into an object. So we are fine with
5725 						 * the object/cur_object lock being held
5726 						 * shared.
5727 						 */
5728 						vm_page_lockspin_queues();
5729 						m->vmp_unmodified_ro = true;
5730 						vm_page_unlock_queues();
5731 						os_atomic_inc(&compressor_ro_uncompressed, relaxed);
5732 					}
5733 #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
5734 
5735 					/*
5736 					 * If the object is purgeable, its
5737 					 * owner's purgeable ledgers will be
5738 					 * updated in vm_page_insert() but the
5739 					 * page was also accounted for in a
5740 					 * "compressed purgeable" ledger, so
5741 					 * update that now.
5742 					 */
5743 					if (object != cur_object &&
5744 					    !insert_cur_object) {
5745 						/*
5746 						 * We're not going to insert
5747 						 * the decompressed page into
5748 						 * the object it came from.
5749 						 *
5750 						 * We're dealing with a
5751 						 * copy-on-write fault on
5752 						 * "object".
5753 						 * We're going to decompress
5754 						 * the page directly into the
5755 						 * target "object" while
5756 						 * keeping the compressed
5757 						 * page for "cur_object", so
5758 						 * no ledger update in that
5759 						 * case.
5760 						 */
5761 					} else if (((cur_object->purgable ==
5762 					    VM_PURGABLE_DENY) &&
5763 					    (!cur_object->vo_ledger_tag)) ||
5764 					    (cur_object->vo_owner ==
5765 					    NULL)) {
5766 						/*
5767 						 * "cur_object" is not purgeable
5768 						 * and is not ledger-tagged, or
5769 						 * there's no owner for it,
5770 						 * so no owner's ledgers to
5771 						 * update.
5772 						 */
5773 					} else {
5774 						/*
5775 						 * One less compressed
5776 						 * purgeable/tagged page for
5777 						 * cur_object's owner.
5778 						 */
5779 						if (compressed_count_delta) {
5780 							vm_object_owner_compressed_update(
5781 								cur_object,
5782 								-1);
5783 						}
5784 					}
5785 
5786 					if (insert_cur_object) {
5787 						vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
5788 						m_object = cur_object;
5789 					} else {
5790 						vm_page_insert(m, object, vm_object_trunc_page(offset));
5791 						m_object = object;
5792 					}
5793 
5794 					if (!HAS_DEFAULT_CACHEABILITY(m_object->wimg_bits & VM_WIMG_MASK)) {
5795 						/*
5796 						 * If the page is not cacheable,
5797 						 * we can't let its contents
5798 						 * linger in the data cache
5799 						 * after the decompression.
5800 						 */
5801 						pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
5802 					}
5803 
5804 					type_of_fault = my_fault_type;
5805 
5806 					VM_STAT_DECOMPRESSIONS();
5807 
5808 					if (cur_object != object) {
5809 						if (insert_cur_object) {
5810 							top_object = object;
5811 							/*
5812 							 * switch to the object that has the new page
5813 							 */
5814 							object = cur_object;
5815 							object_lock_type = cur_object_lock_type;
5816 						} else {
5817 							vm_object_unlock(cur_object);
5818 							cur_object = object;
5819 						}
5820 					}
5821 					goto FastPmapEnter;
5822 				}
5823 				/*
5824 				 * existence map present and indicates
5825 				 * that the pager doesn't have this page
5826 				 */
5827 			}
5828 			if (cur_object->shadow == VM_OBJECT_NULL ||
5829 			    resilient_media_retry) {
5830 				/*
5831 				 * Zero fill fault.  Page gets
5832 				 * inserted into the original object.
5833 				 */
5834 				if (cur_object->shadow_severed ||
5835 				    VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
5836 				    cur_object == compressor_object ||
5837 				    is_kernel_object(cur_object)) {
5838 					if (object != cur_object) {
5839 						vm_object_unlock(cur_object);
5840 					}
5841 					vm_object_unlock(object);
5842 
5843 					vm_map_unlock_read(map);
5844 					if (real_map != map) {
5845 						vm_map_unlock(real_map);
5846 					}
5847 					if (VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) {
5848 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), 0 /* arg */);
5849 					}
5850 
5851 					if (cur_object->shadow_severed) {
5852 						ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), 0 /* arg */);
5853 					}
5854 
5855 					kr = KERN_MEMORY_ERROR;
5856 					goto done;
5857 				}
5858 				if (cur_object != object) {
5859 					vm_object_unlock(cur_object);
5860 
5861 					cur_object = object;
5862 				}
5863 				if (object_lock_type == OBJECT_LOCK_SHARED) {
5864 					object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5865 
5866 					if (vm_object_lock_upgrade(object) == FALSE) {
5867 						/*
5868 						 * couldn't upgrade so do a full retry on the fault
5869 						 * since we dropped the object lock which
5870 						 * could allow another thread to insert
5871 						 * a page at this offset
5872 						 */
5873 						vm_map_unlock_read(map);
5874 						if (real_map != map) {
5875 							vm_map_unlock(real_map);
5876 						}
5877 
5878 						goto RetryFault;
5879 					}
5880 				}
5881 				if (!object->internal) {
5882 					panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5883 				}
5884 #if MACH_ASSERT
5885 				if (resilient_media_retry &&
5886 				    vm_fault_resilient_media_inject_error3_rate != 0 &&
5887 				    (++vm_fault_resilient_media_inject_error3 % vm_fault_resilient_media_inject_error3_rate) == 0) {
5888 					/* inject an error */
5889 					m_object = NULL;
5890 					break;
5891 				}
5892 #endif /* MACH_ASSERT */
5893 
5894 				m = vm_page_grab_options(grab_options);
5895 				m_object = NULL;
5896 
5897 				if (m == VM_PAGE_NULL) {
5898 					/*
5899 					 * no free page currently available...
5900 					 * must take the slow path
5901 					 */
5902 					break;
5903 				}
5904 				m_object = object;
5905 				vm_page_insert(m, m_object, vm_object_trunc_page(offset));
5906 
5907 				if ((prot & VM_PROT_WRITE) &&
5908 				    !(fault_type & VM_PROT_WRITE) &&
5909 				    object->vo_copy != VM_OBJECT_NULL) {
5910 					/*
5911 					 * This is not a write fault and
5912 					 * we might have a copy-on-write
5913 					 * obligation to honor (copy object or
5914 					 * "needs_copy" map entry), so do not
5915 					 * give write access yet.
5916 					 * We'll need to catch the first write
5917 					 * to resolve the copy-on-write by
5918 					 * pushing this page to a copy object
5919 					 * or making a shadow object.
5920 					 */
5921 					if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5922 						/*
5923 						 * This pmap enforces extra
5924 						 * constraints for this set of
5925 						 * protections, so we can't
5926 						 * change the protections.
5927 						 * We would expect code-signing
5928 						 * to be bypassed in this case.
5929 						 */
5930 						if (!fault_info->cs_bypass) {
5931 							panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
5932 							    __FUNCTION__,
5933 							    pmap,
5934 							    (uint64_t)vaddr,
5935 							    prot,
5936 							    fault_info->pmap_options);
5937 						}
5938 					} else {
5939 						prot &= ~VM_PROT_WRITE;
5940 					}
5941 				}
5942 				if (resilient_media_retry) {
5943 					/*
5944 					 * Not a real write, so no reason to assert.
5945 					 * We've just allocated a new page for this
5946 					 * <object,offset> so we know nobody has any
5947 					 * PTE pointing at any previous version of this
5948 					 * page and no copy-on-write is involved here.
5949 					 * We're just inserting a page of zeroes at this
5950 					 * stage of the shadow chain because the pager
5951 					 * for the lowest object in the shadow chain
5952 					 * said it could not provide that page and we
5953 					 * want to avoid failing the fault and causing
5954 					 * a crash on this "resilient_media" mapping.
5955 					 */
5956 				} else {
5957 					assertf(!((fault_type & VM_PROT_WRITE) && object->vo_copy),
5958 					    "map %p va 0x%llx wrong path for write fault (fault_type 0x%x) on object %p with copy %p\n",
5959 					    map, (uint64_t)vaddr, fault_type, object, object->vo_copy);
5960 				}
5961 
5962 				vm_object_t saved_copy_object;
5963 				uint64_t saved_copy_version;
5964 				saved_copy_object = object->vo_copy;
5965 				saved_copy_version = object->vo_copy_version;
5966 
5967 				/*
5968 				 * Zeroing the page and entering it into the pmap
5969 				 * represents a significant amount of the zero fill fault handler's work.
5970 				 *
5971 				 * To improve fault scalability, we'll drop the object lock, if it appears contended,
5972 				 * now that we've inserted the page into the vm object.
5973 				 * Before dropping the lock, we need to check protection bits and set the
5974 				 * mapped bits on the page. Then we can mark the page busy, drop the lock,
5975 				 * zero it, and do the pmap enter. We'll need to reacquire the lock
5976 				 * to clear the busy bit and wake up any waiters.
5977 				 */
5978 				vm_fault_cs_clear(m);
5979 				m->vmp_pmapped = TRUE;
5980 				if (map->no_zero_fill) {
5981 					type_of_fault = DBG_NZF_PAGE_FAULT;
5982 				} else {
5983 					type_of_fault = DBG_ZERO_FILL_FAULT;
5984 				}
5985 				{
5986 					pmap_t destination_pmap;
5987 					vm_map_offset_t destination_pmap_vaddr;
5988 					vm_prot_t enter_fault_type;
5989 					if (caller_pmap) {
5990 						destination_pmap = caller_pmap;
5991 						destination_pmap_vaddr = caller_pmap_addr;
5992 					} else {
5993 						destination_pmap = pmap;
5994 						destination_pmap_vaddr = vaddr;
5995 					}
5996 					if (fault_info->fi_change_wiring) {
5997 						enter_fault_type = VM_PROT_NONE;
5998 					} else {
5999 						enter_fault_type = caller_prot;
6000 					}
6001 					assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
6002 					kr = vm_fault_enter_prepare(m,
6003 					    destination_pmap,
6004 					    destination_pmap_vaddr,
6005 					    &prot,
6006 					    caller_prot,
6007 					    fault_page_size,
6008 					    fault_phys_offset,
6009 					    enter_fault_type,
6010 					    fault_info,
6011 					    &type_of_fault,
6012 					    &page_needs_data_sync,
6013 					    &page_sleep_needed);
6014 
6015 					assert(!page_sleep_needed);
6016 					if (kr != KERN_SUCCESS) {
6017 						goto zero_fill_cleanup;
6018 					}
6019 
6020 					if (object_is_contended) {
6021 						/*
6022 						 * At this point the page is in the vm object, but not on a paging queue.
6023 						 * Since it's accessible to another thread but its contents are invalid
6024 						 * (it hasn't been zeroed) mark it busy before dropping the object lock.
6025 						 */
6026 						m->vmp_busy = TRUE;
6027 						vm_object_paging_begin(object); /* keep object alive */
6028 						vm_object_unlock(object);
6029 					}
6030 					if (type_of_fault == DBG_ZERO_FILL_FAULT) {
6031 						/*
6032 						 * Now zero fill page...
6033 						 * the page is probably going to
6034 						 * be written soon, so don't bother
6035 						 * to clear the modified bit
6036 						 *
6037 						 *   NOTE: This code holds the map
6038 						 *   lock across the zero fill.
6039 						 */
6040 						vm_page_zero_fill(
6041 							m
6042 #if HAS_MTE
6043 							, true /* zero_tags */
6044 #endif /* HAS_MTE */
6045 							);
6046 						counter_inc(&vm_statistics_zero_fill_count);
6047 						DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
6048 					}
6049 
6050 					if (object_is_contended) {
6051 						/*
6052 						 * It's not safe to do the pmap_enter() without holding
6053 						 * the object lock because its "vo_copy" could change.
6054 						 */
6055 						object_is_contended = false; /* get out of that code path */
6056 
6057 						vm_object_lock(object);
6058 						vm_object_paging_end(object);
6059 						if (object->vo_copy != saved_copy_object ||
6060 						    object->vo_copy_version != saved_copy_version) {
6061 							/*
6062 							 * The COPY_DELAY copy-on-write situation for
6063 							 * this VM object has changed while it was
6064 							 * unlocked, so do not grant write access to
6065 							 * this page.
6066 							 * The write access will fault again and we'll
6067 							 * resolve the copy-on-write then.
6068 							 */
6069 							if (pmap_has_prot_policy(pmap,
6070 							    fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE,
6071 							    prot)) {
6072 								/* we should not do CoW on pmap_has_prot_policy mappings */
6073 								panic("%s: map %p va 0x%llx obj %p,%llu saved %p,%llu: unexpected CoW",
6074 								    __FUNCTION__,
6075 								    map, (uint64_t)vaddr,
6076 								    object, object->vo_copy_version,
6077 								    saved_copy_object, saved_copy_version);
6078 							} else {
6079 								/* the pmap layer is OK with changing the PTE's prot */
6080 								prot &= ~VM_PROT_WRITE;
6081 							}
6082 						}
6083 					}
6084 
6085 					if (page_needs_data_sync) {
6086 						pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
6087 					}
6088 
6089 					if (fault_info->fi_xnu_user_debug &&
6090 					    !object->code_signed) {
6091 						fault_info->pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
6092 					}
6093 					if (object_is_contended) {
6094 						panic("object_is_contended");
6095 						kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
6096 						    fault_page_size, fault_phys_offset,
6097 						    m, &prot, caller_prot, enter_fault_type, wired,
6098 						    fault_info->pmap_options, &need_retry);
6099 						vm_object_lock(object);
6100 						assertf(!((prot & VM_PROT_WRITE) && object->vo_copy),
6101 						    "prot 0x%x object %p copy %p\n",
6102 						    prot, object, object->vo_copy);
6103 					} else {
6104 						need_retry = false;
6105 						kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
6106 						    fault_page_size, fault_phys_offset,
6107 						    m, &prot, caller_prot, enter_fault_type, wired,
6108 						    fault_info->pmap_options, &need_retry, &object_lock_type);
6109 					}
6110 				}
6111 zero_fill_cleanup:
6112 				if (!VM_DYNAMIC_PAGING_ENABLED() &&
6113 				    (object->purgable == VM_PURGABLE_DENY ||
6114 				    object->purgable == VM_PURGABLE_NONVOLATILE ||
6115 				    object->purgable == VM_PURGABLE_VOLATILE)) {
6116 					vm_page_lockspin_queues();
6117 					if (!VM_DYNAMIC_PAGING_ENABLED()) {
6118 						vm_fault_enqueue_throttled_locked(m);
6119 					}
6120 					vm_page_unlock_queues();
6121 				}
6122 				vm_fault_enqueue_page(object, m, wired, fault_info->fi_change_wiring, wire_tag, fault_info->no_cache, &type_of_fault, kr);
6123 
6124 				if (__improbable(rtfault &&
6125 				    !m->vmp_realtime &&
6126 				    vm_pageout_protect_realtime)) {
6127 					vm_page_lock_queues();
6128 					if (!m->vmp_realtime) {
6129 						m->vmp_realtime = true;
6130 						VM_COUNTER_INC(&vm_page_realtime_count);
6131 					}
6132 					vm_page_unlock_queues();
6133 				}
6134 				vm_fault_complete(
6135 					map,
6136 					real_map,
6137 					object,
6138 					m_object,
6139 					m,
6140 					offset,
6141 					trace_real_vaddr,
6142 					fault_info,
6143 					caller_prot,
6144 					real_vaddr,
6145 					type_of_fault,
6146 					need_retry,
6147 					kr,
6148 					physpage_p,
6149 					prot,
6150 					top_object,
6151 					need_collapse,
6152 					cur_offset,
6153 					fault_type,
6154 					&written_on_object,
6155 					&written_on_pager,
6156 					&written_on_offset);
6157 				top_object = VM_OBJECT_NULL;
6158 				if (need_retry) {
6159 					/*
6160 					 * vm_fault_enter couldn't complete the PMAP_ENTER...
6161 					 * at this point we don't hold any locks so it's safe
6162 					 * to ask the pmap layer to expand the page table to
6163 					 * accommodate this mapping... once expanded, we'll
6164 					 * re-drive the fault which should result in vm_fault_enter
6165 					 * being able to successfully enter the mapping this time around
6166 					 */
6167 					(void)pmap_enter_options(
6168 						pmap, vaddr, 0, 0, 0, 0, 0,
6169 						PMAP_OPTIONS_NOENTER, NULL, PMAP_MAPPING_TYPE_INFER);
6170 
6171 					need_retry = FALSE;
6172 					goto RetryFault;
6173 				}
6174 				goto done;
6175 			}
6176 			/*
6177 			 * On to the next level in the shadow chain
6178 			 */
6179 			cur_offset += cur_object->vo_shadow_offset;
6180 			new_object = cur_object->shadow;
6181 			fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
6182 
6183 			/*
6184 			 * take the new_object's lock with the indicated state
6185 			 */
6186 			if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
6187 				vm_object_lock_shared(new_object);
6188 			} else {
6189 				vm_object_lock(new_object);
6190 			}
6191 
6192 			if (cur_object != object) {
6193 				vm_object_unlock(cur_object);
6194 			}
6195 
6196 			cur_object = new_object;
6197 
6198 			continue;
6199 		}
6200 	}
6201 	/*
6202 	 * Cleanup from fast fault failure.  Drop any object
6203 	 * lock other than original and drop map lock.
6204 	 */
6205 	if (object != cur_object) {
6206 		vm_object_unlock(cur_object);
6207 	}
6208 
6209 	/*
6210 	 * must own the object lock exclusively at this point
6211 	 */
6212 	if (object_lock_type == OBJECT_LOCK_SHARED) {
6213 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
6214 
6215 		if (vm_object_lock_upgrade(object) == FALSE) {
6216 			/*
6217 			 * couldn't upgrade, so explicitly
6218 			 * take the lock exclusively
6219 			 * no need to retry the fault at this
6220 			 * point since "vm_fault_page" will
6221 			 * completely re-evaluate the state
6222 			 */
6223 			vm_object_lock(object);
6224 		}
6225 	}
6226 
6227 handle_copy_delay:
6228 	vm_map_unlock_read(map);
6229 	if (real_map != map) {
6230 		vm_map_unlock(real_map);
6231 	}
6232 
6233 	if (__improbable(object == compressor_object ||
6234 	    is_kernel_object(object))) {
6235 		/*
6236 		 * These objects are explicitly managed and populated by the
6237 		 * kernel.  The virtual ranges backed by these objects should
6238 		 * either have wired pages or "holes" that are not supposed to
6239 		 * be accessed at all until they get explicitly populated.
6240 		 * We should never have to resolve a fault on a mapping backed
6241 		 * by one of these VM objects and providing a zero-filled page
6242 		 * would be wrong here, so let's fail the fault and let the
6243 		 * caller crash or recover.
6244 		 */
6245 		vm_object_unlock(object);
6246 		kr = KERN_MEMORY_ERROR;
6247 		goto done;
6248 	}
6249 
6250 	resilient_media_ref_transfer = false;
6251 	if (resilient_media_retry) {
6252 		/*
6253 		 * We could get here if we failed to get a free page
6254 		 * to zero-fill and had to take the slow path again.
6255 		 * Reset our "recovery-from-failed-media" state.
6256 		 */
6257 		assert(resilient_media_object != VM_OBJECT_NULL);
6258 		assert(resilient_media_offset != (vm_object_offset_t)-1);
6259 		/* release our extra reference on failed object */
6260 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6261 		if (object == resilient_media_object) {
6262 			/*
6263 			 * We're holding "object"'s lock, so we can't release
6264 			 * our extra reference at this point.
6265 			 * We need an extra reference on "object" anyway
6266 			 * (see below), so let's just transfer this reference.
6267 			 */
6268 			resilient_media_ref_transfer = true;
6269 		} else {
6270 			vm_object_deallocate(resilient_media_object);
6271 		}
6272 		resilient_media_object = VM_OBJECT_NULL;
6273 		resilient_media_offset = (vm_object_offset_t)-1;
6274 		resilient_media_retry = false;
6275 		vm_fault_resilient_media_abort2++;
6276 	}
6277 
6278 	/*
6279 	 * Make a reference to this object to
6280 	 * prevent its disposal while we are messing with
6281 	 * it.  Once we have the reference, the map is free
6282 	 * to be diddled.  Since objects reference their
6283 	 * shadows (and copies), they will stay around as well.
6284 	 */
6285 	if (resilient_media_ref_transfer) {
6286 		/* we already have an extra reference on this object */
6287 		resilient_media_ref_transfer = false;
6288 	} else {
6289 		vm_object_reference_locked(object);
6290 	}
6291 	vm_object_paging_begin(object);
6292 
6293 	set_thread_pagein_error(cthread, 0);
6294 	error_code = 0;
6295 
6296 	result_page = VM_PAGE_NULL;
6297 	vm_fault_return_t err = vm_fault_page(object, offset, fault_type,
6298 	    (fault_info->fi_change_wiring && !wired),
6299 	    FALSE,                /* page not looked up */
6300 	    &prot, &result_page, &top_page,
6301 	    &type_of_fault,
6302 	    &error_code, map->no_zero_fill,
6303 	    fault_info);
6304 
6305 	/*
6306 	 * if err != VM_FAULT_SUCCESS, then the paging reference
6307 	 * has been dropped and the object unlocked... the ref_count
6308 	 * is still held
6309 	 *
6310 	 * if err == VM_FAULT_SUCCESS, then the paging reference
6311 	 * is still held along with the ref_count on the original object
6312 	 *
6313 	 *	the object is returned locked with a paging reference
6314 	 *
6315 	 *	if top_page != NULL, then it's BUSY and the
6316 	 *	object it belongs to has a paging reference
6317 	 *	but is returned unlocked
6318 	 */
6319 	if (err != VM_FAULT_SUCCESS &&
6320 	    err != VM_FAULT_SUCCESS_NO_VM_PAGE) {
6321 		if (err == VM_FAULT_MEMORY_ERROR &&
6322 		    fault_info->resilient_media) {
6323 			assertf(object->internal, "object %p", object);
6324 			/*
6325 			 * This fault failed but the mapping was
6326 			 * "media resilient", so we'll retry the fault in
6327 			 * recovery mode to get a zero-filled page in the
6328 			 * top object.
6329 			 * Keep the reference on the failing object so
6330 			 * that we can check that the mapping is still
6331 			 * pointing to it when we retry the fault.
6332 			 */
6333 //                     printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
6334 			assert(!resilient_media_retry); /* no double retry */
6335 			assert(resilient_media_object == VM_OBJECT_NULL);
6336 			assert(resilient_media_offset == (vm_object_offset_t)-1);
6337 			resilient_media_retry = true;
6338 			resilient_media_object = object;
6339 			resilient_media_offset = offset;
6340 //                     printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
6341 			vm_fault_resilient_media_initiate++;
6342 			goto RetryFault;
6343 		} else {
6344 			/*
6345 			 * we didn't succeed, lose the object reference
6346 			 * immediately.
6347 			 */
6348 			vm_object_deallocate(object);
6349 			object = VM_OBJECT_NULL; /* no longer valid */
6350 		}
6351 
6352 		/*
6353 		 * See why we failed, and take corrective action.
6354 		 */
6355 		switch (err) {
6356 		case VM_FAULT_SUCCESS:
6357 		case VM_FAULT_SUCCESS_NO_VM_PAGE:
6358 			/* These aren't possible but needed to make the switch exhaustive */
6359 			break;
6360 		case VM_FAULT_MEMORY_SHORTAGE:
6361 			if (vm_page_wait((fault_info->fi_change_wiring) ?
6362 			    THREAD_UNINT :
6363 			    THREAD_ABORTSAFE)) {
6364 				goto RetryFault;
6365 			}
6366 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_MEMORY_SHORTAGE), 0 /* arg */);
6367 			OS_FALLTHROUGH;
6368 		case VM_FAULT_INTERRUPTED:
6369 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), 0 /* arg */);
6370 			kr = KERN_ABORTED;
6371 			goto done;
6372 		case VM_FAULT_RETRY:
6373 			goto RetryFault;
6374 		case VM_FAULT_MEMORY_ERROR:
6375 			if (error_code) {
6376 				kr = error_code;
6377 			} else {
6378 				kr = KERN_MEMORY_ERROR;
6379 			}
6380 			goto done;
6381 		case VM_FAULT_BUSY:
6382 			kr = KERN_ALREADY_WAITING;
6383 			goto done;
6384 		}
6385 	}
6386 	m = result_page;
6387 	m_object = NULL;
6388 
6389 	if (m != VM_PAGE_NULL) {
6390 		m_object = VM_PAGE_OBJECT(m);
6391 		assert((fault_info->fi_change_wiring && !wired) ?
6392 		    (top_page == VM_PAGE_NULL) :
6393 		    ((top_page == VM_PAGE_NULL) == (m_object == object)));
6394 	}
6395 
6396 	/*
6397 	 * What to do with the resulting page from vm_fault_page
6398 	 * if it doesn't get entered into the physical map:
6399 	 */
6400 #define RELEASE_PAGE(m)                                 \
6401 	MACRO_BEGIN                                     \
6402 	vm_page_wakeup_done(VM_PAGE_OBJECT(m), m);                            \
6403 	if ( !VM_PAGE_PAGEABLE(m)) {                    \
6404 	        vm_page_lockspin_queues();              \
6405 	        if ( !VM_PAGE_PAGEABLE(m))              \
6406 	                vm_page_activate(m);            \
6407 	        vm_page_unlock_queues();                \
6408 	}                                               \
6409 	MACRO_END
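	/*
	 * Editor's note: RELEASE_PAGE() is used on the bail-out paths below.
	 * It wakes any threads waiting on the busy page and, if the page is not
	 * already on a pageout queue, activates it so it does not end up
	 * stranded off the paging queues.
	 */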
6410 
6411 
6412 	object_locks_dropped = FALSE;
6413 	/*
6414 	 * We must verify that the maps have not changed
6415 	 * since our last lookup. vm_map_verify() needs the
6416 	 * map lock (shared) but we are holding object locks.
6417 	 * So we do a try_lock() first and, if that fails, we
6418 	 * drop the object locks and go in for the map lock again.
6419 	 */
6420 	if (m != VM_PAGE_NULL) {
6421 		old_copy_object = m_object->vo_copy;
6422 		old_copy_version = m_object->vo_copy_version;
6423 	} else {
6424 		old_copy_object = VM_OBJECT_NULL;
6425 		old_copy_version = 0;
6426 	}
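	/*
	 * Editor's note: the copy object and its version are snapshotted above
	 * so that, once the object locks have been dropped and retaken, the
	 * vo_copy / vo_copy_version check further down can detect a
	 * copy-on-write push that raced with this fault and strip
	 * VM_PROT_WRITE if one happened.
	 */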
6427 	if (!vm_map_try_lock_read(original_map)) {
6428 		if (m != VM_PAGE_NULL) {
6429 			vm_object_unlock(m_object);
6430 		} else {
6431 			vm_object_unlock(object);
6432 		}
6433 
6434 		object_locks_dropped = TRUE;
6435 
6436 		vm_map_lock_read(original_map);
6437 	}
6438 
6439 	if ((map != original_map) || !vm_map_verify(map, &version)) {
6440 		if (object_locks_dropped == FALSE) {
6441 			if (m != VM_PAGE_NULL) {
6442 				vm_object_unlock(m_object);
6443 			} else {
6444 				vm_object_unlock(object);
6445 			}
6446 
6447 			object_locks_dropped = TRUE;
6448 		}
6449 
6450 		/*
6451 		 * no object locks are held at this point
6452 		 */
6453 		vm_object_t             retry_object;
6454 		vm_object_offset_t      retry_offset;
6455 		vm_prot_t               retry_prot;
6456 
6457 		/*
6458 		 * To avoid trying to write_lock the map while another
6459 		 * thread has it read_locked (in vm_map_pageable), we
6460 		 * do not try for write permission.  If the page is
6461 		 * still writable, we will get write permission.  If it
6462 		 * is not, or has been marked needs_copy, we enter the
6463 		 * mapping without write permission, and will merely
6464 		 * take another fault.
6465 		 */
6466 		map = original_map;
6467 
6468 		kr = vm_map_lookup_and_lock_object(&map, vaddr,
6469 		    fault_type & ~VM_PROT_WRITE,
6470 		    OBJECT_LOCK_EXCLUSIVE, &version,
6471 		    &retry_object, &retry_offset, &retry_prot,
6472 		    &wired,
6473 		    fault_info,
6474 		    &real_map,
6475 		    NULL);
6476 		pmap = real_map->pmap;
6477 
6478 		if (kr != KERN_SUCCESS) {
6479 			vm_map_unlock_read(map);
6480 
6481 			if (m != VM_PAGE_NULL) {
6482 				assert(VM_PAGE_OBJECT(m) == m_object);
6483 
6484 				/*
6485 				 * retake the lock so that
6486 				 * we can drop the paging reference
6487 				 * in vm_fault_cleanup and do the
6488 				 * vm_page_wakeup_done() in RELEASE_PAGE
6489 				 */
6490 				vm_object_lock(m_object);
6491 
6492 				RELEASE_PAGE(m);
6493 
6494 				vm_fault_cleanup(m_object, top_page);
6495 			} else {
6496 				/*
6497 				 * retake the lock so that
6498 				 * we can drop the paging reference
6499 				 * in vm_fault_cleanup
6500 				 */
6501 				vm_object_lock(object);
6502 
6503 				vm_fault_cleanup(object, top_page);
6504 			}
6505 			vm_object_deallocate(object);
6506 
6507 			if (kr == KERN_INVALID_ADDRESS) {
6508 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0 /* arg */);
6509 			}
6510 			goto done;
6511 		}
6512 		vm_object_unlock(retry_object);
6513 
6514 		if ((retry_object != object) || (retry_offset != offset)) {
6515 			vm_map_unlock_read(map);
6516 			if (real_map != map) {
6517 				vm_map_unlock(real_map);
6518 			}
6519 
6520 			if (m != VM_PAGE_NULL) {
6521 				assert(VM_PAGE_OBJECT(m) == m_object);
6522 
6523 				/*
6524 				 * retake the lock so that
6525 				 * we can drop the paging reference
6526 				 * in vm_fault_cleanup and do the
6527 				 * vm_page_wakeup_done() in RELEASE_PAGE
6528 				 */
6529 				vm_object_lock(m_object);
6530 
6531 				RELEASE_PAGE(m);
6532 
6533 				vm_fault_cleanup(m_object, top_page);
6534 			} else {
6535 				/*
6536 				 * retake the lock so that
6537 				 * we can drop the paging reference
6538 				 * in vm_fault_cleanup
6539 				 */
6540 				vm_object_lock(object);
6541 
6542 				vm_fault_cleanup(object, top_page);
6543 			}
6544 			vm_object_deallocate(object);
6545 
6546 			goto RetryFault;
6547 		}
6548 		/*
6549 		 * Check whether the protection has changed or the object
6550 		 * has been copied while we left the map unlocked.
6551 		 */
6552 		if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
6553 			/* If the pmap layer cares, pass the full set. */
6554 			prot = retry_prot;
6555 		} else {
6556 			prot &= retry_prot;
6557 		}
6558 	}
6559 
6560 	if (object_locks_dropped == TRUE) {
6561 		if (m != VM_PAGE_NULL) {
6562 			assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
6563 			assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6564 			vm_object_lock(m_object);
6565 		} else {
6566 			vm_object_lock(object);
6567 		}
6568 
6569 		object_locks_dropped = FALSE;
6570 	}
6571 
6572 	if ((prot & VM_PROT_WRITE) &&
6573 	    m != VM_PAGE_NULL &&
6574 	    (m_object->vo_copy != old_copy_object ||
6575 	    m_object->vo_copy_version != old_copy_version)) {
6576 		/*
6577 		 * The copy object changed while the top-level object
6578 		 * was unlocked, so take away write permission.
6579 		 */
6580 		if (pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
6581 			/*
6582 			 * This pmap enforces extra constraints for this set
6583 			 * of protections, so we can't change the protections.
6584 			 * This mapping should have been setup to avoid
6585 			 * copy-on-write since that requires removing write
6586 			 * access.
6587 			 */
6588 			panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x m%p obj %p copyobj %p",
6589 			    __FUNCTION__, pmap, (uint64_t)vaddr, prot,
6590 			    fault_info->pmap_options,
6591 			    m, m_object, m_object->vo_copy);
6592 		}
6593 		prot &= ~VM_PROT_WRITE;
6594 	}
6595 
6596 	if (!need_copy &&
6597 	    !fault_info->no_copy_on_read &&
6598 	    m != VM_PAGE_NULL &&
6599 	    VM_PAGE_OBJECT(m) != object &&
6600 	    !VM_PAGE_OBJECT(m)->pager_trusted &&
6601 	    vm_protect_privileged_from_untrusted &&
6602 	    !((prot & VM_PROT_EXECUTE) && VM_PAGE_OBJECT(m)->code_signed && cs_process_enforcement(NULL)) &&
6603 	    current_proc_is_privileged()) {
6604 		/*
6605 		 * We found the page we want in an "untrusted" VM object
6606 		 * down the shadow chain.  Since the target is "privileged"
6607 		 * we want to perform a copy-on-read of that page, so that the
6608 		 * mapped object gets a stable copy and does not have to
6609 		 * rely on the "untrusted" object to provide the same
6610 		 * contents if the page gets reclaimed and has to be paged
6611 		 * in again later on.
6612 		 *
6613 		 * Special case: if the mapping is executable and the untrusted
6614 		 * object is code-signed and the process is "cs_enforced", we
6615 		 * do not copy-on-read because that would break code-signing
6616 		 * enforcement expectations (an executable page must belong
6617 		 * to a code-signed object) and we can rely on code-signing
6618 		 * to re-validate the page if it gets evicted and paged back in.
6619 		 */
6620 //		printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
6621 		vm_copied_on_read++;
6622 		need_copy_on_read = TRUE;
6623 		need_copy = TRUE;
6624 	} else {
6625 		need_copy_on_read = FALSE;
6626 	}
6627 
6628 	/*
6629 	 * If we want to wire down this page, but no longer have
6630 	 * adequate permissions, we must start all over.
6631 	 * If we decided to copy-on-read, we must also start all over.
6632 	 */
6633 	if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
6634 	    need_copy_on_read) {
6635 		vm_map_unlock_read(map);
6636 		if (real_map != map) {
6637 			vm_map_unlock(real_map);
6638 		}
6639 
6640 		if (m != VM_PAGE_NULL) {
6641 			assert(VM_PAGE_OBJECT(m) == m_object);
6642 
6643 			RELEASE_PAGE(m);
6644 
6645 			vm_fault_cleanup(m_object, top_page);
6646 		} else {
6647 			vm_fault_cleanup(object, top_page);
6648 		}
6649 
6650 		vm_object_deallocate(object);
6651 
6652 		goto RetryFault;
6653 	}
6654 	if (m != VM_PAGE_NULL) {
6655 		/*
6656 		 * Put this page into the physical map.
6657 		 * We had to do the unlock above because pmap_enter
6658 		 * may cause other faults.  The page may be on
6659 		 * the pageout queues.  If the pageout daemon comes
6660 		 * across the page, it will remove it from the queues.
6661 		 */
6662 		if (fault_page_size < PAGE_SIZE) {
6663 			DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
6664 			assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
6665 			    fault_phys_offset < PAGE_SIZE),
6666 			    "0x%llx\n", (uint64_t)fault_phys_offset);
6667 		} else {
6668 			assertf(fault_phys_offset == 0,
6669 			    "0x%llx\n", (uint64_t)fault_phys_offset);
6670 		}
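		/*
		 * Worked example (editor's addition, not in the original source):
		 * with 16K kernel pages and fault_page_size == 4K, a fault at
		 * object offset 0x7000 gives
		 *     vm_object_trunc_page(0x7000) == 0x4000
		 *     fault_phys_offset            == 0x3000
		 * i.e. 4K-aligned and smaller than PAGE_SIZE, as asserted above.
		 */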
6671 		assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
6672 		assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6673 		need_retry = false;
6674 		if (caller_pmap) {
6675 			kr = vm_fault_enter(m,
6676 			    caller_pmap,
6677 			    caller_pmap_addr,
6678 			    fault_page_size,
6679 			    fault_phys_offset,
6680 			    prot,
6681 			    caller_prot,
6682 			    wired,
6683 			    wire_tag,
6684 			    fault_info,
6685 			    &need_retry,
6686 			    &type_of_fault,
6687 			    &object_lock_type,
6688 			    &page_sleep_needed);
6689 		} else {
6690 			kr = vm_fault_enter(m,
6691 			    pmap,
6692 			    vaddr,
6693 			    fault_page_size,
6694 			    fault_phys_offset,
6695 			    prot,
6696 			    caller_prot,
6697 			    wired,
6698 			    wire_tag,
6699 			    fault_info,
6700 			    &need_retry,
6701 			    &type_of_fault,
6702 			    &object_lock_type,
6703 			    &page_sleep_needed);
6704 		}
6705 		assert(VM_PAGE_OBJECT(m) == m_object);
6706 
6707 		{
6708 			int     event_code = 0;
6709 
6710 			if (m_object->internal) {
6711 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
6712 			} else if (m_object->object_is_shared_cache) {
6713 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
6714 			} else {
6715 				event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
6716 			}
6717 
6718 			KDBG_RELEASE(event_code | DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid());
6719 			KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid());
6720 
6721 			DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
6722 		}
6723 		if ((kr != KERN_SUCCESS) || page_sleep_needed || need_retry) {
6724 			/* abort this page fault */
6725 			vm_page_wakeup_done(m_object, m);
6726 			vm_fault_cleanup(m_object, top_page);
6727 			vm_object_deallocate(object);
6728 
6729 			if (need_retry) {
6730 				/*
6731 				 * We could not expand the page table while holding an
6732 				 * object lock.
6733 				 * Expand it now and retry the fault.
6734 				 */
6735 				assert3u(kr, ==, KERN_RESOURCE_SHORTAGE);
6736 				if (caller_pmap) {
6737 					(void)pmap_enter_options(
6738 						caller_pmap, caller_pmap_addr, 0, 0, 0, 0, 0,
6739 						PMAP_OPTIONS_NOENTER, NULL,
6740 						PMAP_MAPPING_TYPE_INFER);
6741 				} else {
6742 					(void)pmap_enter_options(
6743 						pmap, vaddr, 0, 0, 0, 0, 0,
6744 						PMAP_OPTIONS_NOENTER, NULL,
6745 						PMAP_MAPPING_TYPE_INFER);
6746 				}
6747 				need_retry = FALSE;
6748 				kr = KERN_SUCCESS; /* retry fault instead of failing below */
6749 			}
6750 
6751 			vm_map_unlock_read(map);
6752 			if (real_map != map) {
6753 				vm_map_unlock(real_map);
6754 			}
6755 
6756 			if (kr != KERN_SUCCESS) {
6757 				goto done;
6758 			}
6759 			goto RetryFault;
6760 		}
6761 		if (physpage_p != NULL) {
6762 			/* for vm_map_wire_and_extract() */
6763 			*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6764 			if (prot & VM_PROT_WRITE) {
6765 				vm_object_lock_assert_exclusive(m_object);
6766 				m->vmp_dirty = TRUE;
6767 			}
6768 		}
6769 	} else {
6770 		vm_map_entry_t          entry;
6771 		vm_map_offset_t         laddr;
6772 		vm_map_offset_t         ldelta, hdelta;
6773 
6774 		/*
6775 		 * do a pmap block mapping from the physical address
6776 		 * in the object
6777 		 */
6778 
6779 		if (real_map != map) {
6780 			vm_map_unlock(real_map);
6781 		}
6782 
6783 		if (original_map != map) {
6784 			vm_map_unlock_read(map);
6785 			vm_map_lock_read(original_map);
6786 			map = original_map;
6787 		}
6788 		real_map = map;
6789 
6790 		laddr = vaddr;
6791 		hdelta = ldelta = (vm_map_offset_t)0xFFFFFFFFFFFFF000ULL;
6792 
6793 		while (vm_map_lookup_entry(map, laddr, &entry)) {
6794 			if (ldelta > (laddr - entry->vme_start)) {
6795 				ldelta = laddr - entry->vme_start;
6796 			}
6797 			if (hdelta > (entry->vme_end - laddr)) {
6798 				hdelta = entry->vme_end - laddr;
6799 			}
6800 			if (entry->is_sub_map) {
6801 				vm_map_t sub_map;
6802 				bool use_pmap;
6803 
6804 				laddr = ((laddr - entry->vme_start)
6805 				    + VME_OFFSET(entry));
6806 				vm_map_lock_read(VME_SUBMAP(entry));
6807 				sub_map = VME_SUBMAP(entry);
6808 				use_pmap = entry->use_pmap;
6809 				entry = VM_MAP_ENTRY_NULL; /* not valid after unlock */
6810 				if (map != real_map) {
6811 					vm_map_unlock_read(map);
6812 				}
6813 				if (use_pmap) {
6814 					vm_map_unlock_read(real_map);
6815 					real_map = sub_map;
6816 				}
6817 				map = sub_map;
6818 			} else {
6819 				break;
6820 			}
6821 		}
6822 
6823 		if (vm_map_lookup_entry(map, laddr, &entry) &&
6824 		    (!entry->is_sub_map) &&
6825 		    (object != VM_OBJECT_NULL) &&
6826 		    (VME_OBJECT(entry) == object)) {
6827 			uint16_t superpage;
6828 
6829 			if (!object->pager_created &&
6830 			    object->phys_contiguous &&
6831 			    VME_OFFSET(entry) == 0 &&
6832 			    (entry->vme_end - entry->vme_start == object->vo_size) &&
6833 			    VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
6834 				superpage = VM_MEM_SUPERPAGE;
6835 			} else {
6836 				superpage = 0;
6837 			}
6838 
6839 			if (superpage && physpage_p) {
6840 				/* for vm_map_wire_and_extract() */
6841 				*physpage_p = (ppnum_t)
6842 				    ((((vm_map_offset_t)
6843 				    object->vo_shadow_offset)
6844 				    + VME_OFFSET(entry)
6845 				    + (laddr - entry->vme_start))
6846 				    >> PAGE_SHIFT);
6847 			}
6848 
6849 			/*
6850 			 * Set up a block mapped area
6851 			 */
6852 			assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6853 			pmap_t block_map_pmap;
6854 			addr64_t block_map_va;
6855 			pmap_paddr_t block_map_pa = (pmap_paddr_t)(((vm_map_offset_t)(object->vo_shadow_offset)) +
6856 			    VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta);
6857 			int block_map_wimg = VM_WIMG_MASK & (int)object->wimg_bits;
6858 			if (caller_pmap) {
6859 				block_map_pmap = caller_pmap;
6860 				block_map_va = (addr64_t)(caller_pmap_addr - ldelta);
6861 			} else {
6862 				block_map_pmap = real_map->pmap;
6863 				block_map_va = (addr64_t)(vaddr - ldelta);
6864 			}
6865 #if HAS_MTE
6866 			/*
6867 			 * We hit this path if we return SUCCESS from vm_fault_page but don't
6868 			 * return a page. This happens if we're trying to fault in a
6869 			 * phys_contiguous object (used by device pagers and superpages), or
6870 			 * if the page is non-VM managed. Both of these cases are not
6871 			 * expected to occur with MTE.
6872 			 */
6873 			assert(!vm_should_override_mte_cacheattr(block_map_pmap, object, block_map_va, block_map_pa));
6874 #endif /* HAS_MTE */
6875 			kr = pmap_map_block_addr(block_map_pmap,
6876 			    block_map_va,
6877 			    block_map_pa,
6878 			    (uint32_t)((ldelta + hdelta) >> fault_page_shift),
6879 			    prot,
6880 			    block_map_wimg | superpage,
6881 			    0);
6882 
6883 			if (kr != KERN_SUCCESS) {
6884 				goto cleanup;
6885 			}
6886 		}
6887 	}
6888 
6889 	/*
6890 	 * Success
6891 	 */
6892 	kr = KERN_SUCCESS;
6893 
6894 	/*
6895 	 * TODO: could most of the done cases just use cleanup?
6896 	 */
6897 cleanup:
6898 	/*
6899 	 * Unlock everything, and return
6900 	 */
6901 	vm_map_unlock_read(map);
6902 	if (real_map != map) {
6903 		vm_map_unlock(real_map);
6904 	}
6905 
6906 	if (m != VM_PAGE_NULL) {
6907 		if (__improbable(rtfault &&
6908 		    !m->vmp_realtime &&
6909 		    vm_pageout_protect_realtime)) {
6910 			vm_page_lock_queues();
6911 			if (!m->vmp_realtime) {
6912 				m->vmp_realtime = true;
6913 				VM_COUNTER_INC(&vm_page_realtime_count);
6914 			}
6915 			vm_page_unlock_queues();
6916 		}
6917 		assert(VM_PAGE_OBJECT(m) == m_object);
6918 
6919 		if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
6920 			vm_object_paging_begin(m_object);
6921 
6922 			assert3p(written_on_object, ==, VM_OBJECT_NULL);
6923 			written_on_object = m_object;
6924 			written_on_pager = m_object->pager;
6925 			written_on_offset = m_object->paging_offset + m->vmp_offset;
6926 		}
6927 		vm_page_wakeup_done(m_object, m);
6928 
6929 		vm_fault_cleanup(m_object, top_page);
6930 	} else {
6931 		vm_fault_cleanup(object, top_page);
6932 	}
6933 
6934 	vm_object_deallocate(object);
6935 
6936 #undef  RELEASE_PAGE
6937 
6938 done:
6939 	thread_interrupt_level(interruptible_state);
6940 
6941 	if (resilient_media_object != VM_OBJECT_NULL) {
6942 		assert(resilient_media_retry);
6943 		assert(resilient_media_offset != (vm_object_offset_t)-1);
6944 		/* release extra reference on failed object */
6945 //             printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6946 		vm_object_deallocate(resilient_media_object);
6947 		resilient_media_object = VM_OBJECT_NULL;
6948 		resilient_media_offset = (vm_object_offset_t)-1;
6949 		resilient_media_retry = false;
6950 		vm_fault_resilient_media_release++;
6951 	}
6952 	assert(!resilient_media_retry);
6953 
6954 	/*
6955 	 * Only I/O throttle on faults which cause a pagein/swapin.
6956 	 */
6957 	if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
6958 		throttle_lowpri_io(1);
6959 	} else {
6960 		if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
6961 			if ((throttle_delay = vm_page_throttled(TRUE))) {
6962 				if (vm_debug_events) {
6963 					if (type_of_fault == DBG_COMPRESSOR_FAULT) {
6964 						VM_DEBUG_EVENT(vmf_compressordelay, DBG_VM_FAULT_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6965 					} else if (type_of_fault == DBG_COW_FAULT) {
6966 						VM_DEBUG_EVENT(vmf_cowdelay, DBG_VM_FAULT_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6967 					} else {
6968 						VM_DEBUG_EVENT(vmf_zfdelay, DBG_VM_FAULT_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6969 					}
6970 				}
6971 				__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
6972 			}
6973 		}
6974 	}
6975 
6976 	if (written_on_object) {
6977 		vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
6978 
6979 		vm_object_lock(written_on_object);
6980 		vm_object_paging_end(written_on_object);
6981 		vm_object_unlock(written_on_object);
6982 
6983 		written_on_object = VM_OBJECT_NULL;
6984 	}
6985 
6986 	if (rtfault) {
6987 		vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
6988 	}
6989 
6990 	KDBG_RELEASE(
6991 		(VMDBG_CODE(DBG_VM_FAULT_INTERNAL)) | DBG_FUNC_END,
6992 		((uint64_t)trace_vaddr >> 32),
6993 		trace_vaddr,
6994 		kr,
6995 		vm_fault_type_for_tracing(need_copy_on_read, type_of_fault));
6996 
6997 	if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
6998 		DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6999 	}
7000 
7001 	vmlp_api_end(VM_FAULT_INTERNAL, KERN_FAILURE);
7002 	return kr;
7003 }
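
/*
 * Editor's illustrative sketch (not part of the original source and not
 * compiled): the page-table pre-expansion idiom used by vm_fault_internal()
 * above whenever vm_fault_enter() comes back with "need_retry" set.  The
 * identifiers are the ones used in the function; the caller_pmap/pmap
 * selection is condensed into a ternary here for brevity.
 */
#if 0
	if (need_retry) {
		/* no locks held here: safe to let the pmap layer grow the page table */
		(void)pmap_enter_options(
			caller_pmap ? caller_pmap : pmap,
			caller_pmap ? caller_pmap_addr : vaddr,
			0, 0, 0, 0, 0,
			PMAP_OPTIONS_NOENTER, NULL, PMAP_MAPPING_TYPE_INFER);
		need_retry = FALSE;
		goto RetryFault;	/* re-drive the fault; PMAP_ENTER should now succeed */
	}
#endif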
7004 
7005 /*
7006  *	vm_fault_wire:
7007  *
7008  *	Wire down a range of virtual addresses in a map.
7009  */
7010 kern_return_t
7011 vm_fault_wire(
7012 	vm_map_t        map,
7013 	vm_map_entry_t  entry,
7014 	vm_prot_t       prot,
7015 	vm_tag_t        wire_tag,
7016 	pmap_t          pmap,
7017 	vm_map_offset_t pmap_addr,
7018 	ppnum_t         *physpage_p)
7019 {
7020 	vm_map_offset_t va;
7021 	vm_map_offset_t end_addr = entry->vme_end;
7022 	kern_return_t   rc;
7023 	vm_map_size_t   effective_page_size;
7024 
7025 	assert(entry->in_transition);
7026 
7027 	if (!entry->is_sub_map &&
7028 	    VME_OBJECT(entry) != VM_OBJECT_NULL &&
7029 	    VME_OBJECT(entry)->phys_contiguous) {
7030 		return KERN_SUCCESS;
7031 	}
7032 
7033 	/*
7034 	 *	Inform the physical mapping system that the
7035 	 *	range of addresses may not fault, so that
7036 	 *	page tables and such can be locked down as well.
7037 	 */
7038 
7039 	pmap_pageable(pmap, pmap_addr,
7040 	    pmap_addr + (end_addr - entry->vme_start), FALSE);
7041 
7042 	/*
7043 	 *	We simulate a fault to get the page and enter it
7044 	 *	in the physical map.
7045 	 */
7046 
7047 	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
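	/*
	 * Editor's note: for a 4K submap on a system with 16K kernel pages,
	 * VM_MAP_PAGE_SIZE(map) == 4096 < PAGE_SIZE, so effective_page_size is
	 * 4096 and the loop below simulates a fault for every 4K subpage of the
	 * entry rather than for every 16K kernel page.
	 */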
7048 	for (va = entry->vme_start;
7049 	    va < end_addr;
7050 	    va += effective_page_size) {
7051 		rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
7052 		    pmap_addr + (va - entry->vme_start),
7053 		    physpage_p);
7054 		if (rc != KERN_SUCCESS) {
7055 			struct vm_object_fault_info fault_info = {
7056 				.interruptible = (pmap == kernel_pmap) ? THREAD_UNINT : THREAD_ABORTSAFE,
7057 				.behavior = VM_BEHAVIOR_SEQUENTIAL,
7058 				.fi_change_wiring = true,
7059 			};
7060 			if (os_sub_overflow(end_addr, va, &fault_info.cluster_size)) {
7061 				fault_info.cluster_size = UPL_SIZE_MAX;
7062 			}
7063 			rc = vm_fault_internal(map, va, prot, wire_tag,
7064 			    pmap,
7065 			    (pmap_addr +
7066 			    (va - entry->vme_start)),
7067 			    physpage_p,
7068 			    &fault_info);
7069 			DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
7070 		}
7071 
7072 		if (rc != KERN_SUCCESS) {
7073 			struct vm_map_entry     tmp_entry = *entry;
7074 
7075 			/* unwire wired pages */
7076 			tmp_entry.vme_end = va;
7077 			vm_fault_unwire(map, &tmp_entry, FALSE,
7078 			    pmap, pmap_addr, tmp_entry.vme_end);
7079 
7080 			return rc;
7081 		}
7082 	}
7083 	return KERN_SUCCESS;
7084 }
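
/*
 * Editor's illustrative sketch (not part of the original source and not
 * compiled): how a hypothetical caller that already holds a referenced map
 * and an in-transition entry might pair the two routines above.  The helper
 * name and its arguments are assumptions made purely for illustration.
 */
#if 0
static void
wire_then_unwire_example(vm_map_t map, vm_map_entry_t entry, pmap_t pmap,
    vm_map_offset_t pmap_addr)
{
	ppnum_t phys_page = 0;

	/*
	 * Wire every page of the entry; on failure vm_fault_wire() has already
	 * unwired whatever it managed to wire, so there is nothing to undo.
	 */
	if (vm_fault_wire(map, entry, VM_PROT_READ, VM_KERN_MEMORY_NONE,
	    pmap, pmap_addr, &phys_page) != KERN_SUCCESS) {
		return;
	}

	/* ... use the wired range ... */

	/* Undo the wiring without freeing the pages (deallocate == FALSE). */
	vm_fault_unwire(map, entry, FALSE, pmap, pmap_addr, entry->vme_end);
}
#endif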
7085 
7086 /*
7087  *	vm_fault_unwire:
7088  *
7089  *	Unwire a range of virtual addresses in a map.
7090  */
7091 void
7092 vm_fault_unwire(
7093 	vm_map_t        map,
7094 	vm_map_entry_t  entry,
7095 	boolean_t       deallocate,
7096 	pmap_t          pmap,
7097 	vm_map_offset_t pmap_addr,
7098 	vm_map_offset_t end_addr)
7099 {
7100 	vm_map_offset_t va;
7101 	vm_object_t     object;
7102 	struct vm_object_fault_info fault_info = {
7103 		.interruptible = THREAD_UNINT,
7104 	};
7105 	unsigned int    unwired_pages;
7106 	vm_map_size_t   effective_page_size;
7107 
7108 	object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
7109 
7110 	/*
7111 	 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
7112 	 * do anything since such memory is wired by default.  So we don't have
7113 	 * anything to undo here.
7114 	 */
7115 
7116 	if (object != VM_OBJECT_NULL && object->phys_contiguous) {
7117 		return;
7118 	}
7119 
7120 	fault_info.interruptible = THREAD_UNINT;
7121 	fault_info.behavior = entry->behavior;
7122 	fault_info.user_tag = VME_ALIAS(entry);
7123 	if (entry->iokit_acct ||
7124 	    (!entry->is_sub_map && !entry->use_pmap)) {
7125 		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
7126 	}
7127 	fault_info.lo_offset = VME_OFFSET(entry);
7128 	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
7129 	fault_info.no_cache = entry->no_cache;
7130 	fault_info.stealth = TRUE;
7131 	if (entry->vme_xnu_user_debug) {
7132 		/*
7133 		 * Modified code-signed executable region: wired pages must
7134 		 * have been copied, so they should be XNU_USER_DEBUG rather
7135 		 * than XNU_USER_EXEC.
7136 		 */
7137 		fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
7138 	}
7139 
7140 	unwired_pages = 0;
7141 
7142 	/*
7143 	 *	Since the pages are wired down, we must be able to
7144 	 *	get their mappings from the physical map system.
7145 	 */
7146 
7147 	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
7148 	for (va = entry->vme_start;
7149 	    va < end_addr;
7150 	    va += effective_page_size) {
7151 		if (object == VM_OBJECT_NULL) {
7152 			if (pmap) {
7153 				pmap_change_wiring(pmap,
7154 				    pmap_addr + (va - entry->vme_start), FALSE);
7155 			}
7156 			(void) vm_fault(map, va, VM_PROT_NONE,
7157 			    TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
7158 		} else {
7159 			vm_prot_t       prot;
7160 			vm_page_t       result_page;
7161 			vm_page_t       top_page;
7162 			vm_object_t     result_object;
7163 			vm_fault_return_t result;
7164 
7165 			/* cap cluster size at maximum UPL size */
7166 			upl_size_t cluster_size;
7167 			if (os_sub_overflow(end_addr, va, &cluster_size)) {
7168 				cluster_size = UPL_SIZE_MAX;
7169 			}
7170 			fault_info.cluster_size = cluster_size;
7171 
7172 			do {
7173 				prot = VM_PROT_NONE;
7174 
7175 				vm_object_lock(object);
7176 				vm_object_paging_begin(object);
7177 				result_page = VM_PAGE_NULL;
7178 				result = vm_fault_page(
7179 					object,
7180 					(VME_OFFSET(entry) +
7181 					(va - entry->vme_start)),
7182 					VM_PROT_NONE, TRUE,
7183 					FALSE, /* page not looked up */
7184 					&prot, &result_page, &top_page,
7185 					(int *)0,
7186 					NULL, map->no_zero_fill,
7187 					&fault_info);
7188 			} while (result == VM_FAULT_RETRY);
7189 
7190 			/*
7191 			 * If this was a mapping to a file on a device that has been forcibly
7192 			 * unmounted, then we won't get a page back from vm_fault_page().  Just
7193 			 * move on to the next one in case the remaining pages are mapped from
7194 			 * different objects.  During a forced unmount, the object is terminated
7195 			 * so the alive flag will be false if this happens.  A forced unmount will
7196 			 * occur when an external disk is unplugged before the user does an
7197 			 * eject, so we don't want to panic in that situation.
7198 			 */
7199 
7200 			if (result == VM_FAULT_MEMORY_ERROR) {
7201 				if (!object->alive) {
7202 					continue;
7203 				}
7204 				if (!object->internal && object->pager == NULL) {
7205 					continue;
7206 				}
7207 			}
7208 
7209 			if (result == VM_FAULT_MEMORY_ERROR &&
7210 			    is_kernel_object(object)) {
7211 				/*
7212 				 * This must have been allocated with
7213 				 * KMA_KOBJECT and KMA_VAONLY and there's
7214 				 * no physical page at this offset.
7215 				 * We're done (no page to free).
7216 				 */
7217 				assert(deallocate);
7218 				continue;
7219 			}
7220 
7221 			if (result != VM_FAULT_SUCCESS) {
7222 				panic("vm_fault_unwire: failure");
7223 			}
7224 
7225 			result_object = VM_PAGE_OBJECT(result_page);
7226 
7227 			if (deallocate) {
7228 				assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
7229 				    vm_page_fictitious_addr);
7230 				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
7231 				if (VM_PAGE_WIRED(result_page)) {
7232 					unwired_pages++;
7233 				}
7234 				VM_PAGE_FREE(result_page);
7235 			} else {
7236 				if (pmap && !vm_page_is_guard(result_page)) {
7237 					pmap_change_wiring(pmap,
7238 					    pmap_addr + (va - entry->vme_start), FALSE);
7239 				}
7240 
7241 
7242 				if (VM_PAGE_WIRED(result_page)) {
7243 					vm_page_lockspin_queues();
7244 					vm_page_unwire(result_page, TRUE);
7245 					vm_page_unlock_queues();
7246 					unwired_pages++;
7247 				}
7248 				if (entry->zero_wired_pages &&
7249 				    (entry->protection & VM_PROT_WRITE) &&
7250 #if __arm64e__
7251 				    !entry->used_for_tpro &&
7252 #endif /* __arm64e__ */
7253 				    !entry->used_for_jit) {
7254 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
7255 				}
7256 
7257 				vm_page_wakeup_done(result_object, result_page);
7258 			}
7259 			vm_fault_cleanup(result_object, top_page);
7260 		}
7261 	}
7262 
7263 	/*
7264 	 *	Inform the physical mapping system that the range
7265 	 *	of addresses may fault, so that page tables and
7266 	 *	such may be unwired themselves.
7267 	 */
7268 
7269 	pmap_pageable(pmap, pmap_addr,
7270 	    pmap_addr + (end_addr - entry->vme_start), TRUE);
7271 
7272 	if (is_kernel_object(object)) {
7273 		/*
7274 		 * Would like to make user_tag in vm_object_fault_info
7275 		 * vm_tag_t (unsigned short) but user_tag derives its value from
7276 		 * VME_ALIAS(entry) at a few places and VME_ALIAS, in turn, casts
7277 		 * to an _unsigned int_ which is used by non-fault_info paths throughout the
7278 		 * code at many places.
7279 		 *
7280 		 * So, for now, an explicit truncation to unsigned short (vm_tag_t).
7281 		 */
7282 		assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
7283 		    "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
7284 		vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages), NULL);
7285 	}
7286 }
7287 
7288 /*
7289  *	vm_fault_wire_fast:
7290  *
7291  *	Handle common case of a wire down page fault at the given address.
7292  *	If successful, the page is inserted into the associated physical map.
7293  *	The map entry is passed in to avoid the overhead of a map lookup.
7294  *
7295  *	NOTE: the given address should be truncated to the
7296  *	proper page address.
7297  *
7298  *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
7299  *	a standard error specifying why the fault is fatal is returned.
7300  *
7301  *	The map in question must be referenced, and remains so.
7302  *	Caller has a read lock on the map.
7303  *
7304  *	This is a stripped version of vm_fault() for wiring pages.  Anything
7305  *	other than the common case will return KERN_FAILURE, and the caller
7306  *	is expected to call vm_fault().
7307  */
7308 static kern_return_t
7309 vm_fault_wire_fast(
7310 	__unused vm_map_t       map,
7311 	vm_map_offset_t va,
7312 	__unused vm_prot_t       caller_prot,
7313 	vm_tag_t        wire_tag,
7314 	vm_map_entry_t  entry,
7315 	pmap_t          pmap,
7316 	vm_map_offset_t pmap_addr,
7317 	ppnum_t         *physpage_p)
7318 {
7319 	vm_object_t             object;
7320 	vm_object_offset_t      offset;
7321 	vm_page_t               m;
7322 	vm_prot_t               prot;
7323 	thread_t                thread = current_thread();
7324 	int                     type_of_fault;
7325 	kern_return_t           kr;
7326 	vm_map_size_t           fault_page_size;
7327 	vm_map_offset_t         fault_phys_offset;
7328 	struct vm_object_fault_info fault_info = {
7329 		.interruptible = THREAD_UNINT,
7330 	};
7331 	uint8_t                 object_lock_type = 0;
7332 
7333 	counter_inc(&vm_statistics_faults);
7334 
7335 	if (thread != THREAD_NULL) {
7336 		counter_inc(&get_threadtask(thread)->faults);
7337 	}
7338 
7339 /*
7340  *	Recovery actions
7341  */
7342 
7343 #undef  RELEASE_PAGE
7344 #define RELEASE_PAGE(m) {                               \
7345 	vm_page_wakeup_done(VM_PAGE_OBJECT(m), m);                            \
7346 	vm_page_lockspin_queues();                      \
7347 	vm_page_unwire(m, TRUE);                        \
7348 	vm_page_unlock_queues();                        \
7349 }
7350 
7351 
7352 #undef  UNLOCK_THINGS
7353 #define UNLOCK_THINGS   {                               \
7354 	vm_object_paging_end(object);                      \
7355 	vm_object_unlock(object);                          \
7356 }
7357 
7358 #undef  UNLOCK_AND_DEALLOCATE
7359 #define UNLOCK_AND_DEALLOCATE   {                       \
7360 	UNLOCK_THINGS;                                  \
7361 	vm_object_deallocate(object);                   \
7362 }
7363 /*
7364  *	Give up and have caller do things the hard way.
7365  */
7366 
7367 #define GIVE_UP {                                       \
7368 	UNLOCK_AND_DEALLOCATE;                          \
7369 	return(KERN_FAILURE);                           \
7370 }
7371 
7372 
7373 	/*
7374 	 *	If this entry is not directly to a vm_object, bail out.
7375 	 */
7376 	if (entry->is_sub_map) {
7377 		assert(physpage_p == NULL);
7378 		return KERN_FAILURE;
7379 	}
7380 
7381 	/*
7382 	 *	Find the backing store object and offset into it.
7383 	 */
7384 
7385 	object = VME_OBJECT(entry);
7386 	offset = (va - entry->vme_start) + VME_OFFSET(entry);
7387 	prot = entry->protection;
7388 
7389 	/*
7390 	 *	Make a reference to this object to prevent its
7391 	 *	disposal while we are messing with it.
7392 	 */
7393 
7394 	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
7395 	vm_object_lock(object);
7396 	vm_object_reference_locked(object);
7397 	vm_object_paging_begin(object);
7398 
7399 	/*
7400 	 *	INVARIANTS (through entire routine):
7401 	 *
7402 	 *	1)	At all times, we must either have the object
7403 	 *		lock or a busy page in some object to prevent
7404 	 *		some other thread from trying to bring in
7405 	 *		the same page.
7406 	 *
7407 	 *	2)	Once we have a busy page, we must remove it from
7408 	 *		the pageout queues, so that the pageout daemon
7409 	 *		will not grab it away.
7410 	 *
7411 	 */
7412 
7413 	if (entry->needs_copy) {
7414 		panic("attempting to wire needs_copy memory");
7415 	}
7416 
7417 	/*
7418 	 * Since we don't have the machinery to resolve CoW obligations on the fast
7419 	 * path, if we might have to push pages to a copy, just give up.
7420 	 */
7421 	if (object->vo_copy != VM_OBJECT_NULL) {
7422 		GIVE_UP;
7423 	}
7424 
7425 	/*
7426 	 *	Look for page in top-level object.  If it's not there or
7427 	 *	there's something going on, give up.
7428 	 */
7429 	m = vm_page_lookup(object, vm_object_trunc_page(offset));
7430 	if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
7431 	    (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
7432 		GIVE_UP;
7433 	}
7434 	if (vm_page_is_guard(m)) {
7435 		/*
7436 		 * Guard pages are fictitious pages and are never
7437 		 * entered into a pmap, so let's say it's been wired...
7438 		 */
7439 		kr = KERN_SUCCESS;
7440 		goto done;
7441 	}
7442 
7443 	/*
7444 	 *	Wire the page down now.  All bail outs beyond this
7445 	 *	point must unwire the page.
7446 	 */
7447 
7448 	vm_page_lockspin_queues();
7449 	vm_page_wire(m, wire_tag, TRUE);
7450 	vm_page_unlock_queues();
7451 
7452 	/*
7453 	 *	Mark page busy for other threads.
7454 	 */
7455 	assert(!m->vmp_busy);
7456 	m->vmp_busy = TRUE;
7457 	assert(!m->vmp_absent);
7458 
7459 	fault_info.user_tag = VME_ALIAS(entry);
7460 	fault_info.pmap_options = 0;
7461 	if (entry->iokit_acct ||
7462 	    (!entry->is_sub_map && !entry->use_pmap)) {
7463 		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
7464 	}
7465 	if (entry->vme_xnu_user_debug) {
7466 		/*
7467 		 * Modified code-signed executable region: wiring will
7468 		 * copy the pages, so they should be XNU_USER_DEBUG rather
7469 		 * than XNU_USER_EXEC.
7470 		 */
7471 		fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
7472 	}
7473 
7474 	if (entry->translated_allow_execute) {
7475 		fault_info.pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
7476 	}
7477 
7478 	fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
7479 	fault_phys_offset = offset - vm_object_trunc_page(offset);
7480 
7481 	/*
7482 	 *	Put this page into the physical map.
7483 	 */
7484 	type_of_fault = DBG_CACHE_HIT_FAULT;
7485 	assert3p(VM_PAGE_OBJECT(m), ==, object);
7486 	bool page_sleep_needed = false;
7487 	bool need_retry = false;
7488 	kr = vm_fault_enter(m,
7489 	    pmap,
7490 	    pmap_addr,
7491 	    fault_page_size,
7492 	    fault_phys_offset,
7493 	    prot,
7494 	    prot,
7495 	    TRUE,                  /* wired */
7496 	    wire_tag,
7497 	    &fault_info,
7498 	    &need_retry,
7499 	    &type_of_fault,
7500 	    &object_lock_type, /* Exclusive lock mode. Will remain unchanged.*/
7501 	    &page_sleep_needed);
7502 	if ((kr != KERN_SUCCESS) || page_sleep_needed || need_retry) {
7503 		RELEASE_PAGE(m);
7504 		GIVE_UP;
7505 	}
7506 
7507 
7508 done:
7509 	/*
7510 	 *	Unlock everything, and return
7511 	 */
7512 
7513 	if (physpage_p) {
7514 		/* for vm_map_wire_and_extract() */
7515 		if (kr == KERN_SUCCESS) {
7516 			assert3p(object, ==, VM_PAGE_OBJECT(m));
7517 			*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
7518 			if (prot & VM_PROT_WRITE) {
7519 				vm_object_lock_assert_exclusive(object);
7520 				m->vmp_dirty = TRUE;
7521 			}
7522 		} else {
7523 			*physpage_p = 0;
7524 		}
7525 	}
7526 
7527 	if (m->vmp_busy) {
7528 		vm_page_wakeup_done(object, m);
7529 	}
7530 
7531 	UNLOCK_AND_DEALLOCATE;
7532 
7533 	return kr;
7534 }
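
/*
 * Editor's illustrative sketch (not part of the original source and not
 * compiled): the intended calling pattern for this fast path, as used by
 * vm_fault_wire() earlier in this file -- try the stripped-down routine
 * first and fall back to the general fault path for anything other than
 * the common case.
 */
#if 0
	rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
	    pmap_addr + (va - entry->vme_start), physpage_p);
	if (rc != KERN_SUCCESS) {
		/* slow path: build a vm_object_fault_info and take the full fault */
		rc = vm_fault_internal(map, va, prot, wire_tag, pmap,
		    pmap_addr + (va - entry->vme_start), physpage_p, &fault_info);
	}
#endif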
7535 
7536 /*
7537  *	Routine:	vm_fault_copy_cleanup
7538  *	Purpose:
7539  *		Release a page used by vm_fault_copy.
7540  */
7541 
7542 static void
7543 vm_fault_copy_cleanup(
7544 	vm_page_t       page,
7545 	vm_page_t       top_page)
7546 {
7547 	vm_object_t     object = VM_PAGE_OBJECT(page);
7548 
7549 	vm_object_lock(object);
7550 	vm_page_wakeup_done(object, page);
7551 	if (!VM_PAGE_PAGEABLE(page)) {
7552 		vm_page_lockspin_queues();
7553 		if (!VM_PAGE_PAGEABLE(page)) {
7554 			vm_page_activate(page);
7555 		}
7556 		vm_page_unlock_queues();
7557 	}
7558 	vm_fault_cleanup(object, top_page);
7559 }
7560 
7561 static void
7562 vm_fault_copy_dst_cleanup(
7563 	vm_page_t       page)
7564 {
7565 	vm_object_t     object;
7566 
7567 	if (page != VM_PAGE_NULL) {
7568 		object = VM_PAGE_OBJECT(page);
7569 		vm_object_lock(object);
7570 		vm_page_lockspin_queues();
7571 		vm_page_unwire(page, TRUE);
7572 		vm_page_unlock_queues();
7573 		vm_object_paging_end(object);
7574 		vm_object_unlock(object);
7575 	}
7576 }
7577 
7578 /*
7579  *	Routine:	vm_fault_copy
7580  *
7581  *	Purpose:
7582  *		Copy pages from one virtual memory object to another --
7583  *		neither the source nor destination pages need be resident.
7584  *
7585  *		Before actually copying a page, the version associated with
7586  *		the destination address map will be verified.
7587  *
7588  *	In/out conditions:
7589  *		The caller must hold a reference, but not a lock, to
7590  *		each of the source and destination objects and to the
7591  *		destination map.
7592  *
7593  *	Results:
7594  *		Returns KERN_SUCCESS if no errors were encountered in
7595  *		reading or writing the data.  Returns KERN_INTERRUPTED if
7596  *		the operation was interrupted (only possible if the
7597  *		"interruptible" argument is asserted).  Other return values
7598  *		indicate a permanent error in copying the data.
7599  *
7600  *		The actual amount of data copied will be returned in the
7601  *		"copy_size" argument.  In the event that the destination map
7602  *		verification failed, this amount may be less than the amount
7603  *		requested.
7604  */
7605 kern_return_t
7606 vm_fault_copy(
7607 	vm_object_t             src_object,
7608 	vm_object_offset_t      src_offset,
7609 	vm_map_size_t           *copy_size,             /* INOUT */
7610 	vm_object_t             dst_object,
7611 	vm_object_offset_t      dst_offset,
7612 	vm_map_t                dst_map,
7613 	vm_map_version_t         *dst_version,
7614 	int                     interruptible)
7615 {
7616 	vm_page_t               result_page;
7617 
7618 	vm_page_t               src_page;
7619 	vm_page_t               src_top_page;
7620 	vm_prot_t               src_prot;
7621 
7622 	vm_page_t               dst_page;
7623 	vm_page_t               dst_top_page;
7624 	vm_prot_t               dst_prot;
7625 
7626 	vm_map_size_t           amount_left;
7627 	vm_object_t             old_copy_object;
7628 	uint64_t                old_copy_version;
7629 	vm_object_t             result_page_object = NULL;
7630 	kern_return_t           error = 0;
7631 	vm_fault_return_t       result;
7632 
7633 	vm_map_size_t           part_size;
7634 	struct vm_object_fault_info fault_info_src = {};
7635 	struct vm_object_fault_info fault_info_dst = {};
7636 
7637 	vmlp_api_start(VM_FAULT_COPY);
7638 	vmlp_range_event(dst_map, dst_offset, *copy_size);
7639 
7640 	/*
7641 	 * In order not to confuse the clustered pageins, align
7642 	 * the different offsets on a page boundary.
7643 	 */
7644 
7645 #define RETURN(x)                                       \
7646 	MACRO_BEGIN                                     \
7647 	*copy_size -= amount_left;                      \
7648 	vmlp_api_end(VM_FAULT_COPY, x);                 \
7649 	MACRO_RETURN(x);                                \
7650 	MACRO_END
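	/*
	 * Editor's note: RETURN(x) reports partial progress through the INOUT
	 * "copy_size" argument.  For example, if the caller asked for 0x8000
	 * bytes and the destination map verification fails after 0x4000 bytes
	 * were copied, amount_left is still 0x4000, so *copy_size is trimmed to
	 * 0x4000 on the way out.
	 */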
7651 
7652 	amount_left = *copy_size;
7653 
7654 	fault_info_src.interruptible = interruptible;
7655 	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
7656 	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
7657 	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
7658 	fault_info_src.stealth = TRUE;
7659 
7660 	fault_info_dst.interruptible = interruptible;
7661 	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
7662 	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
7663 	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
7664 	fault_info_dst.stealth = TRUE;
7665 
7666 	do { /* while (amount_left > 0) */
7667 		/*
7668 		 * There may be a deadlock if both source and destination
7669 		 * pages are the same. To avoid this deadlock, the copy must
7670 		 * start by getting the destination page in order to apply
7671 		 * COW semantics if any.
7672 		 */
7673 
7674 RetryDestinationFault:;
7675 
7676 		dst_prot = VM_PROT_WRITE | VM_PROT_READ;
7677 
7678 		vm_object_lock(dst_object);
7679 		vm_object_paging_begin(dst_object);
7680 
7681 		/* cap cluster size at maximum UPL size */
7682 		upl_size_t cluster_size;
7683 		if (os_convert_overflow(amount_left, &cluster_size)) {
7684 			cluster_size = 0 - (upl_size_t)PAGE_SIZE;
7685 		}
7686 		fault_info_dst.cluster_size = cluster_size;
7687 
7688 		dst_page = VM_PAGE_NULL;
7689 		result = vm_fault_page(dst_object,
7690 		    vm_object_trunc_page(dst_offset),
7691 		    VM_PROT_WRITE | VM_PROT_READ,
7692 		    FALSE,
7693 		    FALSE,                    /* page not looked up */
7694 		    &dst_prot, &dst_page, &dst_top_page,
7695 		    (int *)0,
7696 		    &error,
7697 		    dst_map->no_zero_fill,
7698 		    &fault_info_dst);
7699 		switch (result) {
7700 		case VM_FAULT_SUCCESS:
7701 			break;
7702 		case VM_FAULT_RETRY:
7703 			goto RetryDestinationFault;
7704 		case VM_FAULT_MEMORY_SHORTAGE:
7705 			if (vm_page_wait(interruptible)) {
7706 				goto RetryDestinationFault;
7707 			}
7708 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_COPY_MEMORY_SHORTAGE), 0 /* arg */);
7709 			OS_FALLTHROUGH;
7710 		case VM_FAULT_INTERRUPTED:
7711 			RETURN(MACH_SEND_INTERRUPTED);
7712 		case VM_FAULT_SUCCESS_NO_VM_PAGE:
7713 			/* success but no VM page: fail the copy */
7714 			vm_object_paging_end(dst_object);
7715 			vm_object_unlock(dst_object);
7716 			OS_FALLTHROUGH;
7717 		case VM_FAULT_MEMORY_ERROR:
7718 			if (error) {
7719 				vmlp_api_end(VM_FAULT_COPY, error);
7720 				return error;
7721 			} else {
7722 				vmlp_api_end(VM_FAULT_COPY, KERN_MEMORY_ERROR);
7723 				return KERN_MEMORY_ERROR;
7724 			}
7725 		default:
7726 			panic("vm_fault_copy: unexpected error 0x%x from "
7727 			    "vm_fault_page()\n", result);
7728 		}
7729 		assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
7730 
7731 		assert(dst_object == VM_PAGE_OBJECT(dst_page));
7732 		old_copy_object = dst_object->vo_copy;
7733 		old_copy_version = dst_object->vo_copy_version;
7734 
7735 		/*
7736 		 * There exists the possibility that the source and
7737 		 * destination page are the same.  But we can't
7738 		 * easily determine that now.  If they are the
7739 		 * same, the call to vm_fault_page() for the
7740 		 * destination page will deadlock.  To prevent this we
7741 		 * wire the page so we can drop busy without having
7742 		 * the page daemon steal the page.  We clean up the
7743 		 * top page  but keep the paging reference on the object
7744 		 * holding the dest page so it doesn't go away.
7745 		 */
7746 
7747 		vm_page_lockspin_queues();
7748 		vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
7749 		vm_page_unlock_queues();
7750 		vm_page_wakeup_done(dst_object, dst_page);
7751 		vm_object_unlock(dst_object);
7752 
7753 		if (dst_top_page != VM_PAGE_NULL) {
7754 			vm_object_lock(dst_object);
7755 			VM_PAGE_FREE(dst_top_page);
7756 			vm_object_paging_end(dst_object);
7757 			vm_object_unlock(dst_object);
7758 		}
7759 
7760 RetrySourceFault:;
7761 
7762 		if (src_object == VM_OBJECT_NULL) {
7763 			/*
7764 			 *	No source object.  We will just
7765 			 *	zero-fill the page in dst_object.
7766 			 */
7767 			src_page = VM_PAGE_NULL;
7768 			result_page = VM_PAGE_NULL;
7769 		} else {
7770 			vm_object_lock(src_object);
7771 			src_page = vm_page_lookup(src_object,
7772 			    vm_object_trunc_page(src_offset));
7773 			if (src_page == dst_page) {
7774 				src_prot = dst_prot;
7775 				result_page = VM_PAGE_NULL;
7776 			} else {
7777 				src_prot = VM_PROT_READ;
7778 				vm_object_paging_begin(src_object);
7779 
7780 				/* cap cluster size at maximum UPL size */
7781 				if (os_convert_overflow(amount_left, &cluster_size)) {
7782 					cluster_size = 0 - (upl_size_t)PAGE_SIZE;
7783 				}
7784 				fault_info_src.cluster_size = cluster_size;
7785 
7786 				result_page = VM_PAGE_NULL;
7787 				result = vm_fault_page(
7788 					src_object,
7789 					vm_object_trunc_page(src_offset),
7790 					VM_PROT_READ, FALSE,
7791 					FALSE, /* page not looked up */
7792 					&src_prot,
7793 					&result_page, &src_top_page,
7794 					(int *)0, &error, FALSE,
7795 					&fault_info_src);
7796 
7797 				switch (result) {
7798 				case VM_FAULT_SUCCESS:
7799 					break;
7800 				case VM_FAULT_RETRY:
7801 					goto RetrySourceFault;
7802 				case VM_FAULT_MEMORY_SHORTAGE:
7803 					if (vm_page_wait(interruptible)) {
7804 						goto RetrySourceFault;
7805 					}
7806 					OS_FALLTHROUGH;
7807 				case VM_FAULT_INTERRUPTED:
7808 					vm_fault_copy_dst_cleanup(dst_page);
7809 					RETURN(MACH_SEND_INTERRUPTED);
7810 				case VM_FAULT_SUCCESS_NO_VM_PAGE:
7811 					/* success but no VM page: fail */
7812 					vm_object_paging_end(src_object);
7813 					vm_object_unlock(src_object);
7814 					OS_FALLTHROUGH;
7815 				case VM_FAULT_MEMORY_ERROR:
7816 					vm_fault_copy_dst_cleanup(dst_page);
7817 					if (error) {
7818 						vmlp_api_end(VM_FAULT_COPY, error);
7819 						return error;
7820 					} else {
7821 						vmlp_api_end(VM_FAULT_COPY, KERN_MEMORY_ERROR);
7822 						return KERN_MEMORY_ERROR;
7823 					}
7824 				default:
7825 					panic("vm_fault_copy(2): unexpected "
7826 					    "error 0x%x from "
7827 					    "vm_fault_page()\n", result);
7828 				}
7829 
7830 				result_page_object = VM_PAGE_OBJECT(result_page);
7831 				assert((src_top_page == VM_PAGE_NULL) ==
7832 				    (result_page_object == src_object));
7833 			}
7834 			assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
7835 			vm_object_unlock(result_page_object);
7836 		}
7837 
7838 		vm_map_lock_read(dst_map);
7839 
7840 		if (!vm_map_verify(dst_map, dst_version)) {
7841 			vm_map_unlock_read(dst_map);
7842 			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7843 				vm_fault_copy_cleanup(result_page, src_top_page);
7844 			}
7845 			vm_fault_copy_dst_cleanup(dst_page);
7846 			break;
7847 		}
7848 		assert(dst_object == VM_PAGE_OBJECT(dst_page));
7849 
7850 		vm_object_lock(dst_object);
7851 
7852 		if ((dst_object->vo_copy != old_copy_object ||
7853 		    dst_object->vo_copy_version != old_copy_version)) {
7854 			vm_object_unlock(dst_object);
7855 			vm_map_unlock_read(dst_map);
7856 			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7857 				vm_fault_copy_cleanup(result_page, src_top_page);
7858 			}
7859 			vm_fault_copy_dst_cleanup(dst_page);
7860 			break;
7861 		}
7862 
7863 		/**
7864 		 * Avoid overwriting a page that has become busy while dst_object's lock was dropped.
7865 		 * Re-run the loop at the same position; if necessary, vm_fault_page() will wait
7866 		 * for the destination page to be unbusied.
7867 		 */
7868 		if (__improbable(dst_page->vmp_busy)) {
7869 			vm_object_unlock(dst_object);
7870 			vm_map_unlock_read(dst_map);
7871 			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7872 				vm_fault_copy_cleanup(result_page, src_top_page);
7873 			}
7874 			vm_fault_copy_dst_cleanup(dst_page);
7875 			continue;
7876 		}
7877 
7878 #if CONFIG_SPTM
7879 		if (__improbable(PMAP_PAGE_IS_USER_EXECUTABLE(dst_page))) {
7880 			/**
7881 			 * We've found a page with an executable frame type, which likely means its physical aperture
7882 			 * mapping is write-protected, so we won't be able to do the copy below.  We'll need to remove
7883 			 * all extant mappings and retype the page, but first we need to make sure we can safely retype.
7884 			 */
7885 			if (__improbable(dst_page->vmp_cleaning || dst_page->vmp_iopl_wired)) {
7886 				/**
7887 				 * Clean up our locking state and source page/object references so that we can safely
7888 				 * sleep on the destination page.
7889 				 */
7890 				vm_object_unlock(dst_object);
7891 				vm_map_unlock_read(dst_map);
7892 				if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7893 					vm_fault_copy_cleanup(result_page, src_top_page);
7894 				}
7895 				vm_object_lock(dst_object);
7896 				assert3p(dst_object, ==, VM_PAGE_OBJECT(dst_page));
7897 				if (dst_page->vmp_iopl_wired) {
7898 					/**
7899 					 * If the page is wired for I/O, we can't safely retype and we can't reasonably
7900 					 * wait for the I/O to finish.
7901 					 */
7902 					vm_object_unlock(dst_object);
7903 					vm_fault_copy_dst_cleanup(dst_page);
7904 					vmlp_api_end(VM_FAULT_COPY, KERN_MEMORY_ERROR);
7905 					return KERN_MEMORY_ERROR;
7906 				} else if (dst_page->vmp_cleaning) {
7907 					/**
7908 					 * We can wait for an in-place clean to finish.
7909 					 * NOTE: The page is still wired and we still hold a paging reference on the object
7910 					 * at this point, both of which will be undone by vm_fault_copy_dst_cleanup().
7911 					 * Is it really safe to sleep on the page in that state?
7912 					 */
7913 					wait_result_t wres = vm_page_sleep(dst_object, dst_page, interruptible, LCK_SLEEP_UNLOCK);
7914 					vm_fault_copy_dst_cleanup(dst_page);
7915 					if (wres == THREAD_AWAKENED || wres == THREAD_RESTART) {
7916 						continue;
7917 					} else {
7918 						vmlp_api_end(VM_FAULT_COPY, KERN_ABORTED);
7919 						return KERN_ABORTED;
7920 					}
7921 				} else {
7922 					/**
7923 					 * The cleaning or I/O state we initially observed went away while the object
7924 					 * lock was dropped.  Since we've torn down much of our state already, we need
7925 					 * to rerun the copy loop at the same position.
7926 					 */
7927 					vm_object_unlock(dst_object);
7928 					vm_fault_copy_dst_cleanup(dst_page);
7929 					continue;
7930 				}
7931 			}
7932 			/**
7933 			 * Remove all existing mappings and retype the page.  Consumers of the page will be forced to
7934 			 * re-fault it and, if necessary, re-validate it for codesigning.
7935 			 */
7936 			pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(dst_page), PMAP_OPTIONS_RETYPE, NULL);
7937 		}
7938 #endif /* CONFIG_SPTM */
7939 
7940 		/**
7941 		 * Copy the page, and note that it is dirty immediately.
7942 		 * NOTE: if we're concerned about lock contention due to holding the object lock across the copy,
7943 		 * we could instead consider marking dst_page busy and dropping the lock, but only if we have some
7944 		 * other means of preventing a CoW bypass on this path.
7945 		 */
7946 
7947 		vm_object_offset_t      src_po, dst_po;
7948 
7949 		src_po = src_offset - vm_object_trunc_page(src_offset);
7950 		dst_po = dst_offset - vm_object_trunc_page(dst_offset);
7951 
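		/*
		 * Copy only up to the end of whichever page-relative run ends
		 * first: part_size = PAGE_SIZE - MAX(src_po, dst_po), clamped
		 * to the bytes still remaining.  For example, with 16K pages,
		 * src_po of 0 and dst_po of 0x3000 limit this pass to 0x1000
		 * bytes, ending exactly on the destination page boundary.
		 */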
7952 		if (dst_po > src_po) {
7953 			part_size = PAGE_SIZE - dst_po;
7954 		} else {
7955 			part_size = PAGE_SIZE - src_po;
7956 		}
7957 		if (part_size > (amount_left)) {
7958 			part_size = amount_left;
7959 		}
7960 
7961 		/**
7962 		 * For the case in which we're copying a full page, we don't want to use vm_page_copy() here
7963 		 * because that will do CS validation (unnecessarily in this case) which requires the source
7964 		 * object lock to be held, which in turn would complicate our locking requirements since we
7965 		 * already hold the destination object lock.  Instead we treat the full-page case as simply
7966 		 * a zero-offset/PAGE_SIZE variant of the partial-page case, which keeps the code simpler
7967 		 * anyway.
7968 		 */
7969 		if (result_page == VM_PAGE_NULL) {
7970 			assert((vm_offset_t) dst_po == dst_po);
7971 			assert((vm_size_t) part_size == part_size);
7972 			vm_page_part_zero_fill(dst_page,
7973 			    (vm_offset_t) dst_po,
7974 			    (vm_size_t) part_size);
7975 		} else {
7976 			assert((vm_offset_t) src_po == src_po);
7977 			assert((vm_offset_t) dst_po == dst_po);
7978 			assert((vm_size_t) part_size == part_size);
7979 			vm_page_part_copy(result_page,
7980 			    (vm_offset_t) src_po,
7981 			    dst_page,
7982 			    (vm_offset_t) dst_po,
7983 			    (vm_size_t)part_size);
7984 			if (!dst_page->vmp_dirty) {
7985 				SET_PAGE_DIRTY(dst_page, TRUE);
7986 			}
7987 		}
7988 		vm_object_unlock(dst_object);
7989 
7990 		/*
7991 		 *	Unlock everything, and return
7992 		 */
7993 
7994 		vm_map_unlock_read(dst_map);
7995 
7996 		if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7997 			vm_fault_copy_cleanup(result_page, src_top_page);
7998 		}
7999 		vm_fault_copy_dst_cleanup(dst_page);
8000 
8001 		amount_left -= part_size;
8002 		src_offset += part_size;
8003 		dst_offset += part_size;
8004 	} while (amount_left > 0);
8005 
8006 	RETURN(KERN_SUCCESS);
8007 #undef  RETURN
8008 
8009 	/*NOTREACHED*/
8010 }
8011 
8012 #if     VM_FAULT_CLASSIFY
8013 /*
8014  *	Temporary statistics gathering support.
8015  */
8016 
8017 /*
8018  *	Statistics arrays:
8019  */
8020 #define VM_FAULT_TYPES_MAX      5
8021 #define VM_FAULT_LEVEL_MAX      8
8022 
8023 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
8024 
8025 #define VM_FAULT_TYPE_ZERO_FILL 0
8026 #define VM_FAULT_TYPE_MAP_IN    1
8027 #define VM_FAULT_TYPE_PAGER     2
8028 #define VM_FAULT_TYPE_COPY      3
8029 #define VM_FAULT_TYPE_OTHER     4
8030 
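/*
 * vm_fault_stats is indexed as [fault type][shadow-chain depth], with the
 * depth clamped to the last bucket for deep shadow chains.
 */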
8031 
8032 void
8033 vm_fault_classify(vm_object_t           object,
8034     vm_object_offset_t    offset,
8035     vm_prot_t             fault_type)
8036 {
8037 	int             type, level = 0;
8038 	vm_page_t       m;
8039 
8040 	while (TRUE) {
8041 		m = vm_page_lookup(object, offset);
8042 		if (m != VM_PAGE_NULL) {
8043 			if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
8044 				type = VM_FAULT_TYPE_OTHER;
8045 				break;
8046 			}
8047 			if (((fault_type & VM_PROT_WRITE) == 0) ||
8048 			    ((level == 0) && object->vo_copy == VM_OBJECT_NULL)) {
8049 				type = VM_FAULT_TYPE_MAP_IN;
8050 				break;
8051 			}
8052 			type = VM_FAULT_TYPE_COPY;
8053 			break;
8054 		} else {
8055 			if (object->pager_created) {
8056 				type = VM_FAULT_TYPE_PAGER;
8057 				break;
8058 			}
8059 			if (object->shadow == VM_OBJECT_NULL) {
8060 				type = VM_FAULT_TYPE_ZERO_FILL;
8061 				break;
8062 			}
8063 
8064 			offset += object->vo_shadow_offset;
8065 			object = object->shadow;
8066 			level++;
8067 			continue;
8068 		}
8069 	}
8070 
8071 	if (level >= VM_FAULT_LEVEL_MAX) {
8072 		level = VM_FAULT_LEVEL_MAX - 1;
8073 	}
8074 
8075 	vm_fault_stats[type][level] += 1;
8076 
8077 	return;
8078 }
8079 
8080 /* cleanup routine to call from debugger */
8081 
8082 void
8083 vm_fault_classify_init(void)
8084 {
8085 	int type, level;
8086 
8087 	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
8088 		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
8089 			vm_fault_stats[type][level] = 0;
8090 		}
8091 	}
8092 
8093 	return;
8094 }
8095 #endif  /* VM_FAULT_CLASSIFY */
8096 
8097 static inline bool
8098 object_supports_coredump(const vm_object_t object)
8099 {
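	/*
	 * Only report default-cacheable memory (plus MTE-tagged memory where
	 * supported) as dumpable; any other memory type makes the lightweight
	 * fault path bail out.
	 */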
8100 	switch (object->wimg_bits & VM_WIMG_MASK) {
8101 	case VM_WIMG_DEFAULT:
8102 		return true;
8103 #if HAS_MTE
8104 	case VM_WIMG_MTE:
8105 		return true;
8106 #endif /* HAS_MTE */
8107 	default:
8108 		return false;
8109 	}
8110 }
8111 
8112 vm_offset_t
8113 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr, bool multi_cpu)
8114 {
8115 	vm_map_entry_t  entry;
8116 	vm_object_t     object;
8117 	vm_offset_t     object_offset;
8118 	vm_page_t       m;
8119 	int             compressor_external_state, compressed_count_delta;
8120 	vm_compressor_options_t             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
8121 	int             my_fault_type = VM_PROT_READ;
8122 	kern_return_t   kr;
8123 	int effective_page_mask, effective_page_size;
8124 	int             my_cpu_no = cpu_number();
8125 	ppnum_t         decomp_ppnum;
8126 	addr64_t        decomp_paddr;
8127 
8128 	vmlp_api_start(KDP_LIGHTWEIGHT_FAULT);
8129 
8130 	if (multi_cpu) {
8131 		compressor_flags |= C_KDP_MULTICPU;
8132 	}
8133 
8134 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
8135 		effective_page_mask = VM_MAP_PAGE_MASK(map);
8136 		effective_page_size = VM_MAP_PAGE_SIZE(map);
8137 	} else {
8138 		effective_page_mask = PAGE_MASK;
8139 		effective_page_size = PAGE_SIZE;
8140 	}
8141 
8142 	if (not_in_kdp) {
8143 		panic("kdp_lightweight_fault called from outside of debugger context");
8144 	}
8145 
8146 	assert(map != VM_MAP_NULL);
8147 
8148 	assert((cur_target_addr & effective_page_mask) == 0);
8149 	if ((cur_target_addr & effective_page_mask) != 0) {
8150 		vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8151 		return 0;
8152 	}
8153 
8154 	if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
8155 		vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8156 		return 0;
8157 	}
8158 
8159 	if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
8160 		vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8161 		return 0;
8162 	}
8163 
8164 	vmlp_range_event_entry(map, entry);
8165 
8166 	if (entry->is_sub_map) {
8167 		vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8168 		return 0;
8169 	}
8170 
8171 	object = VME_OBJECT(entry);
8172 	if (object == VM_OBJECT_NULL) {
8173 		vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8174 		return 0;
8175 	}
8176 
8177 	object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
8178 
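	/*
	 * Walk the shadow chain looking for the page.  Since we're in
	 * debugger context we never block: bail out (return 0) if any object
	 * lock is held exclusively or the page is in a transient state.  A
	 * page resident only in the compressor is decompressed into a
	 * dedicated scratch page instead.
	 */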
8179 	while (TRUE) {
8180 		if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
8181 			vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8182 			return 0;
8183 		}
8184 
8185 		if (object->pager_created && (object->paging_in_progress ||
8186 		    object->activity_in_progress)) {
8187 			vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8188 			return 0;
8189 		}
8190 
8191 		m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));
8192 
8193 		if (m != VM_PAGE_NULL) {
8194 			if (!object_supports_coredump(object)) {
8195 				vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8196 				return 0;
8197 			}
8198 
8199 			if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done ||
8200 			    m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_cleaning ||
8201 			    m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
8202 				vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8203 				return 0;
8204 			}
8205 
8206 			assert(!vm_page_is_private(m));
8207 			if (vm_page_is_private(m)) {
8208 				vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8209 				return 0;
8210 			}
8211 
8212 			assert(!vm_page_is_fictitious(m));
8213 			if (vm_page_is_fictitious(m)) {
8214 				vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8215 				return 0;
8216 			}
8217 
8218 			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8219 			if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8220 				vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8221 				return 0;
8222 			}
8223 
8224 			vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, 0);
8225 			return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
8226 		}
8227 
8228 		compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
8229 
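		/*
		 * Pick a scratch page to decompress into: a per-CPU page when
		 * other CPUs may be active (multi_cpu), otherwise the single
		 * panic-time page.
		 */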
8230 		if (multi_cpu) {
8231 			assert(vm_compressor_kdp_state.kc_decompressed_pages_ppnum != NULL);
8232 			assert(vm_compressor_kdp_state.kc_decompressed_pages_paddr != NULL);
8233 			decomp_ppnum = vm_compressor_kdp_state.kc_decompressed_pages_ppnum[my_cpu_no];
8234 			decomp_paddr = vm_compressor_kdp_state.kc_decompressed_pages_paddr[my_cpu_no];
8235 		} else {
8236 			decomp_ppnum = vm_compressor_kdp_state.kc_panic_decompressed_page_ppnum;
8237 			decomp_paddr = vm_compressor_kdp_state.kc_panic_decompressed_page_paddr;
8238 		}
8239 
8240 		if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
8241 			if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
8242 #if HAS_MTE
8243 				if (vm_object_is_mte_mappable(object)) {
8244 					compressor_flags |= C_MTE | C_MTE_DROP_TAGS;
8245 				}
8246 #endif /* HAS_MTE */
8247 				kr = vm_compressor_pager_get(object->pager,
8248 				    vm_object_trunc_page(object_offset + object->paging_offset),
8249 				    decomp_ppnum, &my_fault_type,
8250 				    compressor_flags, &compressed_count_delta);
8251 				if (kr == KERN_SUCCESS) {
8252 					vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, 0);
8253 					return decomp_paddr;
8254 				} else {
8255 					vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8256 					return 0;
8257 				}
8258 			}
8259 		}
8260 
8261 		if (object->shadow == VM_OBJECT_NULL) {
8262 			vmlp_api_end(KDP_LIGHTWEIGHT_FAULT, -1);
8263 			return 0;
8264 		}
8265 
8266 		object_offset += object->vo_shadow_offset;
8267 		object = object->shadow;
8268 	}
8269 }
8270 
8271 /*
8272  * vm_page_validate_cs_fast():
8273  * Performs a few quick checks to determine if the page's code signature
8274  * really needs to be fully validated.  It could:
8275  *	1. have been modified (i.e. automatically tainted),
8276  *	2. have already been validated,
8277  *	3. have already been found to be tainted,
8278  *	4. no longer have a backing store.
8279  * Returns FALSE if the page needs to be fully validated.
8280  */
8281 static boolean_t
8282 vm_page_validate_cs_fast(
8283 	vm_page_t       page,
8284 	vm_map_size_t   fault_page_size,
8285 	vm_map_offset_t fault_phys_offset)
8286 {
8287 	vm_object_t     object;
8288 
8289 	object = VM_PAGE_OBJECT(page);
8290 	vm_object_lock_assert_held(object);
8291 
8292 	if (page->vmp_wpmapped &&
8293 	    !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
8294 		/*
8295 		 * This page was mapped for "write" access sometime in the
8296 		 * past and could still be modifiable in the future.
8297 		 * Consider it tainted.
8298 		 * [ If the page was already found to be "tainted", no
8299 		 * need to re-validate. ]
8300 		 */
8301 		vm_object_lock_assert_exclusive(object);
8302 		VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
8303 		VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
8304 		if (cs_debug) {
8305 			printf("CODESIGNING: %s: "
8306 			    "page %p obj %p off 0x%llx "
8307 			    "was modified\n",
8308 			    __FUNCTION__,
8309 			    page, object, page->vmp_offset);
8310 		}
8311 		vm_cs_validated_dirtied++;
8312 	}
8313 
8314 	if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
8315 	    VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
8316 		return TRUE;
8317 	}
8318 	vm_object_lock_assert_exclusive(object);
8319 
8320 #if CHECK_CS_VALIDATION_BITMAP
8321 	kern_return_t kr;
8322 
8323 	kr = vnode_pager_cs_check_validation_bitmap(
8324 		object->pager,
8325 		page->vmp_offset + object->paging_offset,
8326 		CS_BITMAP_CHECK);
8327 	if (kr == KERN_SUCCESS) {
8328 		page->vmp_cs_validated = VMP_CS_ALL_TRUE;
8329 		page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
8330 		vm_cs_bitmap_validated++;
8331 		return TRUE;
8332 	}
8333 #endif /* CHECK_CS_VALIDATION_BITMAP */
8334 
8335 	if (!object->alive || object->terminating || object->pager == NULL) {
8336 		/*
8337 		 * The object is terminating and we don't have its pager
8338 		 * so we can't validate the data...
8339 		 */
8340 		return TRUE;
8341 	}
8342 
8343 	/* we need to really validate this page */
8344 	vm_object_lock_assert_exclusive(object);
8345 	return FALSE;
8346 }
8347 
8348 void
8349 vm_page_validate_cs_mapped_slow(
8350 	vm_page_t       page,
8351 	const void      *kaddr)
8352 {
8353 	vm_object_t             object;
8354 	memory_object_offset_t  mo_offset;
8355 	memory_object_t         pager;
8356 	struct vnode            *vnode;
8357 	int                     validated, tainted, nx;
8358 
8359 	assert(page->vmp_busy);
8360 	object = VM_PAGE_OBJECT(page);
8361 	vm_object_lock_assert_exclusive(object);
8362 
8363 	vm_cs_validates++;
8364 
8365 	/*
8366 	 * Since we get here to validate a page that was brought in by
8367 	 * the pager, we know that this pager is all set up and ready
8368 	 * by now.
8369 	 */
8370 	assert(object->code_signed);
8371 	assert(!object->internal);
8372 	assert(object->pager != NULL);
8373 	assert(object->pager_ready);
8374 
8375 	pager = object->pager;
8376 	assert(object->paging_in_progress);
8377 	vnode = vnode_pager_lookup_vnode(pager);
8378 	mo_offset = page->vmp_offset + object->paging_offset;
8379 
8380 	/* verify the SHA1 hash for this page */
8381 	validated = 0;
8382 	tainted = 0;
8383 	nx = 0;
8384 	cs_validate_page(vnode,
8385 	    pager,
8386 	    mo_offset,
8387 	    (const void *)((const char *)kaddr),
8388 	    &validated,
8389 	    &tainted,
8390 	    &nx);
8391 
8392 	page->vmp_cs_validated |= validated;
8393 	page->vmp_cs_tainted |= tainted;
8394 	page->vmp_cs_nx |= nx;
8395 
8396 #if CHECK_CS_VALIDATION_BITMAP
8397 	if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
8398 	    page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
8399 		vnode_pager_cs_check_validation_bitmap(object->pager,
8400 		    mo_offset,
8401 		    CS_BITMAP_SET);
8402 	}
8403 #endif /* CHECK_CS_VALIDATION_BITMAP */
8404 }
8405 
8406 void
8407 vm_page_validate_cs_mapped(
8408 	vm_page_t       page,
8409 	vm_map_size_t   fault_page_size,
8410 	vm_map_offset_t fault_phys_offset,
8411 	const void      *kaddr)
8412 {
8413 	if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
8414 		vm_page_validate_cs_mapped_slow(page, kaddr);
8415 	}
8416 }
8417 
8418 static void
8419 vm_page_map_and_validate_cs(
8420 	vm_object_t     object,
8421 	vm_page_t       page)
8422 {
8423 	vm_object_offset_t      offset;
8424 	vm_map_offset_t         koffset;
8425 	vm_map_size_t           ksize;
8426 	vm_offset_t             kaddr;
8427 	kern_return_t           kr;
8428 	boolean_t               busy_page;
8429 	boolean_t               need_unmap;
8430 
8431 	vm_object_lock_assert_exclusive(object);
8432 
8433 	assert(object->code_signed);
8434 	offset = page->vmp_offset;
8435 
8436 	busy_page = page->vmp_busy;
8437 	if (!busy_page) {
8438 		/* keep page busy while we map (and unlock) the VM object */
8439 		page->vmp_busy = TRUE;
8440 	}
8441 
8442 	/*
8443 	 * Take a paging reference on the VM object
8444 	 * to protect it from collapse or bypass,
8445 	 * and keep it from disappearing too.
8446 	 */
8447 	vm_object_paging_begin(object);
8448 
8449 	/* map the page in the kernel address space */
8450 	ksize = PAGE_SIZE_64;
8451 	koffset = 0;
8452 	need_unmap = FALSE;
8453 	kr = vm_paging_map_object(page,
8454 	    object,
8455 	    offset,
8456 	    VM_PROT_READ,
8457 	    FALSE,                       /* can't unlock object ! */
8458 	    &ksize,
8459 	    &koffset,
8460 	    &need_unmap);
8461 	if (kr != KERN_SUCCESS) {
8462 		panic("%s: could not map page: 0x%x", __FUNCTION__, kr);
8463 	}
8464 	kaddr = CAST_DOWN(vm_offset_t, koffset);
8465 
8466 	/* validate the mapped page */
8467 	vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
8468 
8469 	assert(page->vmp_busy);
8470 	assert(object == VM_PAGE_OBJECT(page));
8471 	vm_object_lock_assert_exclusive(object);
8472 
8473 	if (!busy_page) {
8474 		vm_page_wakeup_done(object, page);
8475 	}
8476 	if (need_unmap) {
8477 		/* unmap the page from the kernel address space */
8478 		vm_paging_unmap_object(object, koffset, koffset + ksize);
8479 		koffset = 0;
8480 		ksize = 0;
8481 		kaddr = 0;
8482 	}
8483 	vm_object_paging_end(object);
8484 }
8485 
8486 void
8487 vm_page_validate_cs(
8488 	vm_page_t       page,
8489 	vm_map_size_t   fault_page_size,
8490 	vm_map_offset_t fault_phys_offset)
8491 {
8492 	vm_object_t             object;
8493 
8494 	object = VM_PAGE_OBJECT(page);
8495 	vm_object_lock_assert_held(object);
8496 
8497 	if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
8498 		return;
8499 	}
8500 	vm_page_map_and_validate_cs(object, page);
8501 }
8502 
8503 void
8504 vm_page_validate_cs_mapped_chunk(
8505 	vm_page_t       page,
8506 	const void      *kaddr,
8507 	vm_offset_t     chunk_offset,
8508 	vm_size_t       chunk_size,
8509 	boolean_t       *validated_p,
8510 	unsigned        *tainted_p)
8511 {
8512 	vm_object_t             object;
8513 	vm_object_offset_t      offset, offset_in_page;
8514 	memory_object_t         pager;
8515 	struct vnode            *vnode;
8516 	boolean_t               validated;
8517 	unsigned                tainted;
8518 
8519 	*validated_p = FALSE;
8520 	*tainted_p = 0;
8521 
8522 	assert(page->vmp_busy);
8523 	object = VM_PAGE_OBJECT(page);
8524 	vm_object_lock_assert_exclusive(object);
8525 
8526 	assert(object->code_signed);
8527 	offset = page->vmp_offset;
8528 
8529 	if (!object->alive || object->terminating || object->pager == NULL) {
8530 		/*
8531 		 * The object is terminating and we don't have its pager
8532 		 * so we can't validate the data...
8533 		 */
8534 		return;
8535 	}
8536 	/*
8537 	 * Since we get here to validate a page that was brought in by
8538 	 * the pager, we know that this pager is all set up and ready
8539 	 * by now.
8540 	 */
8541 	assert(!object->internal);
8542 	assert(object->pager != NULL);
8543 	assert(object->pager_ready);
8544 
8545 	pager = object->pager;
8546 	assert(object->paging_in_progress);
8547 	vnode = vnode_pager_lookup_vnode(pager);
8548 
8549 	/* verify the signature for this chunk */
8550 	offset_in_page = chunk_offset;
8551 	assert(offset_in_page < PAGE_SIZE);
8552 
8553 	tainted = 0;
8554 	validated = cs_validate_range(vnode,
8555 	    pager,
8556 	    (object->paging_offset +
8557 	    offset +
8558 	    offset_in_page),
8559 	    (const void *)((const char *)kaddr
8560 	    + offset_in_page),
8561 	    chunk_size,
8562 	    &tainted);
8563 	if (validated) {
8564 		*validated_p = TRUE;
8565 	}
8566 	if (tainted) {
8567 		*tainted_p = tainted;
8568 	}
8569 }
8570 
8571 static void
8572 vm_rtfrecord_lock(void)
8573 {
8574 	lck_spin_lock(&vm_rtfr_slock);
8575 }
8576 
8577 static void
8578 vm_rtfrecord_unlock(void)
8579 {
8580 	lck_spin_unlock(&vm_rtfr_slock);
8581 }
8582 
8583 unsigned int
8584 vmrtfaultinfo_bufsz(void)
8585 {
8586 	return vmrtf_num_records * sizeof(vm_rtfault_record_t);
8587 }
8588 
8589 #include <kern/backtrace.h>
8590 
8591 __attribute__((noinline))
8592 static void
8593 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
8594 {
8595 	uint64_t fend = mach_continuous_time();
8596 
8597 	uint64_t cfpc = 0;
8598 	uint64_t ctid = cthread->thread_id;
8599 	uint64_t cupid = get_current_unique_pid();
8600 
8601 	uintptr_t bpc = 0;
8602 	errno_t btr = 0;
8603 
8604 	/*
8605 	 * Capture a single-frame backtrace.  This extracts just the program
8606 	 * counter at the point of the fault, and should not use copyin to get
8607 	 * Rosetta save state.
8608 	 */
8609 	struct backtrace_control ctl = {
8610 		.btc_user_thread = cthread,
8611 		.btc_user_copy = backtrace_user_copy_error,
8612 	};
8613 	unsigned int bfrs = backtrace_user(&bpc, 1U, &ctl, NULL);
8614 	if ((btr == 0) && (bfrs > 0)) {
8615 		cfpc = bpc;
8616 	}
8617 
8618 	assert((fstart != 0) && fend >= fstart);
8619 	vm_rtfrecord_lock();
8620 	assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
8621 
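	/*
	 * The fault records form a fixed-size ring: vmrtfr_curi advances on
	 * each recorded fault and wraps back to 0 once it passes vmrtfr_maxi.
	 */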
8622 	vmrtfrs.vmrtf_total++;
8623 	vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
8624 
8625 	cvmr->rtfabstime = fstart;
8626 	cvmr->rtfduration = fend - fstart;
8627 	cvmr->rtfaddr = fault_vaddr;
8628 	cvmr->rtfpc = cfpc;
8629 	cvmr->rtftype = type_of_fault;
8630 	cvmr->rtfupid = cupid;
8631 	cvmr->rtftid = ctid;
8632 
8633 	if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
8634 		vmrtfrs.vmrtfr_curi = 0;
8635 	}
8636 
8637 	vm_rtfrecord_unlock();
8638 }
8639 
8640 int
8641 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
8642 {
8643 	vm_rtfault_record_t *cvmrd = vrecords;
8644 	size_t residue = vrecordsz;
8645 	size_t numextracted = 0;
8646 	boolean_t early_exit = FALSE;
8647 
8648 	vm_rtfrecord_lock();
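	/*
	 * Walk the whole ring, copying out records that belong to the
	 * requesting pid (or every record, for root on DEVELOPMENT/DEBUG
	 * kernels) until the caller's buffer runs out of space.
	 */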
8649 
8650 	for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
8651 		if (residue < sizeof(vm_rtfault_record_t)) {
8652 			early_exit = TRUE;
8653 			break;
8654 		}
8655 
8656 		if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
8657 #if     DEVELOPMENT || DEBUG
8658 			if (isroot == FALSE) {
8659 				continue;
8660 			}
8661 #else
8662 			continue;
8663 #endif /* DEVELOPMENT || DEBUG */
8664 		}
8665 
8666 		*cvmrd = vmrtfrs.vm_rtf_records[vmfi];
8667 		cvmrd++;
8668 		residue -= sizeof(vm_rtfault_record_t);
8669 		numextracted++;
8670 	}
8671 
8672 	vm_rtfrecord_unlock();
8673 
8674 	*vmrtfrv = numextracted;
8675 	return early_exit;
8676 }
8677 
8678 /*
8679  * Only allow one diagnosis to be in flight at a time, to avoid
8680  * creating too much additional memory usage.
8681  */
8682 static volatile uint_t vmtc_diagnosing;
8683 unsigned int vmtc_total = 0;
8684 
8685 /*
8686  * Type used to update telemetry for the diagnosis counts.
8687  */
8688 CA_EVENT(vmtc_telemetry,
8689     CA_INT, vmtc_num_byte,            /* number of corrupt bytes found */
8690     CA_BOOL, vmtc_undiagnosed,        /* undiagnosed because more than 1 at a time */
8691     CA_BOOL, vmtc_not_eligible,       /* the page didn't qualify */
8692     CA_BOOL, vmtc_copyin_fail,        /* unable to copy in the page */
8693     CA_BOOL, vmtc_not_found,          /* no corruption found even though CS failed */
8694     CA_BOOL, vmtc_one_bit_flip,       /* single bit flip */
8695     CA_BOOL, vmtc_testing);           /* caused on purpose by testing */
8696 
8697 #if DEVELOPMENT || DEBUG
8698 /*
8699  * Buffers used to compare before/after page contents.
8700  * Stashed to aid when debugging crashes.
8701  */
8702 static size_t vmtc_last_buffer_size = 0;
8703 static uint64_t *vmtc_last_before_buffer = NULL;
8704 static uint64_t *vmtc_last_after_buffer = NULL;
8705 
8706 /*
8707  * Needed to record corruptions due to testing.
8708  */
8709 static uintptr_t corruption_test_va = 0;
8710 #endif /* DEVELOPMENT || DEBUG */
8711 
8712 /*
8713  * Stash a copy of data from a possibly corrupt page.
8714  */
8715 static uint64_t *
8716 vmtc_get_page_data(
8717 	vm_map_offset_t code_addr,
8718 	vm_page_t       page)
8719 {
8720 	uint64_t        *buffer = NULL;
8721 	addr64_t        buffer_paddr;
8722 	addr64_t        page_paddr;
8723 	extern void     bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes);
8724 	uint_t          size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
8725 
8726 	/*
8727 	 * Need an aligned buffer to do a physical copy.
8728 	 */
8729 	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)&buffer,
8730 	    size, size - 1, KMA_KOBJECT, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
8731 		return NULL;
8732 	}
8733 	buffer_paddr = kvtophys((vm_offset_t)buffer);
8734 	page_paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(page));
8735 
8736 	/* adjust the page start address if we need only 4K of a 16K page */
8737 	if (size < PAGE_SIZE) {
8738 		uint_t subpage_start = ((code_addr & (PAGE_SIZE - 1)) & ~(size - 1));
8739 		page_paddr += subpage_start;
8740 	}
8741 
8742 	bcopy_phys(page_paddr, buffer_paddr, size);
8743 	return buffer;
8744 }
8745 
8746 /*
8747  * Set things up so we can diagnose a potential text page corruption.
8748  */
8749 static uint64_t *
8750 vmtc_text_page_diagnose_setup(
8751 	vm_map_offset_t code_addr,
8752 	vm_page_t       page,
8753 	CA_EVENT_TYPE(vmtc_telemetry) *event)
8754 {
8755 	uint64_t        *buffer = NULL;
8756 
8757 	/*
8758 	 * If another is being diagnosed, skip this one.
8759 	 */
8760 	if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) {
8761 		event->vmtc_undiagnosed = true;
8762 		return NULL;
8763 	}
8764 
8765 	/*
8766 	 * Get the contents of the corrupt page.
8767 	 */
8768 	buffer = vmtc_get_page_data(code_addr, page);
8769 	if (buffer == NULL) {
8770 		event->vmtc_copyin_fail = true;
8771 		if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
8772 			panic("Bad compare and swap in setup!");
8773 		}
8774 		return NULL;
8775 	}
8776 	return buffer;
8777 }
8778 
8779 /*
8780  * Diagnose the text page by comparing its contents with
8781  * the one we've previously saved.
8782  */
8783 static void
8784 vmtc_text_page_diagnose(
8785 	vm_map_offset_t code_addr,
8786 	uint64_t        *old_code_buffer,
8787 	CA_EVENT_TYPE(vmtc_telemetry) *event)
8788 {
8789 	uint64_t        *new_code_buffer;
8790 	size_t          size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
8791 	uint_t          count = (uint_t)size / sizeof(uint64_t);
8792 	uint_t          diff_count = 0;
8793 	bool            bit_flip = false;
8794 	uint_t          b;
8795 	uint64_t        *new;
8796 	uint64_t        *old;
8797 
8798 	new_code_buffer = kalloc_data(size, Z_WAITOK);
8799 	assert(new_code_buffer != NULL);
8800 	if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) {
8801 		/* copyin error, so undo things */
8802 		event->vmtc_copyin_fail = true;
8803 		goto done;
8804 	}
8805 
8806 	new = new_code_buffer;
8807 	old = old_code_buffer;
8808 	for (; count-- > 0; ++new, ++old) {
8809 		if (*new == *old) {
8810 			continue;
8811 		}
8812 
8813 		/*
8814 		 * On first diff, check for a single bit flip
8815 		 */
8816 		if (diff_count == 0) {
8817 			uint64_t x = (*new ^ *old);
8818 			assert(x != 0);
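			/* exactly one bit differs iff x is a power of two, i.e. (x & (x - 1)) == 0 */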
8819 			if ((x & (x - 1)) == 0) {
8820 				bit_flip = true;
8821 				++diff_count;
8822 				continue;
8823 			}
8824 		}
8825 
8826 		/*
8827 		 * count up the number of different bytes.
8828 		 */
8829 		for (b = 0; b < sizeof(uint64_t); ++b) {
8830 			char *n = (char *)new;
8831 			char *o = (char *)old;
8832 			if (n[b] != o[b]) {
8833 				++diff_count;
8834 			}
8835 		}
8836 	}
8837 
8838 	if (diff_count > 1) {
8839 		bit_flip = false;
8840 	}
8841 
8842 	if (diff_count == 0) {
8843 		event->vmtc_not_found = true;
8844 	} else {
8845 		event->vmtc_num_byte = diff_count;
8846 	}
8847 	if (bit_flip) {
8848 		event->vmtc_one_bit_flip = true;
8849 	}
8850 
8851 done:
8852 	/*
8853 	 * Free up the code copy buffers, but save the last
8854 	 * set on development / debug kernels in case they
8855 	 * can provide evidence for debugging memory stomps.
8856 	 */
8857 #if DEVELOPMENT || DEBUG
8858 	if (vmtc_last_before_buffer != NULL) {
8859 		kmem_free(kernel_map, (vm_offset_t)vmtc_last_before_buffer, vmtc_last_buffer_size);
8860 	}
8861 	if (vmtc_last_after_buffer != NULL) {
8862 		kfree_data(vmtc_last_after_buffer, vmtc_last_buffer_size);
8863 	}
8864 	vmtc_last_before_buffer = old_code_buffer;
8865 	vmtc_last_after_buffer = new_code_buffer;
8866 	vmtc_last_buffer_size = size;
8867 #else /* DEVELOPMENT || DEBUG */
8868 	kfree_data(new_code_buffer, size);
8869 	kmem_free(kernel_map, (vm_offset_t)old_code_buffer, size);
8870 #endif /* DEVELOPMENT || DEBUG */
8871 
8872 	/*
8873 	 * We're finished, so clear the diagnosing flag.
8874 	 */
8875 	if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
8876 		panic("Bad compare and swap in diagnose!");
8877 	}
8878 }
8879 
8880 /*
8881  * For the given map, virt address, find the object, offset, and page.
8882  * This has to look up the map entry, verify protections, and walk any shadow chains.
8883  * If found, returns with the object locked.
8884  */
8885 static kern_return_t
8886 vmtc_revalidate_lookup(
8887 	vm_map_t               map,
8888 	vm_map_offset_t        vaddr,
8889 	vm_object_t            *ret_object,
8890 	vm_object_offset_t     *ret_offset,
8891 	vm_page_t              *ret_page,
8892 	vm_prot_t              *ret_prot)
8893 {
8894 	vm_object_t            object;
8895 	vm_object_offset_t     offset;
8896 	vm_page_t              page;
8897 	kern_return_t          kr = KERN_SUCCESS;
8898 	uint8_t                object_lock_type = OBJECT_LOCK_EXCLUSIVE;
8899 	vm_map_version_t       version;
8900 	boolean_t              wired;
8901 	struct vm_object_fault_info fault_info = {
8902 		.interruptible = THREAD_UNINT
8903 	};
8904 	vm_map_t               real_map = NULL;
8905 	vm_prot_t              prot;
8906 	vm_object_t            shadow;
8907 
8908 	vmlp_api_start(VMTC_REVALIDATE_LOOKUP);
8909 
8910 	/*
8911 	 * Find the object/offset for the given location/map.
8912 	 * Note this returns with the object locked.
8913 	 */
8914 restart:
8915 	vm_map_lock_read(map);
8916 	object = VM_OBJECT_NULL;        /* in case we come around the restart path */
8917 	kr = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
8918 	    object_lock_type, &version, &object, &offset, &prot, &wired,
8919 	    &fault_info, &real_map, NULL);
8920 	vm_map_unlock_read(map);
8921 	if (real_map != NULL && real_map != map) {
8922 		vm_map_unlock(real_map);
8923 	}
8924 
8925 	/*
8926 	 * If there's no page here, fail.
8927 	 */
8928 	if (kr != KERN_SUCCESS || object == NULL) {
8929 		kr = KERN_FAILURE;
8930 		goto done;
8931 	}
8932 
8933 	/*
8934 	 * Chase down any shadow chains to find the actual page.
8935 	 */
8936 	for (;;) {
8937 		/*
8938 		 * See if the page is on the current object.
8939 		 */
8940 		page = vm_page_lookup(object, vm_object_trunc_page(offset));
8941 		if (page != NULL) {
8942 			/* restart the lookup */
8943 			if (page->vmp_restart) {
8944 				vm_object_unlock(object);
8945 				goto restart;
8946 			}
8947 
8948 			/*
8949 			 * If this page is busy, we need to wait for it.
8950 			 */
8951 			if (page->vmp_busy) {
8952 				vm_page_sleep(object, page, THREAD_INTERRUPTIBLE, LCK_SLEEP_UNLOCK);
8953 				goto restart;
8954 			}
8955 			break;
8956 		}
8957 
8958 		/*
8959 		 * If the object doesn't have the page and
8960 		 * has no shadow, then we can quit.
8961 		 */
8962 		shadow = object->shadow;
8963 		if (shadow == NULL) {
8964 			kr = KERN_FAILURE;
8965 			goto done;
8966 		}
8967 
8968 		/*
8969 		 * Move to the next object
8970 		 */
8971 		offset += object->vo_shadow_offset;
8972 		vm_object_lock(shadow);
8973 		vm_object_unlock(object);
8974 		object = shadow;
8975 		shadow = VM_OBJECT_NULL;
8976 	}
8977 	*ret_object = object;
8978 	*ret_offset = vm_object_trunc_page(offset);
8979 	*ret_page = page;
8980 	*ret_prot = prot;
8981 
8982 done:
8983 	if (kr != KERN_SUCCESS && object != NULL) {
8984 		vm_object_unlock(object);
8985 	}
8986 	vmlp_api_end(VMTC_REVALIDATE_LOOKUP, kr);
8987 	return kr;
8988 }
8989 
8990 /*
8991  * Check if a page is wired, needs extra locking.
8992  */
8993 static bool
8994 is_page_wired(vm_page_t page)
8995 {
8996 	bool result;
8997 	vm_page_lock_queues();
8998 	result = VM_PAGE_WIRED(page);
8999 	vm_page_unlock_queues();
9000 	return result;
9001 }
9002 
9003 /*
9004  * A fatal process error has occurred in the given task.
9005  * Recheck the code signing of the text page at the given
9006  * address to check for a text page corruption.
9007  *
9008  * Returns KERN_FAILURE if a page was found to be corrupt
9009  * by failing to match its code signature. KERN_SUCCESS
9010  * means the page is either valid or we don't have the
9011  * information to say it's corrupt.
9012  */
9013 kern_return_t
9014 revalidate_text_page(task_t task, vm_map_offset_t code_addr)
9015 {
9016 	kern_return_t          kr;
9017 	vm_map_t               map;
9018 	vm_object_t            object = NULL;
9019 	vm_object_offset_t     offset;
9020 	vm_page_t              page = NULL;
9021 	struct vnode           *vnode;
9022 	uint64_t               *diagnose_buffer = NULL;
9023 	CA_EVENT_TYPE(vmtc_telemetry) * event = NULL;
9024 	ca_event_t             ca_event = NULL;
9025 	vm_prot_t              prot;
9026 
9027 	map = task->map;
9028 	if (task->map == NULL) {
9029 		return KERN_SUCCESS;
9030 	}
9031 
9032 	kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page, &prot);
9033 	if (kr != KERN_SUCCESS) {
9034 		goto done;
9035 	}
9036 
9037 	/*
9038 	 * The page must be executable.
9039 	 */
9040 	if (!(prot & VM_PROT_EXECUTE)) {
9041 		goto done;
9042 	}
9043 
9044 	/*
9045 	 * The object needs to have a pager.
9046 	 */
9047 	if (object->pager == NULL) {
9048 		goto done;
9049 	}
9050 
9051 	/*
9052 	 * Needs to be a vnode backed page to have a signature.
9053 	 */
9054 	vnode = vnode_pager_lookup_vnode(object->pager);
9055 	if (vnode == NULL) {
9056 		goto done;
9057 	}
9058 
9059 	/*
9060 	 * Object checks to see if we should proceed.
9061 	 */
9062 	if (!object->code_signed ||     /* no code signature to check */
9063 	    object->internal ||         /* internal objects aren't signed */
9064 	    object->terminating ||      /* the object and its pages are already going away */
9065 	    !object->pager_ready) {     /* this shouldn't happen, but the check doesn't hurt */
9066 		goto done;
9067 	}
9068 
9069 
9070 	/*
9071 	 * Check the code signature of the page in question.
9072 	 */
9073 	vm_page_map_and_validate_cs(object, page);
9074 
9075 	/*
9076 	 * At this point:
9077 	 * vmp_cs_validated |= validated (set if a code signature exists)
9078 	 * vmp_cs_tainted |= tainted (set if code signature violation)
9079 	 * vmp_cs_nx |= nx;  ??
9080 	 *
9081 	 * if vmp_pmapped then have to pmap_disconnect..
9082 	 * other flags to check on object or page?
9083 	 */
9084 	if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
9085 #if DEBUG || DEVELOPMENT
9086 		/*
9087 		 * On development builds, a boot-arg can be used to cause
9088 		 * a panic, instead of a quiet repair.
9089 		 */
9090 		if (vmtc_panic_instead) {
9091 			panic("Text page corruption detected: vm_page_t 0x%llx", (long long)(uintptr_t)page);
9092 		}
9093 #endif /* DEBUG || DEVELOPMENT */
9094 
9095 		/*
9096 		 * We're going to invalidate this page. Grab a copy of it for comparison.
9097 		 */
9098 		ca_event = CA_EVENT_ALLOCATE(vmtc_telemetry);
9099 		event = ca_event->data;
9100 		diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr, page, event);
9101 
9102 		/*
9103 		 * Invalidate, i.e. toss, the corrupted page.
9104 		 */
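		/*
		 * Only toss the page if its contents can safely be re-fetched
		 * from the backing vnode: it must not be dirty, wired,
		 * precious, absent, in error, or in the middle of cleaning or
		 * laundry.
		 */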
9105 		if (!page->vmp_cleaning &&
9106 		    !page->vmp_laundry &&
9107 		    !vm_page_is_fictitious(page) &&
9108 		    !page->vmp_precious &&
9109 		    !page->vmp_absent &&
9110 		    !VMP_ERROR_GET(page) &&
9111 		    !page->vmp_dirty &&
9112 		    !is_page_wired(page)) {
9113 			if (page->vmp_pmapped) {
9114 				int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
9115 				if (refmod & VM_MEM_MODIFIED) {
9116 					SET_PAGE_DIRTY(page, FALSE);
9117 				}
9118 				if (refmod & VM_MEM_REFERENCED) {
9119 					page->vmp_reference = TRUE;
9120 				}
9121 			}
9122 			/* If the page seems intentionally modified, don't trash it. */
9123 			if (!page->vmp_dirty) {
9124 				VM_PAGE_FREE(page);
9125 			} else {
9126 				event->vmtc_not_eligible = true;
9127 			}
9128 		} else {
9129 			event->vmtc_not_eligible = true;
9130 		}
9131 		vm_object_unlock(object);
9132 		object = VM_OBJECT_NULL;
9133 
9134 		/*
9135 		 * Now try to diagnose the type of failure by faulting
9136 		 * in a new copy and diff'ing it with what we saved.
9137 		 */
9138 		if (diagnose_buffer != NULL) {
9139 			vmtc_text_page_diagnose(code_addr, diagnose_buffer, event);
9140 		}
9141 #if DEBUG || DEVELOPMENT
9142 		if (corruption_test_va != 0) {
9143 			corruption_test_va = 0;
9144 			event->vmtc_testing = true;
9145 		}
9146 #endif /* DEBUG || DEVELOPMENT */
9147 		ktriage_record(thread_tid(current_thread()),
9148 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_TEXT_CORRUPTION),
9149 		    0 /* arg */);
9150 		CA_EVENT_SEND(ca_event);
9151 		printf("Text page corruption detected for pid %d\n", proc_selfpid());
9152 		++vmtc_total;
9153 		return KERN_FAILURE; /* failure means we definitely found a corrupt page */
9154 	}
9155 done:
9156 	if (object != NULL) {
9157 		vm_object_unlock(object);
9158 	}
9159 	return KERN_SUCCESS;
9160 }
9161 
9162 #if DEBUG || DEVELOPMENT
9163 /*
9164  * For implementing unit tests - ask the pmap to corrupt a text page.
9165  * We have to find the page, to get the physical address, then invoke
9166  * the pmap.
9167  */
9168 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
9169 
9170 kern_return_t
9171 vm_corrupt_text_addr(uintptr_t va)
9172 {
9173 	task_t                 task = current_task();
9174 	vm_map_t               map;
9175 	kern_return_t          kr = KERN_SUCCESS;
9176 	vm_object_t            object = VM_OBJECT_NULL;
9177 	vm_object_offset_t     offset;
9178 	vm_page_t              page = NULL;
9179 	pmap_paddr_t           pa;
9180 	vm_prot_t              prot;
9181 
9182 	map = task->map;
9183 	if (task->map == NULL) {
9184 		printf("corrupt_text_addr: no map\n");
9185 		return KERN_FAILURE;
9186 	}
9187 
9188 	kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page, &prot);
9189 	if (kr != KERN_SUCCESS) {
9190 		printf("corrupt_text_addr: page lookup failed\n");
9191 		return kr;
9192 	}
9193 	if (!(prot & VM_PROT_EXECUTE)) {
9194 		printf("corrupt_text_addr: page not executable\n");
		vm_object_unlock(object);
9195 		return KERN_FAILURE;
9196 	}
9197 
9198 	/* get the physical address to use */
9199 	pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));
9200 
9201 	/*
9202 	 * Check we have something we can work with.
9203 	 * Due to racing with pageout as we enter the sysctl,
9204 	 * it's theoretically possible to have the page disappear, just
9205 	 * before the lookup.
9206 	 *
9207 	 * That's highly unlikely to happen often. I've filed a radar 72857482
9208 	 * to bubble up the error here to the sysctl result and have the
9209 	 * test not FAIL in that case.
9210 	 */
9211 	if (page->vmp_busy) {
9212 		printf("corrupt_text_addr: vmp_busy\n");
9213 		kr = KERN_FAILURE;
9214 	}
9215 	if (page->vmp_cleaning) {
9216 		printf("corrupt_text_addr: vmp_cleaning\n");
9217 		kr = KERN_FAILURE;
9218 	}
9219 	if (page->vmp_laundry) {
9220 		printf("corrupt_text_addr: vmp_laundry\n");
9221 		kr = KERN_FAILURE;
9222 	}
9223 	if (vm_page_is_fictitious(page)) {
9224 		printf("corrupt_text_addr: vmp_fictitious\n");
9225 		kr = KERN_FAILURE;
9226 	}
9227 	if (page->vmp_precious) {
9228 		printf("corrupt_text_addr: vmp_precious\n");
9229 		kr = KERN_FAILURE;
9230 	}
9231 	if (page->vmp_absent) {
9232 		printf("corrupt_text_addr: vmp_absent\n");
9233 		kr = KERN_FAILURE;
9234 	}
9235 	if (VMP_ERROR_GET(page)) {
9236 		printf("corrupt_text_addr: vmp_error\n");
9237 		kr = KERN_FAILURE;
9238 	}
9239 	if (page->vmp_dirty) {
9240 		printf("corrupt_text_addr: vmp_dirty\n");
9241 		kr = KERN_FAILURE;
9242 	}
9243 	if (is_page_wired(page)) {
9244 		printf("corrupt_text_addr: wired\n");
9245 		kr = KERN_FAILURE;
9246 	}
9247 	if (!page->vmp_pmapped) {
9248 		printf("corrupt_text_addr: !vmp_pmapped\n");
9249 		kr = KERN_FAILURE;
9250 	}
9251 
9252 	if (kr == KERN_SUCCESS) {
9253 		printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
9254 		kr = pmap_test_text_corruption(pa);
9255 		if (kr != KERN_SUCCESS) {
9256 			printf("corrupt_text_addr: pmap error %d\n", kr);
9257 		} else {
9258 			corruption_test_va = va;
9259 		}
9260 	} else {
9261 		printf("corrupt_text_addr: object %p\n", object);
9262 		printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
9263 		printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
9264 		printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
9265 		printf("corrupt_text_addr: vm_page_t %p\n", page);
9266 		printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
9267 		printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
9268 	}
9269 
9270 	if (object != VM_OBJECT_NULL) {
9271 		vm_object_unlock(object);
9272 	}
9273 	return kr;
9274 }
9275 
9276 #endif /* DEBUG || DEVELOPMENT */
9277