xref: /xnu-11417.101.15/bsd/vm/vm_unix.c (revision e3723e1f17661b24996789d8afc084c0c3303b26)
/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1987 Carnegie-Mellon University
 * All rights reserved.  The CMU software License Agreement specifies
 * the terms and conditions for use and redistribution.
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <vm/vm_options.h>

#include <kern/ecc.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/debug.h>
#include <kern/extmod_statistics.h>
#include <mach/mach_traps.h>
#include <mach/port.h>
#include <mach/sdt.h>
#include <mach/task.h>
#include <mach/task_access.h>
#include <mach/task_special_ports.h>
#include <mach/time_value.h>
#include <mach/vm_map.h>
#include <mach/vm_param.h>
#include <mach/vm_prot.h>
#include <machine/machine_routines.h>

#include <sys/file_internal.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dir.h>
#include <sys/namei.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/vnode_internal.h>
#include <sys/mount.h>
#include <sys/xattr.h>
#include <sys/trace.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/user.h>
#include <sys/syslog.h>
#include <sys/stat.h>
#include <sys/sysproto.h>
#include <sys/mman.h>
#include <sys/sysctl.h>
#include <sys/cprotect.h>
#include <sys/kpi_socket.h>
#include <sys/kas_info.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/random.h>
#include <sys/code_signing.h>
#if NECP
#include <net/necp.h>
#endif /* NECP */
#if SKYWALK
#include <skywalk/os_channel.h>
#endif /* SKYWALK */

#include <security/audit/audit.h>
#include <security/mac.h>
#include <bsm/audit_kevents.h>

#include <kern/kalloc.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_kern_xnu.h>
#include <vm/vm_pageout_xnu.h>

#include <mach/shared_region.h>
#include <vm/vm_shared_region_internal.h>

#include <vm/vm_dyld_pager_internal.h>
#include <vm/vm_protos_internal.h>
#if DEVELOPMENT || DEBUG
#include <vm/vm_compressor_info.h>         /* for c_segment_info */
#include <vm/vm_compressor_xnu.h>          /* for vm_compressor_serialize_segment_debug_info() */
#endif
#include <vm/vm_reclaim_xnu.h>

#include <sys/kern_memorystatus.h>
#include <sys/kern_memorystatus_freeze.h>
#include <sys/proc_internal.h>

#include <mach-o/fixup-chains.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <kern/bits.h>

#if CONFIG_CSR
#include <sys/csr.h>
#endif /* CONFIG_CSR */
#include <sys/trust_caches.h>
#include <libkern/amfi/amfi.h>
#include <IOKit/IOBSD.h>

#if VM_MAP_DEBUG_APPLE_PROTECT
SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

#if DEVELOPMENT || DEBUG

static int
sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	vm_offset_t     kaddr;
	kern_return_t   kr;
	int     error = 0;
	int     size = 0;

	error = sysctl_handle_int(oidp, &size, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);

	if (kr == KERN_SUCCESS) {
		kmem_free(kernel_map, kaddr, size);
	}

	return error;
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_alloc_contig, "I", "");
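
/*
 * Illustrative use (not part of the original source): on a DEVELOPMENT or
 * DEBUG kernel, a privileged test can exercise the handler above by writing
 * a size to the sysctl; the handler attempts a physically contiguous
 * allocation of that many bytes and frees it again on success, e.g.:
 *
 *     sysctl -w vm.kmem_alloc_contig=1048576    # try a 1 MiB contiguous allocation
 */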

extern int vm_region_footprint;
SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");

static int
sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	kmem_gobj_stats stats = kmem_get_gobj_stats();

	return SYSCTL_OUT(req, &stats, sizeof(stats));
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");

#endif /* DEVELOPMENT || DEBUG */

static int
sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;

	value = task_self_region_footprint();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}
	task_self_region_footprint_set(value);
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");

static int
sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;

	value = (1 << thread_self_region_page_shift());
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (value != 0 && value != 4096 && value != 16384) {
		return EINVAL;
	}

#if !__ARM_MIXED_PAGE_SIZE__
	if (value != vm_map_page_size(current_map())) {
		return EINVAL;
	}
#endif /* !__ARM_MIXED_PAGE_SIZE__ */

	thread_self_region_page_shift_set(bit_first(value));
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");

static int
sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;
	kern_return_t kr;

	value = task_self_region_info_flags();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	kr = task_self_region_info_flags_set(value);
	if (kr != KERN_SUCCESS) {
		return EINVAL;
	}

	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");


#if DEVELOPMENT || DEBUG
extern int panic_on_unsigned_execute;
SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");

extern int vm_log_xnu_user_debug;
SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
#endif /* DEVELOPMENT || DEBUG */

extern int vm_log_map_delete_permanent_prot_none;
SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");

extern int cs_executable_create_upl;
extern int cs_executable_wire;
SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");

extern int apple_protect_pager_count;
extern int apple_protect_pager_count_mapped;
extern unsigned int apple_protect_pager_cache_limit;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");

#if DEVELOPMENT || DEBUG
extern int radar_20146450;
SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");

extern int macho_printf;
SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");

extern int apple_protect_pager_data_request_debug;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");

extern unsigned int vm_object_copy_delayed_paging_wait_disable;
EXPERIMENT_FACTOR_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");

#if __arm64__
/* These are meant to support the page table accounting unit test. */
extern unsigned int arm_hardware_page_size;
extern unsigned int arm_pt_desc_size;
extern unsigned int arm_pt_root_size;
extern unsigned int inuse_user_tteroot_count;
extern unsigned int inuse_kernel_tteroot_count;
extern unsigned int inuse_user_ttepages_count;
extern unsigned int inuse_kernel_ttepages_count;
extern unsigned int inuse_user_ptepages_count;
extern unsigned int inuse_kernel_ptepages_count;
SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
#if !CONFIG_SPTM
extern unsigned int free_page_size_tt_count;
extern unsigned int free_tt_count;
SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
#endif
#if DEVELOPMENT || DEBUG
extern unsigned long pmap_asid_flushes;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
extern unsigned long pmap_asid_hits;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
extern unsigned long pmap_asid_misses;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
extern unsigned long pmap_speculation_restrictions;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
#endif
#endif /* __arm64__ */
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
#if VM_SCAN_FOR_SHADOW_CHAIN
static int vm_shadow_max_enabled = 0;    /* Disabled by default */
extern int proc_shadow_max(void);
static int
vm_shadow_max SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int value = 0;

	if (vm_shadow_max_enabled) {
		value = proc_shadow_max();
	}

	return SYSCTL_OUT(req, &value, sizeof(value));
}
SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_shadow_max, "I", "");

SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");

#endif /* VM_SCAN_FOR_SHADOW_CHAIN */

SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");

#if PAGE_SLEEP_WITH_INHERITOR
#if DEVELOPMENT || DEBUG
extern uint32_t page_worker_table_size;
SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
#endif /* DEVELOPMENT || DEBUG */
#endif /* PAGE_SLEEP_WITH_INHERITOR */

/*
 * Sysctls related to data/stack execution.  See osfmk/vm/vm_map.c
 */

#if DEVELOPMENT || DEBUG
extern int allow_stack_exec, allow_data_exec;

SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");

#endif /* DEVELOPMENT || DEBUG */

static const char *prot_values[] = {
	"none",
	"read-only",
	"write-only",
	"read-write",
	"execute-only",
	"read-execute",
	"write-execute",
	"read-write-execute"
};

void
log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
{
	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
}

/*
 * shared_region_unnest_logging: level of logging of unnesting events
 * 0	- no logging
 * 1	- throttled logging of unexpected unnesting events (default)
 * 2	- unthrottled logging of unexpected unnesting events
 * 3+	- unthrottled logging of all unnesting events
 */
int shared_region_unnest_logging = 1;

SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_unnest_logging, 0, "");
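
/*
 * Illustrative use (not part of the original source): the logging level can
 * be raised at run time through the sysctl registered above, e.g.
 *
 *     sysctl -w vm.shared_region_unnest_logging=3
 *
 * to log every unnesting event without throttling.
 */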

int vm_shared_region_unnest_log_interval = 10;
int shared_region_unnest_log_count_threshold = 5;


#if XNU_TARGET_OS_OSX

#if defined (__x86_64__)
static int scdir_enforce = 1;
#else /* defined (__x86_64__) */
static int scdir_enforce = 0;   /* AOT caches live elsewhere */
#endif /* defined (__x86_64__) */

static char *scdir_path[] = {
	"/System/Library/dyld/",
	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
	"/System/Cryptexes/OS/System/Library/dyld",
	NULL
};

#else /* XNU_TARGET_OS_OSX */

static int scdir_enforce = 0;
static char *scdir_path[] = {
	"/System/Library/Caches/com.apple.dyld/",
	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	NULL
};

#endif /* XNU_TARGET_OS_OSX */

static char *driverkit_scdir_path[] = {
	"/System/DriverKit/System/Library/dyld/",
#if XNU_TARGET_OS_OSX
	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#else
	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#endif /* XNU_TARGET_OS_OSX */
	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
	NULL
};

#ifndef SECURE_KERNEL
static int
sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
{
#if CONFIG_CSR
	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
		return EPERM;
	}
#endif /* CONFIG_CSR */
	return sysctl_handle_int(oidp, arg1, arg2, req);
}

SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
#endif

/* These log rate throttling state variables aren't thread safe, but
 * are sufficient unto the task.
 */
static int64_t last_unnest_log_time = 0;
static int shared_region_unnest_log_count = 0;

void
log_unnest_badness(
	vm_map_t        m,
	vm_map_offset_t s,
	vm_map_offset_t e,
	boolean_t       is_nested_map,
	vm_map_offset_t lowest_unnestable_addr)
{
	struct timeval  tv;

	if (shared_region_unnest_logging == 0) {
		return;
	}

	if (shared_region_unnest_logging <= 2 &&
	    is_nested_map &&
	    s >= lowest_unnestable_addr) {
		/*
		 * Unnesting of writable map entries is fine.
		 */
		return;
	}

	if (shared_region_unnest_logging <= 1) {
		microtime(&tv);
		if ((tv.tv_sec - last_unnest_log_time) <
		    vm_shared_region_unnest_log_interval) {
			if (shared_region_unnest_log_count++ >
			    shared_region_unnest_log_count_threshold) {
				return;
			}
		} else {
			last_unnest_log_time = tv.tv_sec;
			shared_region_unnest_log_count = 0;
		}
	}

	DTRACE_VM4(log_unnest_badness,
	    vm_map_t, m,
	    vm_map_offset_t, s,
	    vm_map_offset_t, e,
	    vm_map_offset_t, lowest_unnestable_addr);
	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
}

uint64_t
vm_purge_filebacked_pagers(void)
{
	uint64_t pages_purged;

	pages_purged = 0;
	pages_purged += apple_protect_pager_purge_all();
	pages_purged += shared_region_pager_purge_all();
	pages_purged += dyld_pager_purge_all();
#if DEVELOPMENT || DEBUG
	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
#endif /* DEVELOPMENT || DEBUG */
	return pages_purged;
}

int
useracc(
	user_addr_ut    addr_u,
	user_size_ut    len_u,
	int             prot)
{
	vm_map_t        map;
	vm_prot_t       vm_prot = VM_PROT_WRITE;

	map = current_map();

	if (prot == B_READ) {
		vm_prot = VM_PROT_READ;
	}

	return vm_map_check_protection(map, addr_u,
	           vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
	           VM_SANITIZE_CALLER_USERACC);
}
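
/*
 * Illustrative use (not part of the original source): kernel code can ask
 * whether a user range is accessible before touching it, e.g.
 *
 *     if (!useracc(uaddr, len, B_READ)) {
 *         return EFAULT;    // range not readable by the current process
 *     }
 *
 * Any prot value other than B_READ is treated as a write-access check.
 */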

#if XNU_PLATFORM_MacOSX
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vslock_sanitize(
	vm_map_t                map,
	user_addr_ut            addr_u,
	user_size_ut            len_u,
	vm_sanitize_caller_t    vm_sanitize_caller,
	vm_map_offset_t        *start,
	vm_map_offset_t        *end,
	vm_map_size_t          *size)
{
	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
	           map,
	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
	           size);
}
#endif /* XNU_PLATFORM_MacOSX */

int
vslock(user_addr_ut addr, user_size_ut len)
{
	kern_return_t kret;

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous macOS behavior for overflows, for binary
	 * compatibility: i.e. return success for overflows without doing
	 * anything. For error compatibility, overflow errors return
	 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr()
	 * converts to KERN_SUCCESS.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(current_map(),
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_wire_kernel(current_map(), addr,
	    vm_sanitize_compute_ut_end(addr, len),
	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
	    VM_KERN_MEMORY_BSD,
	    FALSE);

	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}

int
vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
{
#if FIXME  /* [ */
	pmap_t          pmap;
	vm_page_t       pg;
	vm_map_offset_t vaddr;
	ppnum_t         paddr;
#endif  /* FIXME ] */
	kern_return_t   kret;
	vm_map_t        map;

	map = current_map();

#if FIXME  /* [ */
	if (dirtied) {
		pmap = get_task_pmap(current_task());
		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
		    vaddr += PAGE_SIZE) {
			paddr = pmap_find_phys(pmap, vaddr);
			pg = PHYS_TO_VM_PAGE(paddr);
			vm_page_set_modified(pg);
		}
	}
#endif  /* FIXME ] */
#ifdef  lint
	dirtied++;
#endif  /* lint */

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous macOS behavior for overflows, for binary
	 * compatibility: i.e. return success for overflows without doing
	 * anything. For error compatibility, overflow errors return
	 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr()
	 * converts to KERN_SUCCESS.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(map,
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSUNLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_unwire(map, addr,
	    vm_sanitize_compute_ut_end(addr, len), false);
	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}

int
subyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
suibyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
fubyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &byte, sizeof(char))) {
		return -1;
	}
	return byte;
}

int
fuibyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &(byte), sizeof(char))) {
		return -1;
	}
	return byte;
}

int
suword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/* suiword and fuiword are the same as suword and fuword, respectively */

int
suiword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuiword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/*
 * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
 * fetching and setting of process-sized size_t and pointer values.
 */
int
sulong(user_addr_t addr, int64_t word)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (long)word);
	}
}

int64_t
fulong(user_addr_t addr)
{
	int64_t longword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
			return -1;
		}
		return longword;
	} else {
		return (int64_t)fuiword(addr);
	}
}

int
suulong(user_addr_t addr, uint64_t uword)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (uint32_t)uword);
	}
}

uint64_t
fuulong(user_addr_t addr)
{
	uint64_t ulongword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
			return -1ULL;
		}
		return ulongword;
	} else {
		return (uint64_t)fuiword(addr);
	}
}
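
/*
 * Illustrative use (not part of the original source): callers of this family
 * of store/fetch helpers treat the return value as a fault indicator, e.g.
 *
 *     if (sulong(uaddr, value) != 0) {
 *         return EFAULT;         // copyout to the user address failed
 *     }
 *     int64_t v = fulong(uaddr); // -1 can mean either a fault or a stored -1
 *
 * which is why these helpers only suit data where -1 is not a valid value.
 */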

int
swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
{
	return ENOTSUP;
}

#if defined(SECURE_KERNEL)
static int kern_secure_kernel = 1;
#else
static int kern_secure_kernel = 0;
#endif

SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_trace_level, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
    &shared_region_version, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_persistence, 0, "");

/*
 * shared_region_check_np:
 *
 * This system call is intended for dyld.
 *
 * dyld calls this when any process starts to see if the process's shared
 * region is already set up and ready to use.
 * This call returns the base address of the first mapping in the
 * process's shared region.
 * dyld will then check what's mapped at that address.
 *
 * If the shared region is empty, dyld will then attempt to map the shared
 * cache file in the shared region via the shared_region_map_np() system call.
 *
 * If something's already mapped in the shared region, dyld will check if it
 * matches the shared cache it would like to use for that process.
 * If it matches, everything's ready and the process can proceed and use the
 * shared region.
 * If it doesn't match, dyld will unmap the shared region and map the shared
 * cache into the process's address space via mmap().
 *
 * A NULL pointer argument can be used by dyld to indicate it has unmapped
 * the shared region. We will remove the shared_region reference from the task.
 *
 * ERROR VALUES
 * EINVAL	no shared region
 * ENOMEM	shared region is empty
 * EFAULT	bad address for "start_address"
 */
int
shared_region_check_np(
	__unused struct proc                    *p,
	struct shared_region_check_np_args      *uap,
	__unused int                            *retvalp)
{
	vm_shared_region_t      shared_region;
	mach_vm_offset_t        start_address = 0;
	int                     error = 0;
	kern_return_t           kr;
	task_t                  task = current_task();

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address));

	/*
	 * Special value of start_address used to indicate that map_with_linking() should
	 * no longer be allowed in this process
	 */
	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
		p->p_disallow_map_with_linking = TRUE;
		return 0;
	}

	/* retrieve the current task's shared region */
	shared_region = vm_shared_region_get(task);
	if (shared_region != NULL) {
		/*
		 * A NULL argument is used by dyld to indicate the task
		 * has unmapped its shared region.
		 */
		if (uap->start_address == 0) {
			/* unmap it first */
			vm_shared_region_remove(task, shared_region);
			vm_shared_region_set(task, NULL);
		} else {
			/* retrieve address of its first mapping... */
			kr = vm_shared_region_start_address(shared_region, &start_address, task);
			if (kr != KERN_SUCCESS) {
				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
				    "check_np(0x%llx) "
				    "vm_shared_region_start_address() failed\n",
				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
				    proc_getpid(p), p->p_comm,
				    (uint64_t)uap->start_address));
				error = ENOMEM;
			} else {
#if __has_feature(ptrauth_calls)
				/*
				 * Remap any section of the shared library that
				 * has authenticated pointers into private memory.
				 */
				if (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS) {
					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
					    "check_np(0x%llx) "
					    "vm_shared_region_auth_remap() failed\n",
					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
					    proc_getpid(p), p->p_comm,
					    (uint64_t)uap->start_address));
					error = ENOMEM;
				}
#endif /* __has_feature(ptrauth_calls) */

				/* ... and give it to the caller */
				if (error == 0) {
					error = copyout(&start_address,
					    (user_addr_t) uap->start_address,
					    sizeof(start_address));
					if (error != 0) {
						SHARED_REGION_TRACE_ERROR(
							("shared_region: %p [%d(%s)] "
							"check_np(0x%llx) "
							"copyout(0x%llx) error %d\n",
							(void *)VM_KERNEL_ADDRPERM(current_thread()),
							proc_getpid(p), p->p_comm,
							(uint64_t)uap->start_address, (uint64_t)start_address,
							error));
					}
				}
			}
		}
		vm_shared_region_deallocate(shared_region);
	} else {
		/* no shared region ! */
		error = EINVAL;
	}

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address, (uint64_t)start_address, error));

	return error;
}
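
/*
 * Illustrative caller-side sketch (hypothetical, not part of this file):
 * dyld's libsyscall wrapper passes the address of a 64-bit slot that
 * receives the base of the shared region's first mapping, roughly:
 *
 *     uint64_t base = 0;
 *     if (__shared_region_check_np(&base) == 0) {
 *         // inspect the shared cache header mapped at "base"
 *     }
 */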


static int
shared_region_copyin(
	struct proc  *p,
	user_addr_t  user_addr,
	unsigned int count,
	unsigned int element_size,
	void         *kernel_data)
{
	int             error = 0;
	vm_size_t       size = count * element_size;

	error = copyin(user_addr, kernel_data, size);
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"copyin(0x%llx, %ld) failed (error=%d)\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm,
			(uint64_t)user_addr, (long)size, error));
	}
	return error;
}

/*
 * A reasonable upper limit to prevent overflow of allocation/copyin.
 */
#define _SR_FILE_MAPPINGS_MAX_FILES 256

/* forward declaration */
__attribute__((noinline))
static void shared_region_map_and_slide_cleanup(
	struct proc              *p,
	uint32_t                 files_count,
	struct _sr_file_mappings *sr_file_mappings,
	struct vm_shared_region  *shared_region);

/*
 * Setup part of _shared_region_map_and_slide().
 * It had to be broken out of _shared_region_map_and_slide() to
 * prevent compiler inlining from blowing out the stack.
 */
__attribute__((noinline))
static int
shared_region_map_and_slide_setup(
	struct proc                         *p,
	uint32_t                            files_count,
	struct shared_file_np               *files,
	uint32_t                            mappings_count,
	struct shared_file_mapping_slide_np *mappings,
	struct _sr_file_mappings            **sr_file_mappings,
	struct vm_shared_region             **shared_region_ptr,
	struct vnode                        *rdir_vp)
{
	int                             error = 0;
	struct _sr_file_mappings        *srfmp;
	uint32_t                        mappings_next;
	struct vnode_attr               va;
	off_t                           fs;
#if CONFIG_MACF
	vm_prot_t                       maxprot = VM_PROT_ALL;
#endif
	uint32_t                        i;
	struct vm_shared_region         *shared_region = NULL;
	boolean_t                       is_driverkit = task_is_driver(current_task());

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> map\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm));

	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
		error = E2BIG;
		goto done;
	}
	if (files_count == 0) {
		error = EINVAL;
		goto done;
	}
	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
	    Z_WAITOK | Z_ZERO);
	if (*sr_file_mappings == NULL) {
		error = ENOMEM;
		goto done;
	}
	mappings_next = 0;
	for (i = 0; i < files_count; i++) {
		srfmp = &(*sr_file_mappings)[i];
		srfmp->fd = files[i].sf_fd;
		srfmp->mappings_count = files[i].sf_mappings_count;
		srfmp->mappings = &mappings[mappings_next];
		mappings_next += srfmp->mappings_count;
		if (mappings_next > mappings_count) {
			error = EINVAL;
			goto done;
		}
		srfmp->slide = files[i].sf_slide;
	}

	/* get the process's shared region (setup in vm_map_exec()) */
	shared_region = vm_shared_region_trim_and_get(current_task());
	*shared_region_ptr = shared_region;
	if (shared_region == NULL) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"no shared region\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		error = EINVAL;
		goto done;
	}

	/*
	 * Check that the shared region matches the current root
	 * directory of this process.  If not, deny the mapping to
	 * avoid tainting the shared region with something that
	 * doesn't quite belong in it.
	 */
	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
	if (sr_vnode != NULL ?  rdir_vp != sr_vnode : rdir_vp != rootvnode) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: map(%p) root_dir mismatch\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread())));
		error = EPERM;
		goto done;
	}


	for (srfmp = &(*sr_file_mappings)[0];
	    srfmp < &(*sr_file_mappings)[files_count];
	    srfmp++) {
		if (srfmp->mappings_count == 0) {
			/* no mappings here... */
			continue;
		}

		/*
		 * A file descriptor of -1 is used to indicate that the data
		 * to be put in the shared region for this mapping comes directly
		 * from the process's address space. Ensure we have proper alignments.
		 */
1165 		if (srfmp->fd == -1) {
1166 			/* only allow one mapping per fd */
1167 			if (srfmp->mappings_count > 1) {
1168 				SHARED_REGION_TRACE_ERROR(
1169 					("shared_region: %p [%d(%s)] map data >1 mapping\n",
1170 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1171 					proc_getpid(p), p->p_comm));
1172 				error = EINVAL;
1173 				goto done;
1174 			}
1175 
1176 			/*
1177 			 * The destination address and size must be page aligned.
1178 			 */
1179 			struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
1180 			mach_vm_address_t dest_addr = mapping->sms_address;
1181 			mach_vm_size_t    map_size = mapping->sms_size;
1182 			if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
1183 				SHARED_REGION_TRACE_ERROR(
1184 					("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
1185 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1186 					proc_getpid(p), p->p_comm, dest_addr));
1187 				error = EINVAL;
1188 				goto done;
1189 			}
1190 			if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
1191 				SHARED_REGION_TRACE_ERROR(
1192 					("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
1193 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1194 					proc_getpid(p), p->p_comm, map_size));
1195 				error = EINVAL;
1196 				goto done;
1197 			}
1198 			continue;
1199 		}
1200 
1201 		/* get file structure from file descriptor */
1202 		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
1203 		if (error) {
1204 			SHARED_REGION_TRACE_ERROR(
1205 				("shared_region: %p [%d(%s)] map: "
1206 				"fd=%d lookup failed (error=%d)\n",
1207 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1208 				proc_getpid(p), p->p_comm, srfmp->fd, error));
1209 			goto done;
1210 		}
1211 
1212 		/* we need at least read permission on the file */
1213 		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
1214 			SHARED_REGION_TRACE_ERROR(
1215 				("shared_region: %p [%d(%s)] map: "
1216 				"fd=%d not readable\n",
1217 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1218 				proc_getpid(p), p->p_comm, srfmp->fd));
1219 			error = EPERM;
1220 			goto done;
1221 		}
1222 
1223 		/* get vnode from file structure */
1224 		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
1225 		if (error) {
1226 			SHARED_REGION_TRACE_ERROR(
1227 				("shared_region: %p [%d(%s)] map: "
1228 				"fd=%d getwithref failed (error=%d)\n",
1229 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1230 				proc_getpid(p), p->p_comm, srfmp->fd, error));
1231 			goto done;
1232 		}
1233 		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);
1234 
1235 		/* make sure the vnode is a regular file */
1236 		if (srfmp->vp->v_type != VREG) {
1237 			SHARED_REGION_TRACE_ERROR(
1238 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1239 				"not a file (type=%d)\n",
1240 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1241 				proc_getpid(p), p->p_comm,
1242 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1243 				srfmp->vp->v_name, srfmp->vp->v_type));
1244 			error = EINVAL;
1245 			goto done;
1246 		}
1247 
1248 #if CONFIG_MACF
1249 		/* pass in 0 for the offset argument because AMFI does not need the offset
1250 		 *       of the shared cache */
1251 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
1252 		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
1253 		if (error) {
1254 			goto done;
1255 		}
1256 #endif /* MAC */
1257 
1258 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1259 		/*
1260 		 * Check if the shared cache is in the trust cache;
1261 		 * if so, we can skip the root ownership check.
1262 		 */
1263 #if DEVELOPMENT || DEBUG
1264 		/*
1265 		 * Skip both root ownership and trust cache check if
1266 		 * enforcement is disabled.
1267 		 */
1268 		if (!cs_system_enforcement()) {
1269 			goto after_root_check;
1270 		}
1271 #endif /* DEVELOPMENT || DEBUG */
1272 		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
1273 		if (blob == NULL) {
1274 			SHARED_REGION_TRACE_ERROR(
1275 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1276 				"missing CS blob\n",
1277 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1278 				proc_getpid(p), p->p_comm,
1279 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1280 				srfmp->vp->v_name));
1281 			goto root_check;
1282 		}
1283 		const uint8_t *cdhash = csblob_get_cdhash(blob);
1284 		if (cdhash == NULL) {
1285 			SHARED_REGION_TRACE_ERROR(
1286 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1287 				"missing cdhash\n",
1288 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1289 				proc_getpid(p), p->p_comm,
1290 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1291 				srfmp->vp->v_name));
1292 			goto root_check;
1293 		}
1294 
1295 		bool in_trust_cache = false;
1296 		TrustCacheQueryToken_t qt;
1297 		if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
1298 			TCType_t tc_type = kTCTypeInvalid;
1299 			TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
1300 			in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
1301 			    (tc_type == kTCTypeCryptex1BootOS ||
1302 			    tc_type == kTCTypeStatic ||
1303 			    tc_type == kTCTypeEngineering));
1304 		}
1305 		if (!in_trust_cache) {
1306 			SHARED_REGION_TRACE_ERROR(
1307 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1308 				"not in trust cache\n",
1309 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1310 				proc_getpid(p), p->p_comm,
1311 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1312 				srfmp->vp->v_name));
1313 			goto root_check;
1314 		}
1315 		goto after_root_check;
1316 root_check:
1317 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1318 
1319 		/* The shared cache file must be owned by root */
1320 		VATTR_INIT(&va);
1321 		VATTR_WANTED(&va, va_uid);
1322 		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1323 		if (error) {
1324 			SHARED_REGION_TRACE_ERROR(
1325 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1326 				"vnode_getattr(%p) failed (error=%d)\n",
1327 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1328 				proc_getpid(p), p->p_comm,
1329 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1330 				srfmp->vp->v_name,
1331 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1332 				error));
1333 			goto done;
1334 		}
1335 		if (va.va_uid != 0) {
1336 			SHARED_REGION_TRACE_ERROR(
1337 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1338 				"owned by uid=%d instead of 0\n",
1339 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1340 				proc_getpid(p), p->p_comm,
1341 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1342 				srfmp->vp->v_name, va.va_uid));
1343 			error = EPERM;
1344 			goto done;
1345 		}
1346 
1347 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1348 after_root_check:
1349 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1350 
1351 #if CONFIG_CSR
1352 		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
1353 			VATTR_INIT(&va);
1354 			VATTR_WANTED(&va, va_flags);
1355 			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1356 			if (error) {
1357 				SHARED_REGION_TRACE_ERROR(
1358 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1359 					"vnode_getattr(%p) failed (error=%d)\n",
1360 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1361 					proc_getpid(p), p->p_comm,
1362 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1363 					srfmp->vp->v_name,
1364 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1365 					error));
1366 				goto done;
1367 			}
1368 
1369 			if (!(va.va_flags & SF_RESTRICTED)) {
1370 				/*
1371 				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
1372 				 * the shared cache file is NOT SIP-protected, so reject the
1373 				 * mapping request
1374 				 */
1375 				SHARED_REGION_TRACE_ERROR(
1376 					("shared_region: %p [%d(%s)] map(%p:'%s'), "
1377 					"vnode is not SIP-protected. \n",
1378 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1379 					proc_getpid(p), p->p_comm,
1380 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1381 					srfmp->vp->v_name));
1382 				error = EPERM;
1383 				goto done;
1384 			}
1385 		}
1386 #else /* CONFIG_CSR */
1387 
1388 		/*
1389 		 * Devices without SIP/ROSP need to make sure that the shared cache
1390 		 * is either on the root volume or in the preboot cryptex volume.
1391 		 */
1392 		assert(rdir_vp != NULL);
1393 		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
1394 			vnode_t preboot_vp = NULL;
1395 #if XNU_TARGET_OS_OSX
1396 #define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
1397 #else
1398 #define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
1399 #endif
1400 			error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
1401 			if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
1402 				SHARED_REGION_TRACE_ERROR(
1403 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1404 					"not on process' root volume nor preboot volume\n",
1405 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1406 					proc_getpid(p), p->p_comm,
1407 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1408 					srfmp->vp->v_name));
1409 				error = EPERM;
1410 				if (preboot_vp) {
1411 					(void)vnode_put(preboot_vp);
1412 				}
1413 				goto done;
1414 			} else if (preboot_vp) {
1415 				(void)vnode_put(preboot_vp);
1416 			}
1417 		}
1418 #endif /* CONFIG_CSR */
1419 
1420 		if (scdir_enforce) {
1421 			char **expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1422 			struct vnode *scdir_vp = NULL;
1423 			for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1424 			    *expected_scdir_path != NULL;
1425 			    expected_scdir_path++) {
1426 				/* get vnode for expected_scdir_path */
1427 				error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
1428 				if (error) {
1429 					SHARED_REGION_TRACE_ERROR(
1430 						("shared_region: %p [%d(%s)]: "
1431 						"vnode_lookup(%s) failed (error=%d)\n",
1432 						(void *)VM_KERNEL_ADDRPERM(current_thread()),
1433 						proc_getpid(p), p->p_comm,
1434 						*expected_scdir_path, error));
1435 					continue;
1436 				}
1437 
1438 				/* check if parent is scdir_vp */
1439 				assert(scdir_vp != NULL);
1440 				if (vnode_parent(srfmp->vp) == scdir_vp) {
1441 					(void)vnode_put(scdir_vp);
1442 					scdir_vp = NULL;
1443 					goto scdir_ok;
1444 				}
1445 				(void)vnode_put(scdir_vp);
1446 				scdir_vp = NULL;
1447 			}
1448 			/* nothing matches */
1449 			SHARED_REGION_TRACE_ERROR(
1450 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1451 				"shared cache file not in expected directory\n",
1452 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1453 				proc_getpid(p), p->p_comm,
1454 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1455 				srfmp->vp->v_name));
1456 			error = EPERM;
1457 			goto done;
1458 		}
1459 scdir_ok:
1460 
1461 		/* get vnode size */
1462 		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
1463 		if (error) {
1464 			SHARED_REGION_TRACE_ERROR(
1465 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1466 				"vnode_size(%p) failed (error=%d)\n",
1467 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1468 				proc_getpid(p), p->p_comm,
1469 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1470 				srfmp->vp->v_name,
1471 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
1472 			goto done;
1473 		}
1474 		srfmp->file_size = fs;
1475 
1476 		/* get the file's memory object handle */
1477 		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
1478 		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
1479 			SHARED_REGION_TRACE_ERROR(
1480 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1481 				"no memory object\n",
1482 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1483 				proc_getpid(p), p->p_comm,
1484 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1485 				srfmp->vp->v_name));
1486 			error = EINVAL;
1487 			goto done;
1488 		}
1489 
1490 		/* check that the mappings are properly covered by code signatures */
1491 		if (!cs_system_enforcement()) {
1492 			/* code signing is not enforced: no need to check */
1493 		} else {
1494 			for (i = 0; i < srfmp->mappings_count; i++) {
1495 				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
1496 					/* zero-filled mapping: not backed by the file */
1497 					continue;
1498 				}
1499 				if (ubc_cs_is_range_codesigned(srfmp->vp,
1500 				    srfmp->mappings[i].sms_file_offset,
1501 				    srfmp->mappings[i].sms_size)) {
1502 					/* this mapping is fully covered by code signatures */
1503 					continue;
1504 				}
1505 				SHARED_REGION_TRACE_ERROR(
1506 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1507 					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
1508 					"is not code-signed\n",
1509 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1510 					proc_getpid(p), p->p_comm,
1511 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1512 					srfmp->vp->v_name,
1513 					i, srfmp->mappings_count,
1514 					srfmp->mappings[i].sms_address,
1515 					srfmp->mappings[i].sms_size,
1516 					srfmp->mappings[i].sms_file_offset,
1517 					srfmp->mappings[i].sms_max_prot,
1518 					srfmp->mappings[i].sms_init_prot));
1519 				error = EINVAL;
1520 				goto done;
1521 			}
1522 		}
1523 	}
1524 done:
1525 	if (error != 0) {
1526 		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
1527 		*sr_file_mappings = NULL;
1528 		*shared_region_ptr = NULL;
1529 	}
1530 	return error;
1531 }
1532 
1533 /*
1534  * shared_region_map_np()
1535  *
1536  * This system call is intended for dyld.
1537  *
1538  * dyld uses this to map a shared cache file into a shared region.
1539  * This is usually done only the first time a shared cache is needed.
1540  * Subsequent processes will just use the populated shared region without
1541  * requiring any further setup.
1542  */
1543 static int
_shared_region_map_and_slide(struct proc * p,uint32_t files_count,struct shared_file_np * files,uint32_t mappings_count,struct shared_file_mapping_slide_np * mappings)1544 _shared_region_map_and_slide(
1545 	struct proc                         *p,
1546 	uint32_t                            files_count,
1547 	struct shared_file_np               *files,
1548 	uint32_t                            mappings_count,
1549 	struct shared_file_mapping_slide_np *mappings)
1550 {
1551 	int                             error = 0;
1552 	kern_return_t                   kr = KERN_SUCCESS;
1553 	struct _sr_file_mappings        *sr_file_mappings = NULL;
1554 	struct vnode                    *rdir_vp = NULL;
1555 	struct vm_shared_region         *shared_region = NULL;
1556 
1557 	/*
1558 	 * Get a reference to the current proc's root dir.
1559 	 * Need this to prevent racing with chroot.
1560 	 */
1561 	proc_fdlock(p);
1562 	rdir_vp = p->p_fd.fd_rdir;
1563 	if (rdir_vp == NULL) {
1564 		rdir_vp = rootvnode;
1565 	}
1566 	assert(rdir_vp != NULL);
1567 	vnode_get(rdir_vp);
1568 	proc_fdunlock(p);
1569 
1570 	/*
1571 	 * Turn files, mappings into sr_file_mappings and other setup.
1572 	 */
1573 	error = shared_region_map_and_slide_setup(p, files_count,
1574 	    files, mappings_count, mappings,
1575 	    &sr_file_mappings, &shared_region, rdir_vp);
1576 	if (error != 0) {
1577 		vnode_put(rdir_vp);
1578 		return error;
1579 	}
1580 
1581 	/* map the file(s) into that shared region's submap */
1582 	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
1583 	if (kr != KERN_SUCCESS) {
1584 		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
1585 		    "vm_shared_region_map_file() failed kr=0x%x\n",
1586 		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1587 		    proc_getpid(p), p->p_comm, kr));
1588 	}
1589 
1590 	/* convert kern_return_t to errno */
1591 	switch (kr) {
1592 	case KERN_SUCCESS:
1593 		error = 0;
1594 		break;
1595 	case KERN_INVALID_ADDRESS:
1596 		error = EFAULT;
1597 		break;
1598 	case KERN_PROTECTION_FAILURE:
1599 		error = EPERM;
1600 		break;
1601 	case KERN_NO_SPACE:
1602 		error = ENOMEM;
1603 		break;
1604 	case KERN_FAILURE:
1605 	case KERN_INVALID_ARGUMENT:
1606 	default:
1607 		error = EINVAL;
1608 		break;
1609 	}
1610 
1611 	/*
1612 	 * Mark that this process is now using split libraries.
1613 	 */
1614 	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
1615 		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
1616 	}
1617 
1618 	vnode_put(rdir_vp);
1619 	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);
1620 
1621 	SHARED_REGION_TRACE_DEBUG(
1622 		("shared_region: %p [%d(%s)] <- map\n",
1623 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1624 		proc_getpid(p), p->p_comm));
1625 
1626 	return error;
1627 }
1628 
1629 /*
1630  * Clean up part of _shared_region_map_and_slide().
1631  * It had to be broken out of _shared_region_map_and_slide() to
1632  * prevent compiler inlining from blowing out the stack.
1633  */
1634 __attribute__((noinline))
1635 static void
1636 shared_region_map_and_slide_cleanup(
1637 	struct proc              *p,
1638 	uint32_t                 files_count,
1639 	struct _sr_file_mappings *sr_file_mappings,
1640 	struct vm_shared_region  *shared_region)
1641 {
1642 	struct _sr_file_mappings *srfmp;
1643 	struct vnode_attr        va;
1644 
1645 	if (sr_file_mappings != NULL) {
1646 		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1647 			if (srfmp->vp != NULL) {
1648 				vnode_lock_spin(srfmp->vp);
1649 				srfmp->vp->v_flag |= VSHARED_DYLD;
1650 				vnode_unlock(srfmp->vp);
1651 
1652 				/* update the vnode's access time */
1653 				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1654 					VATTR_INIT(&va);
1655 					nanotime(&va.va_access_time);
1656 					VATTR_SET_ACTIVE(&va, va_access_time);
1657 					vnode_setattr(srfmp->vp, &va, vfs_context_current());
1658 				}
1659 
1660 #if NAMEDSTREAMS
1661 				/*
1662 				 * If the shared cache is compressed, it may
1663 				 * have a namedstream vnode instantiated
1664 				 * for it. That namedstream vnode will also
1665 				 * have to be marked with VSHARED_DYLD.
1666 				 */
1667 				if (vnode_hasnamedstreams(srfmp->vp)) {
1668 					vnode_t svp;
1669 					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1670 					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
1671 						vnode_lock_spin(svp);
1672 						svp->v_flag |= VSHARED_DYLD;
1673 						vnode_unlock(svp);
1674 						vnode_put(svp);
1675 					}
1676 				}
1677 #endif /* NAMEDSTREAMS */
1678 				/*
1679 				 * release the vnode...
1680 				 * ubc_map() still holds it for us in the non-error case
1681 				 */
1682 				(void) vnode_put(srfmp->vp);
1683 				srfmp->vp = NULL;
1684 			}
1685 			if (srfmp->fp != NULL) {
1686 				/* release the file descriptor */
1687 				fp_drop(p, srfmp->fd, srfmp->fp, 0);
1688 				srfmp->fp = NULL;
1689 			}
1690 		}
1691 		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1692 	}
1693 
1694 	if (shared_region != NULL) {
1695 		vm_shared_region_deallocate(shared_region);
1696 	}
1697 }
1698 
1699 /*
1700  * For each file mapped, we may have mappings for:
1701  *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1702  * so let's round up to 8 mappings per file.
1703  */
1704 #define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to pass in */
1705 
1706 /*
1707  * This is the new interface for setting up shared region mappings.
1708  *
1709  * The slide used for shared regions setup using this interface is done differently
1710  * from the old interface. The slide value passed in the shared_files_np represents
1711  * a max value. The kernel will choose a random value based on that, then use it
1712  * for all shared regions.
1713  */
1714 #if defined (__x86_64__)
1715 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1716 #else
1717 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1718 #endif
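/*
 * A worked example of the slide selection performed below (illustrative
 * values). On arm64, SLIDE_AMOUNT_MASK is ~SIXTEENK_PAGE_MASK (~0x3FFF), so
 * the chosen slide is truncated to a 16KB page boundary:
 *
 *	uint32_t max_slide  = 0x1000000;               // from shared_files[0].sf_slide
 *	uint32_t random_val = 0x12345678;              // from read_random()
 *	uint32_t slide      = (random_val % max_slide) // 0x00345678
 *	    & ~0x3FFFu;                                // -> 0x00344000, 16KB aligned
 */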
1719 
1720 static inline __result_use_check kern_return_t
1721 shared_region_map_and_slide_2_np_sanitize(
1722 	struct proc                         *p,
1723 	user_addr_t                         mappings_userspace_addr,
1724 	unsigned int                        count,
1725 	shared_file_mapping_slide_np_t      *mappings)
1726 {
1727 	kern_return_t kr;
1728 	vm_map_t map = current_map();
1729 	mach_vm_address_t addr, end;
1730 	mach_vm_offset_t offset, offset_end;
1731 	mach_vm_size_t size, offset_size;
1732 	user_addr_t slide_start, slide_end, slide_size;
1733 	vm_prot_t cur;
1734 	vm_prot_t max;
1735 
1736 	user_addr_t user_addr = mappings_userspace_addr;
1737 
1738 	for (size_t i = 0; i < count; i++) {
1739 		shared_file_mapping_slide_np_ut mapping_u;
1740 		/*
1741 		 * First we bring each mapping struct into our kernel stack to
1742 		 * avoid TOCTOU.
1743 		 */
1744 		kr = shared_region_copyin(
1745 			p,
1746 			user_addr,
1747 			1, // copy 1 element at a time
1748 			sizeof(shared_file_mapping_slide_np_ut),
1749 			&mapping_u);
1750 		if (__improbable(kr != KERN_SUCCESS)) {
1751 			return kr;
1752 		}
1753 
1754 		/*
1755 		 * Then, we sanitize the data on the kernel stack.
1756 		 */
1757 		kr = vm_sanitize_addr_size(
1758 			mapping_u.sms_address_u,
1759 			mapping_u.sms_size_u,
1760 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1761 			map,
1762 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1763 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
1764 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
1765 			&addr,
1766 			&end,
1767 			&size);
1768 		if (__improbable(kr != KERN_SUCCESS)) {
1769 			return kr;
1770 		}
1771 
1772 		kr = vm_sanitize_addr_size(
1773 			mapping_u.sms_file_offset_u,
1774 			mapping_u.sms_size_u,
1775 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1776 			PAGE_MASK,
1777 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1778 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1779 			&offset,
1780 			&offset_end,
1781 			&offset_size);
1782 		if (__improbable(kr != KERN_SUCCESS)) {
1783 			return kr;
1784 		}
1785 		if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
1786 			return KERN_INVALID_ARGUMENT;
1787 		}
1788 
1789 		/*
1790 		 * Unsafe access is immediately followed by wrap to
1791 		 * convert from addr to size.
1792 		 */
1793 		mach_vm_size_ut sms_slide_size_u =
1794 		    vm_sanitize_wrap_size(
1795 			VM_SANITIZE_UNSAFE_UNWRAP(
1796 				mapping_u.sms_slide_size_u));
1797 
1798 		kr = vm_sanitize_addr_size(
1799 			mapping_u.sms_slide_start_u,
1800 			sms_slide_size_u,
1801 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1802 			map,
1803 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1804 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1805 			&slide_start,
1806 			&slide_end,
1807 			&slide_size);
1808 		if (__improbable(kr != KERN_SUCCESS)) {
1809 			return kr;
1810 		}
1811 
1812 		kr = vm_sanitize_cur_and_max_prots(
1813 			mapping_u.sms_init_prot_u,
1814 			mapping_u.sms_max_prot_u,
1815 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1816 			map,
1817 			VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
1818 			&cur,
1819 			&max);
1820 		if (__improbable(kr != KERN_SUCCESS)) {
1821 			return kr;
1822 		}
1823 
1824 		/*
1825 		 * Finally, we move the data from the kernel stack to our
1826 		 * caller-allocated kernel heap buffer.
1827 		 */
1828 		mappings[i].sms_address = addr;
1829 		mappings[i].sms_size = size;
1830 		mappings[i].sms_file_offset = offset;
1831 		mappings[i].sms_slide_size = slide_size;
1832 		mappings[i].sms_slide_start = slide_start;
1833 		mappings[i].sms_max_prot = max;
1834 		mappings[i].sms_init_prot = cur;
1835 
1836 		if (__improbable(os_add_overflow(
1837 			    user_addr,
1838 			    sizeof(shared_file_mapping_slide_np_ut),
1839 			    &user_addr))) {
1840 			return KERN_INVALID_ARGUMENT;
1841 		}
1842 	}
1843 
1844 	return KERN_SUCCESS;
1845 }
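/*
 * The sanitizer above follows a strict copy-then-validate discipline: each
 * element is copied to the kernel stack first, and only that kernel-side copy
 * is checked and used, so userspace cannot flip a value between check and use
 * (TOCTOU). A minimal generic sketch of the pattern, with hypothetical names
 * (struct elem, elem_is_valid), not the kernel's actual API; the real loop
 * above additionally guards the pointer arithmetic with os_add_overflow():
 *
 *	static int
 *	copy_and_check(user_addr_t uaddr, size_t count, struct elem *out)
 *	{
 *		for (size_t i = 0; i < count; i++) {
 *			struct elem tmp;                     // kernel stack copy
 *			if (copyin(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
 *				return EFAULT;
 *			}
 *			if (!elem_is_valid(&tmp)) {          // validate the copy only
 *				return EINVAL;
 *			}
 *			out[i] = tmp;                        // use the validated copy
 *		}
 *		return 0;
 *	}
 */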
1846 
1847 int
1848 shared_region_map_and_slide_2_np(
1849 	struct proc                                  *p,
1850 	struct shared_region_map_and_slide_2_np_args *uap,
1851 	__unused int                                 *retvalp)
1852 {
1853 	unsigned int                  files_count;
1854 	struct shared_file_np         *shared_files = NULL;
1855 	unsigned int                  mappings_count;
1856 	struct shared_file_mapping_slide_np *mappings = NULL;
1857 	kern_return_t                 kr = KERN_SUCCESS;
1858 
1859 	files_count = uap->files_count;
1860 	mappings_count = uap->mappings_count;
1861 
1862 	if (files_count == 0) {
1863 		SHARED_REGION_TRACE_INFO(
1864 			("shared_region: %p [%d(%s)] map(): "
1865 			"no files\n",
1866 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1867 			proc_getpid(p), p->p_comm));
1868 		kr = 0; /* no files to map: we're done ! */
1869 		goto done;
1870 	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1871 		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1872 		if (shared_files == NULL) {
1873 			kr = KERN_RESOURCE_SHORTAGE;
1874 			goto done;
1875 		}
1876 	} else {
1877 		SHARED_REGION_TRACE_ERROR(
1878 			("shared_region: %p [%d(%s)] map(): "
1879 			"too many files (%d) max %d\n",
1880 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1881 			proc_getpid(p), p->p_comm,
1882 			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1883 		kr = KERN_FAILURE;
1884 		goto done;
1885 	}
1886 
1887 	if (mappings_count == 0) {
1888 		SHARED_REGION_TRACE_INFO(
1889 			("shared_region: %p [%d(%s)] map(): "
1890 			"no mappings\n",
1891 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1892 			proc_getpid(p), p->p_comm));
1893 		kr = 0; /* no mappings: we're done ! */
1894 		goto done;
1895 	} else if (mappings_count <= SFM_MAX) {
1896 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
1897 		if (mappings == NULL) {
1898 			kr = KERN_RESOURCE_SHORTAGE;
1899 			goto done;
1900 		}
1901 	} else {
1902 		SHARED_REGION_TRACE_ERROR(
1903 			("shared_region: %p [%d(%s)] map(): "
1904 			"too many mappings (%d) max %d\n",
1905 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1906 			proc_getpid(p), p->p_comm,
1907 			mappings_count, SFM_MAX));
1908 		kr = KERN_FAILURE;
1909 		goto done;
1910 	}
1911 
1912 	/*
1913 	 * struct shared_file_np does not have fields that are subject to
1914 	 * sanitization, so it is copied from userspace as-is.
1915 	 */
1916 	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
1917 	if (kr != KERN_SUCCESS) {
1918 		goto done;
1919 	}
1920 
1921 	kr = shared_region_map_and_slide_2_np_sanitize(
1922 		p,
1923 		uap->mappings_u,
1924 		mappings_count,
1925 		mappings);
1926 	if (__improbable(kr != KERN_SUCCESS)) {
1927 		kr = vm_sanitize_get_kr(kr);
1928 		goto done;
1929 	}
1930 
1931 	uint32_t max_slide = shared_files[0].sf_slide;
1932 	uint32_t random_val;
1933 	uint32_t slide_amount;
1934 
1935 	if (max_slide != 0) {
1936 		read_random(&random_val, sizeof random_val);
1937 		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
1938 	} else {
1939 		slide_amount = 0;
1940 	}
1941 #if DEVELOPMENT || DEBUG
1942 	extern bool bootarg_disable_aslr;
1943 	if (bootarg_disable_aslr) {
1944 		slide_amount = 0;
1945 	}
1946 #endif /* DEVELOPMENT || DEBUG */
1947 
1948 	/*
1949 	 * Fix up the mappings to reflect the desired slide.
1950 	 */
1951 	unsigned int f;
1952 	unsigned int m = 0;
1953 	unsigned int i;
1954 	for (f = 0; f < files_count; ++f) {
1955 		shared_files[f].sf_slide = slide_amount;
1956 		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
1957 			if (m >= mappings_count) {
1958 				SHARED_REGION_TRACE_ERROR(
1959 					("shared_region: %p [%d(%s)] map(): "
1960 					"mapping count argument was too small\n",
1961 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1962 					proc_getpid(p), p->p_comm));
1963 				kr = KERN_FAILURE;
1964 				goto done;
1965 			}
1966 			if (__improbable(
1967 				    os_add_overflow(
1968 					    mappings[m].sms_address,
1969 					    slide_amount,
1970 					    &mappings[m].sms_address))) {
1971 				kr = KERN_INVALID_ARGUMENT;
1972 				goto done;
1973 			}
1974 			if (mappings[m].sms_slide_size != 0) {
1975 				mach_vm_address_t discard;
1976 				/* Slide and check that new start/size pairs do not overflow. */
1977 				if (__improbable(
1978 					    os_add_overflow(
1979 						    mappings[m].sms_slide_start,
1980 						    slide_amount,
1981 						    &mappings[m].sms_slide_start) ||
1982 					    os_add_overflow(
1983 						    mappings[m].sms_slide_start,
1984 						    mappings[m].sms_slide_size,
1985 						    &discard))) {
1986 					kr = KERN_INVALID_ARGUMENT;
1987 					goto done;
1988 				}
1989 			}
1990 		}
1991 	}
1992 
1993 	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
1994 done:
1995 	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
1996 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
1997 	return kr;
1998 }
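/*
 * Note the fix-up loop above never performs an unchecked addition when
 * sliding addresses: every 64-bit add funnels through os_add_overflow(),
 * which reports wrap-around instead of silently truncating. The idiom:
 *
 *	#include <os/overflow.h>
 *
 *	uint64_t slid;
 *	if (os_add_overflow(base_addr, slide_amount, &slid)) {
 *		return KERN_INVALID_ARGUMENT;  // 2^64 wrap rejected
 *	}
 *	base_addr = slid;                      // safe to use
 */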
1999 
2000 /*
2001  * A syscall for dyld to use to map data pages that need load time relocation fixups.
2002  * The fixups are performed by a custom pager during page-in, so the pages still appear
2003  * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2004  * on demand later, all w/o using the compressor.
2005  *
2006  * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages while
2007  * running, they are COW'd as normal.
2008  */
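/*
 * A hedged sketch of how dyld might invoke this. The __map_with_linking_np
 * wrapper name and the exact link_info layout are assumptions; the struct
 * fields shown are the ones validated by the checks below:
 *
 *	struct mwl_region region = {
 *		.mwlr_fd          = fd,                           // one fd for all regions
 *		.mwlr_protections = VM_PROT_READ | VM_PROT_WRITE, // data only: no EXECUTE, no ZF
 *		.mwlr_file_offset = data_file_offset,             // page aligned
 *		.mwlr_size        = data_size,
 *	};
 *	struct mwl_info_hdr *hdr = (struct mwl_info_hdr *)link_info;
 *	hdr->mwli_version   = MWL_INFO_VERS;
 *	hdr->mwli_page_size = (uint32_t)vm_page_size;         // must equal kernel PAGE_SIZE
 *	// binds and chained-fixup starts follow the header within link_info_size bytes
 *	int ret = __map_with_linking_np(&region, 1, link_info, link_info_size); // assumed
 */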
2009 int
2010 map_with_linking_np(
2011 	struct proc                     *p,
2012 	struct map_with_linking_np_args *uap,
2013 	__unused int                    *retvalp)
2014 {
2015 	uint32_t                        region_count;
2016 	uint32_t                        r;
2017 	struct mwl_region               *regions = NULL;
2018 	struct mwl_region               *rp;
2019 	uint32_t                        link_info_size;
2020 	void                            *link_info = NULL;      /* starts with a struct mwl_info_hdr */
2021 	struct mwl_info_hdr             *info_hdr = NULL;
2022 	uint64_t                        binds_size;
2023 	int                             fd;
2024 	struct fileproc                 *fp = NULL;
2025 	struct vnode                    *vp = NULL;
2026 	size_t                          file_size;
2027 	off_t                           fs;
2028 	struct vnode_attr               va;
2029 	memory_object_control_t         file_control = NULL;
2030 	int                             error;
2031 	kern_return_t                   kr = KERN_SUCCESS;
2032 
2033 	/*
2034 	 * Check if dyld has told us it finished with this call.
2035 	 */
2036 	if (p->p_disallow_map_with_linking) {
2037 		printf("%s: [%d(%s)]: map__with_linking() was disabled\n",
2038 		    __func__, proc_getpid(p), p->p_comm);
2039 		kr = KERN_FAILURE;
2040 		goto done;
2041 	}
2042 
2043 	/*
2044 	 * First we do some sanity checking on what dyld has passed us.
2045 	 */
2046 	region_count = uap->region_count;
2047 	link_info_size = uap->link_info_size;
2048 	if (region_count == 0) {
2049 		printf("%s: [%d(%s)]: region_count == 0\n",
2050 		    __func__, proc_getpid(p), p->p_comm);
2051 		kr = KERN_FAILURE;
2052 		goto done;
2053 	}
2054 	if (region_count > MWL_MAX_REGION_COUNT) {
2055 		printf("%s: [%d(%s)]: region_count too big %d\n",
2056 		    __func__, proc_getpid(p), p->p_comm, region_count);
2057 		kr = KERN_FAILURE;
2058 		goto done;
2059 	}
2060 
2061 	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2062 		printf("%s: [%d(%s)]: link_info_size too small\n",
2063 		    __func__, proc_getpid(p), p->p_comm);
2064 		kr = KERN_FAILURE;
2065 		goto done;
2066 	}
2067 	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2068 		printf("%s: [%d(%s)]: link_info_size too big %d\n",
2069 		    __func__, proc_getpid(p), p->p_comm, link_info_size);
2070 		kr = KERN_FAILURE;
2071 		goto done;
2072 	}
2073 
2074 	/*
2075 	 * Allocate and copyin the regions and link info
2076 	 */
2077 	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2078 	if (regions == NULL) {
2079 		printf("%s: [%d(%s)]: failed to allocate regions\n",
2080 		    __func__, proc_getpid(p), p->p_comm);
2081 		kr = KERN_RESOURCE_SHORTAGE;
2082 		goto done;
2083 	}
2084 	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2085 	if (kr != KERN_SUCCESS) {
2086 		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2087 		    __func__, proc_getpid(p), p->p_comm, kr);
2088 		goto done;
2089 	}
2090 
2091 	link_info = kalloc_data(link_info_size, Z_WAITOK);
2092 	if (link_info == NULL) {
2093 		printf("%s: [%d(%s)]: failed to allocate link_info\n",
2094 		    __func__, proc_getpid(p), p->p_comm);
2095 		kr = KERN_RESOURCE_SHORTAGE;
2096 		goto done;
2097 	}
2098 	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2099 	if (kr != KERN_SUCCESS) {
2100 		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2101 		    __func__, proc_getpid(p), p->p_comm, kr);
2102 		goto done;
2103 	}
2104 
2105 	/*
2106 	 * Do some verification the data structures.
2107 	 */
2108 	info_hdr = (struct mwl_info_hdr *)link_info;
2109 	if (info_hdr->mwli_version != MWL_INFO_VERS) {
2110 		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2111 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2112 		kr = KERN_FAILURE;
2113 		goto done;
2114 	}
2115 
2116 	if (info_hdr->mwli_binds_offset > link_info_size) {
2117 		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2118 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2119 		kr = KERN_FAILURE;
2120 		goto done;
2121 	}
2122 
2123 	/* some older devices have s/w page size > h/w page size, no need to support them */
2124 	if (info_hdr->mwli_page_size != PAGE_SIZE) {
2125 		/* no printf, since this is expected on some devices */
2126 		kr = KERN_INVALID_ARGUMENT;
2127 		goto done;
2128 	}
2129 
2130 	binds_size = (uint64_t)info_hdr->mwli_binds_count *
2131 	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2132 	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2133 		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2134 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2135 		kr = KERN_FAILURE;
2136 		goto done;
2137 	}
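	/*
	 * The subtraction above cannot underflow: mwli_binds_offset was
	 * bounds-checked against link_info_size earlier. The chains range
	 * below relies on the same check ordering.
	 */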
2138 
2139 	if (info_hdr->mwli_chains_offset > link_info_size) {
2140 		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2141 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2142 		kr = KERN_FAILURE;
2143 		goto done;
2144 	}
2145 
2146 
2147 	/*
2148 	 * Ensure the chained-starts data fits in the link info and make
2149 	 * sure the segment info offsets are within bounds.
2150 	 */
2151 	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2152 		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2153 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2154 		kr = KERN_FAILURE;
2155 		goto done;
2156 	}
2157 	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2158 		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2159 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2160 		kr = KERN_FAILURE;
2161 		goto done;
2162 	}
2163 
2164 	/* Note that more verification of offsets is done in the pager itself */
2165 
2166 	/*
2167 	 * Ensure we've only been given one FD and verify valid protections.
2168 	 */
2169 	fd = regions[0].mwlr_fd;
2170 	for (r = 0; r < region_count; ++r) {
2171 		if (regions[r].mwlr_fd != fd) {
2172 			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2173 			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2174 			kr = KERN_FAILURE;
2175 			goto done;
2176 		}
2177 
2178 		/*
2179 		 * Only allow data mappings, and no zero fill. Permit TPRO
2180 		 * mappings only in combination with VM_PROT_WRITE.
2181 		 */
2182 		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2183 			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2184 			    __func__, proc_getpid(p), p->p_comm);
2185 			kr = KERN_FAILURE;
2186 			goto done;
2187 		}
2188 		if (regions[r].mwlr_protections & VM_PROT_ZF) {
2189 			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2190 			    __func__, proc_getpid(p), p->p_comm, r);
2191 			kr = KERN_FAILURE;
2192 			goto done;
2193 		}
2194 		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2195 		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2196 			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2197 			    __func__, proc_getpid(p), p->p_comm, r);
2198 			kr = KERN_FAILURE;
2199 			goto done;
2200 		}
2201 	}
2202 
2203 
2204 	/* get file structure from file descriptor */
2205 	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2206 	if (error) {
2207 		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2208 		    __func__, proc_getpid(p), p->p_comm, error);
2209 		kr = KERN_FAILURE;
2210 		goto done;
2211 	}
2212 
2213 	/* We need at least read permission on the file */
2214 	if (!(fp->fp_glob->fg_flag & FREAD)) {
2215 		printf("%s: [%d(%s)]: not readable\n",
2216 		    __func__, proc_getpid(p), p->p_comm);
2217 		kr = KERN_FAILURE;
2218 		goto done;
2219 	}
2220 
2221 	/* Get the vnode from file structure */
2222 	vp = (struct vnode *)fp_get_data(fp);
2223 	error = vnode_getwithref(vp);
2224 	if (error) {
2225 		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2226 		    __func__, proc_getpid(p), p->p_comm, error);
2227 		kr = KERN_FAILURE;
2228 		vp = NULL; /* just to be sure */
2229 		goto done;
2230 	}
2231 
2232 	/* Make sure the vnode is a regular file */
2233 	if (vp->v_type != VREG) {
2234 		printf("%s: [%d(%s)]: vnode not VREG\n",
2235 		    __func__, proc_getpid(p), p->p_comm);
2236 		kr = KERN_FAILURE;
2237 		goto done;
2238 	}
2239 
2240 	/* get vnode size */
2241 	error = vnode_size(vp, &fs, vfs_context_current());
2242 	if (error) {
2243 		goto done;
2244 	}
2245 	file_size = fs;
2246 
2247 	/* get the file's memory object handle */
2248 	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2249 	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2250 		printf("%s: [%d(%s)]: no memory object\n",
2251 		    __func__, proc_getpid(p), p->p_comm);
2252 		kr = KERN_FAILURE;
2253 		goto done;
2254 	}
2255 
2256 	for (r = 0; r < region_count; ++r) {
2257 		rp = &regions[r];
2258 
2259 #if CONFIG_MACF
2260 		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2261 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2262 		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2263 		if (error) {
2264 			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2265 			    __func__, proc_getpid(p), p->p_comm, r, error);
2266 			kr = KERN_FAILURE;
2267 			goto done;
2268 		}
2269 #endif /* MAC */
2270 
2271 		/* check that the mappings are properly covered by code signatures */
2272 		if (cs_system_enforcement()) {
2273 			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2274 				printf("%s: [%d(%s)]: region %d, not code signed\n",
2275 				    __func__, proc_getpid(p), p->p_comm, r);
2276 				kr = KERN_FAILURE;
2277 				goto done;
2278 			}
2279 		}
2280 	}
2281 
2282 	/* update the vnode's access time */
2283 	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2284 		VATTR_INIT(&va);
2285 		nanotime(&va.va_access_time);
2286 		VATTR_SET_ACTIVE(&va, va_access_time);
2287 		vnode_setattr(vp, &va, vfs_context_current());
2288 	}
2289 
2290 	/* get the VM to do the work */
2291 	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2292 
2293 done:
2294 	if (fp != NULL) {
2295 		/* release the file descriptor */
2296 		fp_drop(p, fd, fp, 0);
2297 	}
2298 	if (vp != NULL) {
2299 		(void)vnode_put(vp);
2300 	}
2301 	if (regions != NULL) {
2302 		kfree_data(regions, region_count * sizeof(regions[0]));
2303 	}
2304 	/* on success, the pager took ownership of link_info and left it NULL */
2305 	if (link_info != NULL) {
2306 		kfree_data(link_info, link_info_size);
2307 	}
2308 
2309 	switch (kr) {
2310 	case KERN_SUCCESS:
2311 		return 0;
2312 	case KERN_RESOURCE_SHORTAGE:
2313 		return ENOMEM;
2314 	default:
2315 		return EINVAL;
2316 	}
2317 }
2318 
2319 #if DEBUG || DEVELOPMENT
2320 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2321     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2322 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2323     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2324 #endif /* DEBUG || DEVELOPMENT */
2325 
2326 /* sysctl overflow room */
2327 
2328 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2329     (int *) &page_size, 0, "vm page size");
2330 
2331 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2332  *       allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2333  *       reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2334 extern unsigned int     vm_page_free_target;
2335 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2336     &vm_page_free_target, 0, "Pageout daemon free target");
2337 
2338 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2339     &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2340 
2341 static int
2342 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2343 {
2344 #pragma unused(oidp, arg1, arg2)
2345 	unsigned int page_free_wanted;
2346 
2347 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2348 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2349 }
2350 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2351     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2352     0, 0, vm_ctl_page_free_wanted, "I", "");
2353 
2354 extern unsigned int     vm_page_purgeable_count;
2355 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2356     &vm_page_purgeable_count, 0, "Purgeable page count");
2357 
2358 extern unsigned int     vm_page_purgeable_wired_count;
2359 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2360     &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2361 
2362 extern unsigned int vm_page_kern_lpage_count;
2363 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2364     &vm_page_kern_lpage_count, 0, "kernel used large pages");
2365 
2366 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2367 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2368 
2369 #if DEVELOPMENT || DEBUG
2370 #if __ARM_MIXED_PAGE_SIZE__
2371 static int vm_mixed_pagesize_supported = 1;
2372 #else
2373 static int vm_mixed_pagesize_supported = 0;
2374 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2375 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2376     &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2377 
2378 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2379     &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2380 
2381 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2382     &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2383 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2384     &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2385 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2386     &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2387 
2388 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2389     &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2390 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2391     &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2392 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2393     &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated */
2394 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2395     &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2396 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2397     &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2398 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2399     &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");         /* sum of next two */
2400 #endif /* DEVELOPMENT || DEBUG */
2401 
2402 extern int madvise_free_debug;
2403 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2404     &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2405 extern int madvise_free_debug_sometimes;
2406 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2407     &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
2408 
2409 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2410     &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2411 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2412     &vm_page_stats_reusable.reusable_pages_success, "");
2413 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2414     &vm_page_stats_reusable.reusable_pages_failure, "");
2415 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2416     &vm_page_stats_reusable.reusable_pages_shared, "");
2417 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2418     &vm_page_stats_reusable.all_reusable_calls, "");
2419 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2420     &vm_page_stats_reusable.partial_reusable_calls, "");
2421 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2422     &vm_page_stats_reusable.reuse_pages_success, "");
2423 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2424     &vm_page_stats_reusable.reuse_pages_failure, "");
2425 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2426     &vm_page_stats_reusable.all_reuse_calls, "");
2427 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2428     &vm_page_stats_reusable.partial_reuse_calls, "");
2429 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2430     &vm_page_stats_reusable.can_reuse_success, "");
2431 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2432     &vm_page_stats_reusable.can_reuse_failure, "");
2433 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2434     &vm_page_stats_reusable.reusable_reclaimed, "");
2435 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2436     &vm_page_stats_reusable.reusable_nonwritable, "");
2437 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2438     &vm_page_stats_reusable.reusable_shared, "");
2439 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2440     &vm_page_stats_reusable.free_shared, "");
2441 
2442 
2443 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2444 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2445 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2446 
2447 extern unsigned int vm_page_cleaned_count;
2448 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2449 
2450 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2451 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2452 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2453 
2454 /* pageout counts */
2455 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2456 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2457 
2458 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2459 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2460 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2461 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2462 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2463 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2464 
2465 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2466 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2467 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2468 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2469 extern unsigned int vm_page_realtime_count;
2470 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2471 extern int vm_pageout_protect_realtime;
2472 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2473 
2474 /* counts of pages prefaulted when entering a memory object */
2475 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2476 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2477 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2478 
2479 #if defined (__x86_64__)
2480 extern unsigned int vm_clump_promote_threshold;
2481 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2482 #if DEVELOPMENT || DEBUG
2483 extern unsigned long vm_clump_stats[];
2484 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2485 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2486 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2487 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2488 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2489 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2490 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2491 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2492 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2493 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2494 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2495 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2496 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2497 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2498 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2499 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2500 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2501 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2502 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2503 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2504 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2505 #endif  /* if DEVELOPMENT || DEBUG */
2506 #endif  /* #if defined (__x86_64__) */
2507 
2508 #if CONFIG_SECLUDED_MEMORY
2509 
2510 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2511 extern unsigned int vm_page_secluded_target;
2512 extern unsigned int vm_page_secluded_count;
2513 extern unsigned int vm_page_secluded_count_free;
2514 extern unsigned int vm_page_secluded_count_inuse;
2515 extern unsigned int vm_page_secluded_count_over_target;
2516 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2517 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2518 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2519 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2520 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2521 
2522 extern struct vm_page_secluded_data vm_page_secluded;
2523 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2524 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2525 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2526 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2527 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2528 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2529 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2530 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2531 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2532 
2533 #endif /* CONFIG_SECLUDED_MEMORY */
2534 
2535 #if CONFIG_DEFERRED_RECLAIM
2536 #pragma mark Deferred Reclaim
2537 SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
2538 #if DEVELOPMENT || DEBUG
2539 /*
2540  * VM reclaim testing
2541  */
2542 extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);
2543 
2544 static int
2545 sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
2546 {
2547 	int error = EINVAL, pid = 0;
2548 	/*
2549 	 * Only act on a write
2550 	 */
2551 	error = sysctl_handle_int(oidp, &pid, 0, req);
2552 	if (error || !req->newptr) {
2553 		return error;
2554 	}
2555 	if (pid <= 0) {
2556 		return EINVAL;
2557 	}
2558 	proc_t p = proc_find(pid);
2559 	if (p == PROC_NULL) {
2560 		return ESRCH;
2561 	}
2562 	task_t t = proc_task(p);
2563 	if (t == TASK_NULL) {
2564 		proc_rele(p);
2565 		return ESRCH;
2566 	}
2567 	task_reference(t);
2568 	proc_rele(p);
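	/* the task reference taken above keeps the task alive once the proc ref is dropped */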
2569 
2570 	bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
2571 	if (success) {
2572 		error = 0;
2573 	}
2574 	task_deallocate(t);
2575 
2576 	return error;
2577 }
2578 
2579 SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
2580     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2581     &sysctl_vm_reclaim_wait_for_pid, "I",
2582     "Block until the given pid has been drained by kernel GC");
2583 
2584 static int
2585 sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
2586 {
2587 	int error = EINVAL;
2588 	kern_return_t kr;
2589 	pid_t pid;
2590 	error = sysctl_handle_int(oidp, &pid, 0, req);
2591 	/* Only reclaim on write */
2592 	if (error || !req->newptr) {
2593 		return error;
2594 	}
2595 	if (pid <= 0) {
2596 		return EINVAL;
2597 	}
2598 	proc_t p = proc_find(pid);
2599 	if (p == PROC_NULL) {
2600 		return ESRCH;
2601 	}
2602 	task_t t = proc_task(p);
2603 	if (t == TASK_NULL) {
2604 		proc_rele(p);
2605 		return ESRCH;
2606 	}
2607 	task_reference(t);
2608 	proc_rele(p);
2609 	kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
2610 	task_deallocate(t);
2611 	return mach_to_bsd_errno(kr);
2612 }
2613 
2614 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
2615     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2616     &sysctl_vm_reclaim_drain_pid, "I",
2617     "Drain the deferred reclamation buffer for a pid");
2618 
2619 static int
2620 proc_filter_reclaimable(proc_t p, __unused void *arg)
2621 {
2622 	task_t task = proc_task(p);
2623 	return vm_deferred_reclamation_task_has_ring(task);
2624 }
2625 
2626 static int
2627 proc_reclaim_drain(proc_t p, __unused void *arg)
2628 {
2629 	kern_return_t kr;
2630 	task_t task = proc_task(p);
2631 	kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
2632 	return mach_to_bsd_errno(kr);
2633 }
2634 
2635 static int
2636 sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
2637 {
2638 	int error;
2639 	int val;
2640 	if (!req->newptr) {
2641 		return EINVAL;
2642 	}
2643 	error = sysctl_handle_int(oidp, &val, 0, req);
2644 	if (error || val == FALSE) {
2645 		return error;
2646 	}
2647 	proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
2648 	    proc_filter_reclaimable, NULL);
2649 	return 0;
2650 }
2651 
2652 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
2653     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2654     &sysctl_vm_reclaim_drain_all, "I",
2655     "Fully reclaim from every deferred reclamation buffer on the system");
2656 
2657 extern uint32_t vm_reclaim_buffer_count;
2658 extern uint64_t vm_reclaim_gc_epoch;
2659 extern uint64_t vm_reclaim_gc_reclaim_count;
2660 #if XNU_TARGET_OS_IOS
2661 extern uint64_t vm_reclaim_max_threshold;
2662 #else /* !XNU_TARGET_OS_IOS */
2663 extern bool vm_reclaim_debug;
2664 extern bool vm_reclaim_enabled;
2665 extern uint64_t vm_reclaim_sampling_period_ns;
2666 extern uint64_t vm_reclaim_sampling_period_abs;
2667 extern uint32_t vm_reclaim_autotrim_pct_normal;
2668 extern uint32_t vm_reclaim_autotrim_pct_pressure;
2669 extern uint32_t vm_reclaim_autotrim_pct_critical;
2670 extern uint32_t vm_reclaim_wma_weight_base;
2671 extern uint32_t vm_reclaim_wma_weight_cur;
2672 extern uint32_t vm_reclaim_wma_denom;
2673 extern uint64_t vm_reclaim_abandonment_threshold;
2674 #endif /* XNU_TARGET_OS_IOS */
2675 
2676 SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
2677     CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
2678     "The number of deferred memory buffers currently alive");
2679 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
2680     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
2681     "Number of times the global GC thread has run");
2682 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
2683     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
2684     "Number of times the global GC thread has reclaimed from a buffer");
2685 #if XNU_TARGET_OS_IOS
2686 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, max_threshold,
2687     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_max_threshold,
2688     "Maximum amount of virtual memory (in B) that may be deferred without "
2689     "synchronous reclamation");
2690 #else /* !XNU_TARGET_OS_IOS */
2691 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
2692     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
2693     "Whether deferred memory reclamation is enabled on this system");
2694 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
2695     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
2696     "Whether vm.reclaim debug logs are enabled");
2697 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
2698     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
2699     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2700     "to engage auto-trim when the system is operating normally");
2701 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
2702     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
2703     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2704     "to engage auto-trim when the system is under memory pressure");
2705 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
2706     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
2707     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2708     "to engage auto-trim when the system is under critical memory pressure");
2709 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
2710     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
2711     "Weight applied to historical minimum buffer size samples");
2712 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
2713     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
2714     "Weight applied to current sampled minimum buffer size");
2715 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
2716     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
2717     "Denominator for weighted moving average calculation");
2718 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
2719     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
2720     "The number of sampling periods between accounting updates that may elapse "
2721     "before the buffer is considered \"abandoned\"");
2722 
2723 static int
2724 sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
2725 {
2726 	uint64_t new_val_ns;
2727 	uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
2728 	int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
2729 	    sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
2730 	if (err || !req->newptr) {
2731 		return err;
2732 	}
2733 	if (new_val_ns != old_val_ns) {
2734 		vm_reclaim_sampling_period_ns = new_val_ns;
2735 		nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
2736 	}
2737 	return 0;
2738 }
2739 
2740 SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
2741     CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "I",
2742     "Interval (nanoseconds) at which to sample the minimum buffer size and "
2743     "consider trimming excess");
2744 #endif /* XNU_TARGET_OS_IOS */
2745 #endif /* DEVELOPMENT || DEBUG */
2746 #endif /* CONFIG_DEFERRED_RECLAIM */
2747 
2748 #include <kern/thread.h>
2749 #include <sys/user.h>
2750 
2751 void vm_pageout_io_throttle(void);
2752 
2753 void
2754 vm_pageout_io_throttle(void)
2755 {
2756 	struct uthread *uthread = current_uthread();
2757 
2758 	/*
2759 	 * If the thread is marked as a low priority I/O type
2760 	 * and the I/O we issued while in this cleaning operation
2761 	 * collided with normal I/O operations, we'll
2762 	 * delay in order to mitigate the impact of this
2763 	 * task on the normal operation of the system.
2764 	 */
2765 
2766 	if (uthread->uu_lowpri_window) {
2767 		throttle_lowpri_io(1);
2768 	}
2769 }
2770 
2771 int
2772 vm_pressure_monitor(
2773 	__unused struct proc *p,
2774 	struct vm_pressure_monitor_args *uap,
2775 	int *retval)
2776 {
2777 	kern_return_t   kr;
2778 	uint32_t        pages_reclaimed;
2779 	uint32_t        pages_wanted;
2780 
2781 	kr = mach_vm_pressure_monitor(
2782 		(boolean_t) uap->wait_for_pressure,
2783 		uap->nsecs_monitored,
2784 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2785 		&pages_wanted);
2786 
2787 	switch (kr) {
2788 	case KERN_SUCCESS:
2789 		break;
2790 	case KERN_ABORTED:
2791 		return EINTR;
2792 	default:
2793 		return EINVAL;
2794 	}
2795 
2796 	if (uap->pages_reclaimed) {
2797 		if (copyout((void *)&pages_reclaimed,
2798 		    uap->pages_reclaimed,
2799 		    sizeof(pages_reclaimed)) != 0) {
2800 			return EFAULT;
2801 		}
2802 	}
2803 
2804 	*retval = (int) pages_wanted;
2805 	return 0;
2806 }
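/*
 * A hedged userspace sketch, assuming a wrapper that matches the argument
 * layout above (the declaration itself is an assumption, not taken from a
 * public header):
 *
 *	extern int vm_pressure_monitor(int wait_for_pressure,
 *	    int nsecs_monitored, uint32_t *pages_reclaimed);
 *
 *	uint32_t reclaimed = 0;
 *	// blocks until memory pressure, then reports pages wanted by pageout
 *	int pages_wanted = vm_pressure_monitor(1, 0, &reclaimed);
 *	if (pages_wanted < 0) {
 *		// errno is EINTR if the wait was aborted, EINVAL/EFAULT otherwise
 *	}
 */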
2807 
2808 int
2809 kas_info(struct proc *p,
2810     struct kas_info_args *uap,
2811     int *retval __unused)
2812 {
2813 #ifndef CONFIG_KAS_INFO
2814 	(void)p;
2815 	(void)uap;
2816 	return ENOTSUP;
2817 #else /* CONFIG_KAS_INFO */
2818 	int                     selector = uap->selector;
2819 	user_addr_t     valuep = uap->value;
2820 	user_addr_t     sizep = uap->size;
2821 	user_size_t size, rsize;
2822 	int                     error;
2823 
2824 	if (!kauth_cred_issuser(kauth_cred_get())) {
2825 		return EPERM;
2826 	}
2827 
2828 #if CONFIG_MACF
2829 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
2830 	if (error) {
2831 		return error;
2832 	}
2833 #endif
2834 
2835 	if (IS_64BIT_PROCESS(p)) {
2836 		user64_size_t size64;
2837 		error = copyin(sizep, &size64, sizeof(size64));
2838 		size = (user_size_t)size64;
2839 	} else {
2840 		user32_size_t size32;
2841 		error = copyin(sizep, &size32, sizeof(size32));
2842 		size = (user_size_t)size32;
2843 	}
2844 	if (error) {
2845 		return error;
2846 	}
2847 
2848 	switch (selector) {
2849 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2850 	{
2851 		uint64_t slide = vm_kernel_slide;
2852 
2853 		if (sizeof(slide) != size) {
2854 			return EINVAL;
2855 		}
2856 
2857 		error = copyout(&slide, valuep, sizeof(slide));
2858 		if (error) {
2859 			return error;
2860 		}
2861 		rsize = size;
2862 	}
2863 	break;
2864 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2865 	{
2866 		uint32_t i;
2867 		kernel_mach_header_t *mh = &_mh_execute_header;
2868 		struct load_command *cmd;
2869 		cmd = (struct load_command*) &mh[1];
2870 		uint64_t *bases;
2871 		rsize = mh->ncmds * sizeof(uint64_t);
2872 
2873 		/*
2874 		 * Return the size if no data was passed
2875 		 */
2876 		if (valuep == 0) {
2877 			break;
2878 		}
2879 
2880 		if (rsize > size) {
2881 			return EINVAL;
2882 		}
2883 
2884 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
2885 
2886 		for (i = 0; i < mh->ncmds; i++) {
2887 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
2888 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
2889 				bases[i] = (uint64_t)sg->vmaddr;
2890 			}
2891 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
2892 		}
2893 
2894 		error = copyout(bases, valuep, rsize);
2895 
2896 		kfree_data(bases, rsize);
2897 
2898 		if (error) {
2899 			return error;
2900 		}
2901 	}
2902 	break;
2903 	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
2904 	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
2905 	{
2906 #if CONFIG_SPTM
2907 		const uint64_t slide =
2908 		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
2909 #else
2910 		const uint64_t slide = 0;
2911 #endif
2912 
2913 		if (sizeof(slide) != size) {
2914 			return EINVAL;
2915 		}
2916 
2917 		error = copyout(&slide, valuep, sizeof(slide));
2918 		if (error) {
2919 			return error;
2920 		}
2921 		rsize = size;
2922 	}
2923 	break;
2924 	default:
2925 		return EINVAL;
2926 	}
2927 
2928 	if (IS_64BIT_PROCESS(p)) {
2929 		user64_size_t size64 = (user64_size_t)rsize;
2930 		error = copyout(&size64, sizep, sizeof(size64));
2931 	} else {
2932 		user32_size_t size32 = (user32_size_t)rsize;
2933 		error = copyout(&size32, sizep, sizeof(size32));
2934 	}
2935 
2936 	return error;
2937 #endif /* CONFIG_KAS_INFO */
2938 }
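/*
 * A minimal, root-only usage sketch for the text-slide selector, assuming
 * the prototype from <sys/kas_info.h>:
 *
 *	uint64_t slide = 0;
 *	size_t size = sizeof(slide);   // must be exactly sizeof(uint64_t)
 *	if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) == 0) {
 *		// slide now holds vm_kernel_slide
 *	}
 */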
2939 
2940 #pragma clang diagnostic push
2941 #pragma clang diagnostic ignored "-Wcast-qual"
2942 #pragma clang diagnostic ignored "-Wunused-function"
2943 
2944 static void
2945 asserts()
2946 {
2947 	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
2948 	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
2949 }
2950 
2951 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
2952 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
2953 #pragma clang diagnostic pop
2954 
2955 extern uint32_t vm_page_pages;
2956 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
2957 
2958 extern uint32_t vm_page_busy_absent_skipped;
2959 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
2960 
2961 extern uint32_t vm_page_upl_tainted;
2962 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
2963 
2964 extern uint32_t vm_page_iopl_tainted;
2965 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
2966 
2967 #if __arm64__ && (DEVELOPMENT || DEBUG)
2968 extern int vm_footprint_suspend_allowed;
2969 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
2970 
2971 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
2972 static int
2973 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
2974 {
2975 #pragma unused(oidp, arg1, arg2)
2976 	int error = 0;
2977 	int new_value;
2978 
2979 	if (req->newptr == USER_ADDR_NULL) {
2980 		return 0;
2981 	}
2982 	error = SYSCTL_IN(req, &new_value, sizeof(int));
2983 	if (error) {
2984 		return error;
2985 	}
2986 	if (!vm_footprint_suspend_allowed) {
2987 		if (new_value != 0) {
2988 			/* suspends are not allowed... */
2989 			return 0;
2990 		}
2991 		/* ... but let resumes proceed */
2992 	}
2993 	DTRACE_VM2(footprint_suspend,
2994 	    vm_map_t, current_map(),
2995 	    int, new_value);
2996 
2997 	pmap_footprint_suspend(current_map(), new_value);
2998 
2999 	return 0;
3000 }
3001 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
3002     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3003     0, 0, &sysctl_vm_footprint_suspend, "I", "");
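/*
 * [Editorial sketch, not part of the original file] How a test might drive
 * the write-only vm.footprint_suspend sysctl above; only meaningful on
 * arm64 DEVELOPMENT/DEBUG kernels, and suspends are silently ignored
 * unless vm.footprint_suspend_allowed is set.
 */
#if 0
#include <sys/sysctl.h>

/* non-zero suspends phys_footprint accounting for the calling task,
 * zero resumes it */
static int
set_footprint_suspend(int suspend)
{
	return sysctlbyname("vm.footprint_suspend",
	           NULL, NULL, &suspend, sizeof(suspend));
}
#endif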
3004 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
3005 
3006 extern uint64_t vm_map_corpse_footprint_count;
3007 extern uint64_t vm_map_corpse_footprint_size_avg;
3008 extern uint64_t vm_map_corpse_footprint_size_max;
3009 extern uint64_t vm_map_corpse_footprint_full;
3010 extern uint64_t vm_map_corpse_footprint_no_buf;
3011 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
3012     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
3013 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
3014     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
3015 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
3016     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
3017 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
3018     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
3019 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
3020     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
3021 
3022 #if CODE_SIGNING_MONITOR
3023 extern uint64_t vm_cs_defer_to_csm;
3024 extern uint64_t vm_cs_defer_to_csm_not;
3025 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
3026     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
3027 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
3028     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
3029 #endif /* CODE_SIGNING_MONITOR */
3030 
3031 extern uint64_t shared_region_pager_copied;
3032 extern uint64_t shared_region_pager_slid;
3033 extern uint64_t shared_region_pager_slid_error;
3034 extern uint64_t shared_region_pager_reclaimed;
3035 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
3036     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
3037 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
3038     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
3039 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
3040     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
3041 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
3042     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
3043 extern int shared_region_destroy_delay;
3044 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
3045     CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
3046 
3047 #if MACH_ASSERT
3048 extern int pmap_ledgers_panic_leeway;
3049 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
3050 #endif /* MACH_ASSERT */
3051 
3052 
3053 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
3054 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
3055 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
3056 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
3057 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
3058 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
3059 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
3060 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
3061 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
3062 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
3063 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
3064 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
3065 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
3066 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
3067     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
3068 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
3069     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
3070 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
3071     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
3072 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
3073     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
3074 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
3075     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
3076 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
3077     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
3078 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
3079     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
3080 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
3081     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
3082 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
3083     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
3084 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
3085     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
3086 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
3087     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
3088 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
3089     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
3090 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
3091     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
3092 
3093 extern int vm_protect_privileged_from_untrusted;
3094 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
3095     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
3096 extern uint64_t vm_copied_on_read;
3097 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
3098     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
3099 
3100 extern int vm_shared_region_count;
3101 extern int vm_shared_region_peak;
3102 SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
3103     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
3104 SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
3105     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
3106 #if DEVELOPMENT || DEBUG
3107 extern unsigned int shared_region_pagers_resident_count;
3108 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
3109     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
3110 extern unsigned int shared_region_pagers_resident_peak;
3111 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
3112     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
3113 extern int shared_region_pager_count;
3114 SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
3115     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
3116 #if __has_feature(ptrauth_calls)
3117 extern int shared_region_key_count;
3118 SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
3119     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
3120 extern int vm_shared_region_reslide_count;
3121 SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
3122     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
3123 #endif /* __has_feature(ptrauth_calls) */
3124 #endif /* DEVELOPMENT || DEBUG */
3125 
3126 #if MACH_ASSERT
3127 extern int debug4k_filter;
3128 SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
3129 extern int debug4k_panic_on_terminate;
3130 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
3131 extern int debug4k_panic_on_exception;
3132 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
3133 extern int debug4k_panic_on_misaligned_sharing;
3134 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
3135 #endif /* MACH_ASSERT */
3136 
3137 extern uint64_t vm_map_set_size_limit_count;
3138 extern uint64_t vm_map_set_data_limit_count;
3139 extern uint64_t vm_map_enter_RLIMIT_AS_count;
3140 extern uint64_t vm_map_enter_RLIMIT_DATA_count;
3141 SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
3142 SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
3143 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
3144 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
3145 
3146 extern uint64_t vm_fault_resilient_media_initiate;
3147 extern uint64_t vm_fault_resilient_media_retry;
3148 extern uint64_t vm_fault_resilient_media_proceed;
3149 extern uint64_t vm_fault_resilient_media_release;
3150 extern uint64_t vm_fault_resilient_media_abort1;
3151 extern uint64_t vm_fault_resilient_media_abort2;
3152 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
3153 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
3154 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
3155 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
3156 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
3157 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
3158 #if MACH_ASSERT
3159 extern int vm_fault_resilient_media_inject_error1_rate;
3160 extern int vm_fault_resilient_media_inject_error1;
3161 extern int vm_fault_resilient_media_inject_error2_rate;
3162 extern int vm_fault_resilient_media_inject_error2;
3163 extern int vm_fault_resilient_media_inject_error3_rate;
3164 extern int vm_fault_resilient_media_inject_error3;
3165 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
3166 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
3167 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
3168 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
3169 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
3170 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
3171 #endif /* MACH_ASSERT */
3172 
3173 extern uint64_t pmap_query_page_info_retries;
3174 SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
3175 
3176 /*
3177  * A sysctl which causes all existing shared regions to become stale. They
3178  * will no longer be used by anything new and will be torn down as soon as
3179  * the last existing user exits. A write of non-zero value causes that to happen.
3180  * This should only be used by launchd, so we check that this is initproc.
3181  */
3182 static int
3183 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3184 {
3185 	unsigned int value = 0;
3186 	int changed = 0;
3187 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3188 	if (error || !changed) {
3189 		return error;
3190 	}
3191 	if (current_proc() != initproc) {
3192 		return EPERM;
3193 	}
3194 
3195 	vm_shared_region_pivot();
3196 
3197 	return 0;
3198 }
3199 
3200 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
3201     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
3202     0, 0, shared_region_pivot, "I", "");
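/*
 * [Editorial sketch, not part of the original file] Triggering the pivot
 * from user space. Per the handler above, any caller other than launchd
 * (initproc) should expect EPERM, so this is only a shape illustration.
 */
#if 0
#include <sys/sysctl.h>

static int
pivot_shared_regions(void)
{
	int one = 1;

	/* any non-zero write makes existing shared regions stale */
	return sysctlbyname("vm.shared_region_pivot",
	           NULL, NULL, &one, sizeof(one));
}
#endif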
3203 
3204 extern uint64_t vm_object_shadow_forced;
3205 extern uint64_t vm_object_shadow_skipped;
3206 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
3207     &vm_object_shadow_forced, "");
3208 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
3209     &vm_object_shadow_skipped, "");
3210 
3211 
3212 
3213 SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
3214     &vmtc_total, 0, "total text page corruptions detected");
3215 
3216 
3217 #if DEBUG || DEVELOPMENT
3218 /*
3219  * A sysctl that can be used to corrupt a text page with an illegal instruction.
3220  * Used for testing text page self healing.
3221  */
3222 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
3223 static int
3224 corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3225 {
3226 	uint64_t value = 0;
3227 	int error = sysctl_handle_quad(oidp, &value, 0, req);
3228 	if (error || !req->newptr) {
3229 		return error;
3230 	}
3231 
3232 	if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
3233 		return 0;
3234 	} else {
3235 		return EINVAL;
3236 	}
3237 }
3238 
3239 SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
3240     CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3241     0, 0, corrupt_text_addr, "-", "");
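/*
 * [Editorial sketch, not part of the original file] Invoking the text-page
 * corruption test hook above from user space on a DEBUG/DEVELOPMENT kernel.
 * The handler reads a quad, so the new value must be 8 bytes wide.
 */
#if 0
#include <stdint.h>
#include <sys/sysctl.h>

static int
corrupt_text_page(uint64_t addr)
{
	return sysctlbyname("vm.corrupt_text_addr",
	           NULL, NULL, &addr, sizeof(addr));
}
#endif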
3242 #endif /* DEBUG || DEVELOPMENT */
3243 
3244 #if CONFIG_MAP_RANGES
3245 /*
3246  * vm.malloc_ranges
3247  *
3248  * space-separated list of <left:right> hexadecimal addresses.
3249  */
3250 static int
3251 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
3252 {
3253 	vm_map_t map = current_map();
3254 	struct mach_vm_range r1, r2;
3255 	char str[20 * 4];
3256 	int len;
3257 	mach_vm_offset_t right_hole_max;
3258 
3259 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
3260 		return ENOENT;
3261 	}
3262 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
3263 		return ENOENT;
3264 	}
3265 
3266 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
3267 	right_hole_max = MACH_VM_JUMBO_ADDRESS;
3268 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
3269 	right_hole_max = get_map_max(map);
3270 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
3271 
3272 	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
3273 	    r1.max_address, r2.min_address,
3274 	    r2.max_address, right_hole_max);
3275 
3276 	return SYSCTL_OUT(req, str, len);
3277 }
3278 
3279 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
3280     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3281     0, 0, &vm_map_malloc_ranges, "A", "");
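/*
 * [Editorial sketch, not part of the original file] Fetching the two
 * "<left:right>" hole descriptions emitted by the handler above; the
 * buffer size mirrors the handler's char str[20 * 4]. Note the handler
 * does not copy out a NUL terminator, hence the %.*s format.
 */
#if 0
#include <stdio.h>
#include <sys/sysctl.h>

static void
print_malloc_ranges(void)
{
	char str[20 * 4];
	size_t len = sizeof(str);

	if (sysctlbyname("vm.malloc_ranges", str, &len, NULL, 0) == 0) {
		printf("%.*s\n", (int)len, str);
	}
}
#endif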
3282 
3283 #if DEBUG || DEVELOPMENT
3284 static int
3285 vm_map_user_range_default SYSCTL_HANDLER_ARGS
3286 {
3287 #pragma unused(arg1, arg2, oidp)
3288 	struct mach_vm_range range;
3289 
3290 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
3291 	    != KERN_SUCCESS) {
3292 		return EINVAL;
3293 	}
3294 
3295 	return SYSCTL_OUT(req, &range, sizeof(range));
3296 }
3297 
3298 static int
3299 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3300 {
3301 #pragma unused(arg1, arg2, oidp)
3302 	struct mach_vm_range range;
3303 
3304 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3305 	    != KERN_SUCCESS) {
3306 		return EINVAL;
3307 	}
3308 
3309 	return SYSCTL_OUT(req, &range, sizeof(range));
3310 }
3311 
3312 static int
3313 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3314 {
3315 #pragma unused(arg1, arg2, oidp)
3316 	struct mach_vm_range range;
3317 
3318 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3319 	    != KERN_SUCCESS) {
3320 		return EINVAL;
3321 	}
3322 
3323 	return SYSCTL_OUT(req, &range, sizeof(range));
3324 }
3325 
3326 /*
3327  * A sysctl that can be used to return ranges for the current VM map.
3328  * Used for testing VM ranges.
3329  */
3330 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3331     0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
3332 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3333     0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
3334 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3335     0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
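/*
 * [Editorial sketch, not part of the original file] Reading one of the
 * struct sysctls above. The local struct below is an assumed mirror of
 * struct mach_vm_range (two 64-bit addresses) as copied out by the handler.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <sys/sysctl.h>

struct user_vm_range {
	uint64_t min_address;
	uint64_t max_address;
};

static void
print_heap_range(void)
{
	struct user_vm_range r;
	size_t len = sizeof(r);

	if (sysctlbyname("vm.vm_map_user_range_heap", &r, &len, NULL, 0) == 0) {
		printf("heap: 0x%llx-0x%llx\n",
		    (unsigned long long)r.min_address,
		    (unsigned long long)r.max_address);
	}
}
#endif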
3336 
3337 #endif /* DEBUG || DEVELOPMENT */
3338 #endif /* CONFIG_MAP_RANGES */
3339 
3340 #if DEBUG || DEVELOPMENT
3341 #endif /* DEBUG || DEVELOPMENT */
3342 
3343 extern uint64_t vm_map_range_overflows_count;
3344 SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
3345 extern boolean_t vm_map_range_overflows_log;
3346 SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3347 
3348 extern uint64_t c_seg_filled_no_contention;
3349 extern uint64_t c_seg_filled_contention;
3350 extern clock_sec_t c_seg_filled_contention_sec_max;
3351 extern clock_nsec_t c_seg_filled_contention_nsec_max;
3352 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
3353 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
3354 SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
3355 SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
3356 #if (XNU_TARGET_OS_OSX && __arm64__)
3357 extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
3358 extern int c_process_major_yield_after; /* yield after moving ? segments */
3359 extern uint64_t c_process_major_reports;
3360 extern clock_sec_t c_process_major_max_sec;
3361 extern clock_nsec_t c_process_major_max_nsec;
3362 extern uint32_t c_process_major_peak_segcount;
3363 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
3364 SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
3365 SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
3366 SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
3367 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
3368 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
3369 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3370 
3371 #if DEVELOPMENT || DEBUG
3372 extern int panic_object_not_alive;
3373 SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
3374 #endif /* DEVELOPMENT || DEBUG */
3375 
3376 #if FBDP_DEBUG_OBJECT_NO_PAGER
3377 extern int fbdp_no_panic;
3378 SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
3379 #endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
3380 
3381 extern uint64_t cluster_direct_write_wired;
3382 SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");
3383 
3384 
3385 #if DEVELOPMENT || DEBUG
3386 
3387 static uint32_t
3388 sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
3389 {
3390 #pragma unused(with_data)
3391 	return VM_C_SEGMENT_INFO_MAGIC;
3392 }
3393 
3394 /* The largest possible single segment + its slots is
3395  * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
3396 #define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)
3397 
3398 extern uint32_t c_segments_available;
3399 
3400 struct sysctl_buf_header {
3401 	uint32_t magic;
3402 } __attribute__((packed));
3403 
3404 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3405  * Instead of doing everything here, the work is delegated to a helper in vm_compressor.c. */
3406 static int
3407 sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
3408 {
3409 	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3410 	if (!buf) {
3411 		return ENOMEM;
3412 	}
3413 	size_t offset = 0;
3414 	int error = 0;
3415 	int segno = 0;
3416 	/* 4 byte header to identify the version of the formatting of the data.
3417 	 * This should be incremented if c_segment_info or c_slot_info are changed */
3418 	((struct sysctl_buf_header*)buf)->magic = sysctl_compressor_seg_magic(with_data);
3419 	offset += sizeof(uint32_t);
3420 
3421 	while (segno < c_segments_available) {
3422 		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3423 		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
3424 		if (kr == KERN_NO_SPACE) {
3425 			/* failed to add another segment, push the current buffer out and try again */
3426 			if (offset == 0) {
3427 				error = EINVAL; /* no space to write but I didn't write anything, shouldn't really happen */
3428 				goto out;
3429 			}
3430 			/* write out chunk */
3431 			error = SYSCTL_OUT(req, buf, offset);
3432 			if (error) {
3433 				goto out;
3434 			}
3435 			offset = 0;
3436 			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3437 			/* don't increment segno, need to try again saving the current one */
3438 		} else if (kr != KERN_SUCCESS) {
3439 			error = EINVAL;
3440 			goto out;
3441 		} else {
3442 			offset += left_sz;
3443 			++segno;
3444 			assert(offset <= SYSCTL_SEG_BUF_SIZE);
3445 		}
3446 	}
3447 
3448 	if (offset > 0) { /* write last chunk */
3449 		error = SYSCTL_OUT(req, buf, offset);
3450 	}
3451 
3452 out:
3453 	kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
3454 	return error;
3455 }
3456 
3457 static int
3458 sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3459 {
3460 	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
3461 }
3462 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
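/*
 * [Editorial sketch, not part of the original file] A possible reader for
 * the streamed segment info above, checking the leading 4-byte version
 * magic. There is no reliable size probe for this streaming handler, so
 * the 32 MB cap below is an arbitrary assumption.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysctl.h>

static void
dump_compressor_segments(void)
{
	size_t len = 32 * 1024 * 1024;
	char *buf = malloc(len);

	if (buf == NULL) {
		return;
	}
	if (sysctlbyname("vm.compressor_segments", buf, &len, NULL, 0) == 0 &&
	    len >= sizeof(uint32_t)) {
		uint32_t magic;
		memcpy(&magic, buf, sizeof(magic));
		printf("magic 0x%x, %zu bytes of segment info\n", magic, len);
	}
	free(buf);
}
#endif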
3463 
3464 
3465 extern uint32_t vm_compressor_fragmentation_level(void);
3466 
3467 static int
3468 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3469 {
3470 	uint32_t value = vm_compressor_fragmentation_level();
3471 	return SYSCTL_OUT(req, &value, sizeof(value));
3472 }
3473 
3474 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3475 
3476 extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);
3477 
3478 static int
3479 sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3480 {
3481 	uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
3482 	return SYSCTL_OUT(req, &value, sizeof(value));
3483 }
3484 
3485 SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");
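/*
 * [Editorial sketch, not part of the original file] Reading both
 * fragmentation gauges exported just above in a single helper.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <sys/sysctl.h>

static void
print_compressor_fragmentation(void)
{
	uint32_t level = 0, wasted = 0;
	size_t len = sizeof(uint32_t);

	sysctlbyname("vm.compressor_fragmentation_level", &level, &len, NULL, 0);
	len = sizeof(uint32_t);
	sysctlbyname("vm.compressor_incore_fragmentation_wasted_pages", &wasted, &len, NULL, 0);
	printf("fragmentation level %u, wasted pages %u\n", level, wasted);
}
#endif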
3486 
3487 
3488 
3489 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3490 
3491 
3492 /* This sysctl iterates over all the entries of the vm_map of a given process and writes some info about the vm_object pointed to by each entry.
3493  * This can be used to map where all the pages of a process are located in the compressor.
3494  */
3495 static int
3496 sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3497 {
3498 	int error = 0;
3499 	char *buf = NULL;
3500 	proc_t p = PROC_NULL;
3501 	task_t task = TASK_NULL;
3502 	vm_map_t map = VM_MAP_NULL;
3503 	__block size_t offset = 0;
3504 
3505 	/* go from pid to proc to task to vm_map. see sysctl_procargsx() for another example of this progression */
3506 	int *name = arg1;
3507 	int namelen = arg2;
3508 	if (namelen < 1) {
3509 		return EINVAL;
3510 	}
3511 	int pid = name[0];
3512 	p = proc_find(pid);  /* this increments a reference to the proc */
3513 	if (p == PROC_NULL) {
3514 		return EINVAL;
3515 	}
3516 	task = proc_task(p);
3517 	proc_rele(p);  /* decrement ref of proc */
3518 	p = PROC_NULL;
3519 	if (task == TASK_NULL) {
3520 		return EINVAL;
3521 	}
3522 	/* convert proc reference to task reference */
3523 	task_reference(task);
3524 	/* task reference to map reference */
3525 	map = get_task_map_reference(task);
3526 	task_deallocate(task);
3527 
3528 	if (map == VM_MAP_NULL) {
3529 		return EINVAL;  /* nothing allocated yet */
3530 	}
3531 
3532 	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
3533 	if (!buf) {
3534 		error = ENOMEM;
3535 		goto out;
3536 	}
3537 
3538 	/* 4 byte header to identify the version of the formatting of the data.
3539 	 * This should be incremented if the vm_map entry info structures are changed */
3540 	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
3541 	offset += sizeof(uint32_t);
3542 
3543 	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
3544 		/* write the header, happens only once at the beginning so we should have enough space */
3545 		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3546 		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
3547 		out_hdr->vmi_nentries = nentries;
3548 		offset += sizeof(struct vm_map_info_hdr);
3549 		return KERN_SUCCESS;
3550 	};
3551 
3552 	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
3553 		while (true) { /* try up to 2 times: first try writing into the current buffer; if it is full, flush it and retry into an empty one */
3554 			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
3555 			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
3556 			if (kr == KERN_NO_SPACE) {
3557 				/* failed to write anything, flush the current buffer and try again */
3558 				if (offset == 0) {
3559 					return KERN_FAILURE; /* no space to write but I didn't write anything yet, shouldn't really happen */
3560 				}
3561 				/* write out chunk */
3562 				int out_error = SYSCTL_OUT(req, buf, offset);
3563 				if (out_error) {
3564 					return KERN_FAILURE;
3565 				}
3566 				offset = 0;
3567 				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3568 				continue; /* need to retry the entry dump again with the cleaned buffer */
3569 			} else if (kr != KERN_SUCCESS) {
3570 				return kr;
3571 			}
3572 			offset += left_sz;
3573 			break;
3574 		}
3575 		return KERN_SUCCESS;
3576 	};
3577 
3578 	/* this foreach first calls the header callback with the number of entries, then calls the entry callback for every entry;
3579 	 * when the buffer is exhausted, it is flushed to the sysctl and restarted */
3580 	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);
3581 
3582 	if (kr != KERN_SUCCESS) {
3583 		goto out;
3584 	}
3585 
3586 	if (offset > 0) { /* last chunk */
3587 		error = SYSCTL_OUT(req, buf, offset);
3588 	}
3589 
3590 out:
3591 	if (buf != NULL) {
3592 		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3593 	}
3594 	if (map != NULL) {
3595 		vm_map_deallocate(map);
3596 	}
3597 	return error;
3598 }
3599 
3600 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
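/*
 * [Editorial sketch, not part of the original file] The handler above is a
 * CTLTYPE_NODE, so the target pid travels as an extra MIB component (the
 * name[0] the handler reads from arg1), in the style of kern.procargs2.
 * The 16 MB buffer cap below is an arbitrary assumption.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <sys/sysctl.h>

static int
dump_task_slotmap(int pid)
{
	int mib[CTL_MAXNAME + 1];
	size_t miblen = CTL_MAXNAME;
	size_t len = 16 * 1024 * 1024;
	char *buf;
	int rc;

	if (sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen) != 0) {
		return -1;
	}
	mib[miblen] = pid;  /* append the pid as the final MIB component */
	buf = malloc(len);
	if (buf == NULL) {
		return -1;
	}
	rc = sysctl(mib, (u_int)(miblen + 1), buf, &len, NULL, 0);
	if (rc == 0) {
		printf("%zu bytes of slotmap data for pid %d\n", len, pid);
	}
	free(buf);
	return rc;
}
#endif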
3601 
3602 
3603 
3604 #endif /* DEVELOPMENT || DEBUG */
3605