/* xref: /xnu-11215.81.4/bsd/vm/vm_unix.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452) */
/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1987 Carnegie-Mellon University
 * All rights reserved.  The CMU software License Agreement specifies
 * the terms and conditions for use and redistribution.
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <vm/vm_options.h>

#include <kern/ecc.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/debug.h>
#include <kern/extmod_statistics.h>
#include <mach/mach_traps.h>
#include <mach/port.h>
#include <mach/sdt.h>
#include <mach/task.h>
#include <mach/task_access.h>
#include <mach/task_special_ports.h>
#include <mach/time_value.h>
#include <mach/vm_map.h>
#include <mach/vm_param.h>
#include <mach/vm_prot.h>
#include <machine/machine_routines.h>

#include <sys/file_internal.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dir.h>
#include <sys/namei.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/vnode_internal.h>
#include <sys/mount.h>
#include <sys/xattr.h>
#include <sys/trace.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/user.h>
#include <sys/syslog.h>
#include <sys/stat.h>
#include <sys/sysproto.h>
#include <sys/mman.h>
#include <sys/sysctl.h>
#include <sys/cprotect.h>
#include <sys/kpi_socket.h>
#include <sys/kas_info.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/random.h>
#include <sys/code_signing.h>
#if NECP
#include <net/necp.h>
#endif /* NECP */
#if SKYWALK
#include <skywalk/os_channel.h>
#endif /* SKYWALK */

#include <security/audit/audit.h>
#include <security/mac.h>
#include <bsm/audit_kevents.h>

#include <kern/kalloc.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_kern_xnu.h>
#include <vm/vm_pageout_xnu.h>

#include <mach/shared_region.h>
#include <vm/vm_shared_region_internal.h>

#include <vm/vm_dyld_pager_internal.h>
#include <vm/vm_protos_internal.h>
#if DEVELOPMENT || DEBUG
#include <vm/vm_compressor_info.h>         /* for c_segment_info */
#include <vm/vm_compressor_xnu.h>          /* for vm_compressor_serialize_segment_debug_info() */
#endif
#include <vm/vm_reclaim_xnu.h>

#include <sys/kern_memorystatus.h>
#include <sys/kern_memorystatus_freeze.h>
#include <sys/proc_internal.h>

#include <mach-o/fixup-chains.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <kern/bits.h>

#if CONFIG_CSR
#include <sys/csr.h>
#endif /* CONFIG_CSR */
#include <sys/trust_caches.h>
#include <libkern/amfi/amfi.h>
#include <IOKit/IOBSD.h>

#if VM_MAP_DEBUG_APPLE_PROTECT
SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

#if DEVELOPMENT || DEBUG

static int
sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	vm_offset_t     kaddr;
	kern_return_t   kr;
	int     error = 0;
	int     size = 0;

	error = sysctl_handle_int(oidp, &size, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);

	if (kr == KERN_SUCCESS) {
		kmem_free(kernel_map, kaddr, size);
	}

	return error;
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_alloc_contig, "I", "");
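
/*
 * Usage sketch (editorial, not from the original source): on DEVELOPMENT
 * or DEBUG kernels, writing a byte count to this node exercises the
 * contiguous-allocation path above and frees the memory right away on
 * success, e.g.
 *     sysctl -w vm.kmem_alloc_contig=1048576
 */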

extern int vm_region_footprint;
SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");

static int
sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	kmem_gobj_stats stats = kmem_get_gobj_stats();

	return SYSCTL_OUT(req, &stats, sizeof(stats));
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");

#endif /* DEVELOPMENT || DEBUG */

static int
sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;

	value = task_self_region_footprint();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}
	task_self_region_footprint_set(value);
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");

static int
sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;

	value = (1 << thread_self_region_page_shift());
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (value != 0 && value != 4096 && value != 16384) {
		return EINVAL;
	}

#if !__ARM_MIXED_PAGE_SIZE__
	if (value != vm_map_page_size(current_map())) {
		return EINVAL;
	}
#endif /* !__ARM_MIXED_PAGE_SIZE__ */

	thread_self_region_page_shift_set(bit_first(value));
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
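
/*
 * Note (editorial): per the handler above, the only accepted writes are
 * 0, 4096 and 16384, e.g.
 *     sysctl -w vm.self_region_page_size=16384
 * which selects the page size used when reporting this thread's regions.
 */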

static int
sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;
	kern_return_t kr;

	value = task_self_region_info_flags();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	kr = task_self_region_info_flags_set(value);
	if (kr != KERN_SUCCESS) {
		return EINVAL;
	}

	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");


#if DEVELOPMENT || DEBUG
extern int panic_on_unsigned_execute;
SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");

extern int vm_log_xnu_user_debug;
SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
#endif /* DEVELOPMENT || DEBUG */

extern int vm_log_map_delete_permanent_prot_none;
SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");

extern int cs_executable_create_upl;
extern int cs_executable_wire;
SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");

extern int apple_protect_pager_count;
extern int apple_protect_pager_count_mapped;
extern unsigned int apple_protect_pager_cache_limit;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");

#if DEVELOPMENT || DEBUG
extern int radar_20146450;
SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");

extern int macho_printf;
SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");

extern int apple_protect_pager_data_request_debug;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");

#if __arm64__
/* These are meant to support the page table accounting unit test. */
extern unsigned int arm_hardware_page_size;
extern unsigned int arm_pt_desc_size;
extern unsigned int arm_pt_root_size;
extern unsigned int inuse_user_tteroot_count;
extern unsigned int inuse_kernel_tteroot_count;
extern unsigned int inuse_user_ttepages_count;
extern unsigned int inuse_kernel_ttepages_count;
extern unsigned int inuse_user_ptepages_count;
extern unsigned int inuse_kernel_ptepages_count;
SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
#if !CONFIG_SPTM
extern unsigned int free_page_size_tt_count;
extern unsigned int free_tt_count;
SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
#endif
#if DEVELOPMENT || DEBUG
extern unsigned long pmap_asid_flushes;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
extern unsigned long pmap_asid_hits;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
extern unsigned long pmap_asid_misses;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
extern unsigned long pmap_speculation_restrictions;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
#endif
#endif /* __arm64__ */
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
#if VM_SCAN_FOR_SHADOW_CHAIN
static int vm_shadow_max_enabled = 0;    /* Disabled by default */
extern int proc_shadow_max(void);
static int
vm_shadow_max SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int value = 0;

	if (vm_shadow_max_enabled) {
		value = proc_shadow_max();
	}

	return SYSCTL_OUT(req, &value, sizeof(value));
}
SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_shadow_max, "I", "");

SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");

#endif /* VM_SCAN_FOR_SHADOW_CHAIN */

SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");

#if PAGE_SLEEP_WITH_INHERITOR
#if DEVELOPMENT || DEBUG
extern uint32_t page_worker_table_size;
SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
#endif /* DEVELOPMENT || DEBUG */
#endif /* PAGE_SLEEP_WITH_INHERITOR */

/*
 * Sysctls related to data/stack execution.  See osfmk/vm/vm_map.c
 */

#if DEVELOPMENT || DEBUG
extern int allow_stack_exec, allow_data_exec;

SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");

#endif /* DEVELOPMENT || DEBUG */

static const char *prot_values[] = {
	"none",
	"read-only",
	"write-only",
	"read-write",
	"execute-only",
	"read-execute",
	"write-execute",
	"read-write-execute"
};
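
/*
 * Note (editorial): prot_values[] is indexed by (prot & VM_PROT_ALL),
 * i.e. bit 0 is VM_PROT_READ, bit 1 is VM_PROT_WRITE, bit 2 is
 * VM_PROT_EXECUTE; index 5 (read | execute), for example, yields
 * "read-execute".
 */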

void
log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
{
	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
}

/*
 * shared_region_unnest_logging: level of logging of unnesting events
 * 0	- no logging
 * 1	- throttled logging of unexpected unnesting events (default)
 * 2	- unthrottled logging of unexpected unnesting events
 * 3+	- unthrottled logging of all unnesting events
 */
int shared_region_unnest_logging = 1;

SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_unnest_logging, 0, "");
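
/*
 * Example (editorial): to log every unnesting event without throttling
 * while debugging, set
 *     sysctl -w vm.shared_region_unnest_logging=3
 */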

int vm_shared_region_unnest_log_interval = 10;
int shared_region_unnest_log_count_threshold = 5;


#if XNU_TARGET_OS_OSX

#if defined (__x86_64__)
static int scdir_enforce = 1;
#else /* defined (__x86_64__) */
static int scdir_enforce = 0;   /* AOT caches live elsewhere */
#endif /* defined (__x86_64__) */

static char *scdir_path[] = {
	"/System/Library/dyld/",
	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
	"/System/Cryptexes/OS/System/Library/dyld",
	NULL
};

#else /* XNU_TARGET_OS_OSX */

static int scdir_enforce = 0;
static char *scdir_path[] = {
	"/System/Library/Caches/com.apple.dyld/",
	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	NULL
};

#endif /* XNU_TARGET_OS_OSX */

static char *driverkit_scdir_path[] = {
	"/System/DriverKit/System/Library/dyld/",
#if XNU_TARGET_OS_OSX
	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#else
	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#endif /* XNU_TARGET_OS_OSX */
	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
	NULL
};

#ifndef SECURE_KERNEL
static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
{
#if CONFIG_CSR
	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
		return EPERM;
	}
#endif /* CONFIG_CSR */
	return sysctl_handle_int(oidp, arg1, arg2, req);
}

SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
#endif

/* These log rate throttling state variables aren't thread safe, but
 * are sufficient unto the task.
 */
static int64_t last_unnest_log_time = 0;
static int shared_region_unnest_log_count = 0;

void
log_unnest_badness(
	vm_map_t        m,
	vm_map_offset_t s,
	vm_map_offset_t e,
	boolean_t       is_nested_map,
	vm_map_offset_t lowest_unnestable_addr)
{
	struct timeval  tv;

	if (shared_region_unnest_logging == 0) {
		return;
	}

	if (shared_region_unnest_logging <= 2 &&
	    is_nested_map &&
	    s >= lowest_unnestable_addr) {
		/*
		 * Unnesting of writable map entries is fine.
		 */
		return;
	}

	if (shared_region_unnest_logging <= 1) {
		microtime(&tv);
		if ((tv.tv_sec - last_unnest_log_time) <
		    vm_shared_region_unnest_log_interval) {
			if (shared_region_unnest_log_count++ >
			    shared_region_unnest_log_count_threshold) {
				return;
			}
		} else {
			last_unnest_log_time = tv.tv_sec;
			shared_region_unnest_log_count = 0;
		}
	}

	DTRACE_VM4(log_unnest_badness,
	    vm_map_t, m,
	    vm_map_offset_t, s,
	    vm_map_offset_t, e,
	    vm_map_offset_t, lowest_unnestable_addr);
	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
}

uint64_t
vm_purge_filebacked_pagers(void)
{
	uint64_t pages_purged;

	pages_purged = 0;
	pages_purged += apple_protect_pager_purge_all();
	pages_purged += shared_region_pager_purge_all();
	pages_purged += dyld_pager_purge_all();
#if DEVELOPMENT || DEBUG
	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
#endif /* DEVELOPMENT || DEBUG */
	return pages_purged;
}

int
useracc(
	user_addr_ut    addr_u,
	user_size_ut    len_u,
	int             prot)
{
	vm_map_t        map;
	vm_prot_t       vm_prot = VM_PROT_WRITE;

	map = current_map();

	if (prot == B_READ) {
		vm_prot = VM_PROT_READ;
	}

	return vm_map_check_protection(map, addr_u,
	           vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
	           VM_SANITIZE_CALLER_USERACC);
}

#if XNU_PLATFORM_MacOSX
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vslock_sanitize(
	vm_map_t                map,
	user_addr_ut            addr_u,
	user_size_ut            len_u,
	vm_sanitize_caller_t    vm_sanitize_caller,
	vm_map_offset_t        *start,
	vm_map_offset_t        *end,
	vm_map_size_t          *size)
{
	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
	           map,
	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
	           size);
}
#endif /* XNU_PLATFORM_MacOSX */

int
vslock(user_addr_ut addr, user_size_ut len)
{
	kern_return_t kret;

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows, for binary
	 * compatibility: return success for overflows without doing
	 * anything. For error compatibility, sanitization returns
	 * VM_ERR_RETURN_NOW (on macOS) for overflow errors, which
	 * vm_sanitize_get_kr() converts to KERN_SUCCESS.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(current_map(),
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_wire_kernel(current_map(), addr,
	    vm_sanitize_compute_ut_end(addr, len),
	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
	    VM_KERN_MEMORY_BSD,
	    FALSE);

	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}

int
vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
{
#if FIXME  /* [ */
	pmap_t          pmap;
	vm_page_t       pg;
	vm_map_offset_t vaddr;
	ppnum_t         paddr;
#endif  /* FIXME ] */
	kern_return_t   kret;
	vm_map_t        map;

	map = current_map();

#if FIXME  /* [ */
	if (dirtied) {
		pmap = get_task_pmap(current_task());
		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
		    vaddr += PAGE_SIZE) {
			paddr = pmap_find_phys(pmap, vaddr);
			pg = PHYS_TO_VM_PAGE(paddr);
			vm_page_set_modified(pg);
		}
	}
#endif  /* FIXME ] */
#ifdef  lint
	dirtied++;
#endif  /* lint */

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows, for binary
	 * compatibility: return success for overflows without doing
	 * anything. For error compatibility, sanitization returns
	 * VM_ERR_RETURN_NOW (on macOS) for overflow errors, which
	 * vm_sanitize_get_kr() converts to KERN_SUCCESS.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(map,
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSUNLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_unwire(map, addr,
	    vm_sanitize_compute_ut_end(addr, len), false);
	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}
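
/*
 * Typical usage sketch for the pair above (hypothetical caller; real
 * callers are kernel paths that need user pages to stay resident):
 *
 *     if (vslock(uaddr, ulen) == 0) {
 *             ...operate on the wired user pages...
 *             vsunlock(uaddr, ulen, 1);    (dirtied=1: pages were written)
 *     }
 */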

int
subyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
suibyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
fubyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &byte, sizeof(char))) {
		return -1;
	}
	return byte;
}

int
fuibyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &(byte), sizeof(char))) {
		return -1;
	}
	return byte;
}

int
suword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/* suiword and fuiword are the same as suword and fuword, respectively */

int
suiword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuiword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/*
 * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
 * fetching and setting of process-sized size_t and pointer values.
 */
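/*
 * Sketch of the dispatch below (editorial): for a 64-bit process,
 * sulong()/fulong() transfer a full 8-byte value; for a 32-bit process
 * they fall back to suiword()/fuiword(), which move only sizeof(int)
 * bytes.
 */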
int
sulong(user_addr_t addr, int64_t word)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (long)word);
	}
}

int64_t
fulong(user_addr_t addr)
{
	int64_t longword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
			return -1;
		}
		return longword;
	} else {
		return (int64_t)fuiword(addr);
	}
}

int
suulong(user_addr_t addr, uint64_t uword)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (uint32_t)uword);
	}
}

uint64_t
fuulong(user_addr_t addr)
{
	uint64_t ulongword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
			return -1ULL;
		}
		return ulongword;
	} else {
		return (uint64_t)fuiword(addr);
	}
}

int
swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
{
	return ENOTSUP;
}

#if defined(SECURE_KERNEL)
static int kern_secure_kernel = 1;
#else
static int kern_secure_kernel = 0;
#endif

SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_trace_level, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
    &shared_region_version, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_persistence, 0, "");
/*
 * shared_region_check_np:
 *
 * This system call is intended for dyld.
 *
 * dyld calls this when any process starts to see if the process's shared
 * region is already set up and ready to use.
 * This call returns the base address of the first mapping in the
 * process's shared region.
 * dyld will then check what's mapped at that address.
 *
 * If the shared region is empty, dyld will then attempt to map the shared
 * cache file in the shared region via the shared_region_map_np() system call.
 *
 * If something's already mapped in the shared region, dyld will check if it
 * matches the shared cache it would like to use for that process.
 * If it matches, everything's ready and the process can proceed and use the
 * shared region.
 * If it doesn't match, dyld will unmap the shared region and map the shared
 * cache into the process's address space via mmap().
 *
 * A NULL pointer argument can be used by dyld to indicate it has unmapped
 * the shared region. We will remove the shared_region reference from the task.
 *
 * ERROR VALUES
 * EINVAL	no shared region
 * ENOMEM	shared region is empty
 * EFAULT	bad address for "start_address"
 */
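/*
 * Usage sketch (hypothetical user-space caller; the wrapper name is
 * illustrative, dyld reaches this through its own syscall stub):
 *
 *     uint64_t base = 0;
 *     if (__shared_region_check_np(&base) == 0) {
 *             ...inspect what is mapped at "base"...
 *     }
 */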
int
shared_region_check_np(
	__unused struct proc                    *p,
	struct shared_region_check_np_args      *uap,
	__unused int                            *retvalp)
{
	vm_shared_region_t      shared_region;
	mach_vm_offset_t        start_address = 0;
	int                     error = 0;
	kern_return_t           kr;
	task_t                  task = current_task();

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address));

	/*
	 * Special value of start_address used to indicate that map_with_linking() should
	 * no longer be allowed in this process
	 */
	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
		p->p_disallow_map_with_linking = TRUE;
		return 0;
	}

	/* retrieve the current task's shared region */
	shared_region = vm_shared_region_get(task);
	if (shared_region != NULL) {
		/*
		 * A NULL argument is used by dyld to indicate the task
		 * has unmapped its shared region.
		 */
		if (uap->start_address == 0) {
			/* unmap it first */
			vm_shared_region_remove(task, shared_region);
			vm_shared_region_set(task, NULL);
		} else {
			/* retrieve address of its first mapping... */
			kr = vm_shared_region_start_address(shared_region, &start_address, task);
			if (kr != KERN_SUCCESS) {
				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
				    "check_np(0x%llx) "
				    "vm_shared_region_start_address() failed\n",
				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
				    proc_getpid(p), p->p_comm,
				    (uint64_t)uap->start_address));
				error = ENOMEM;
			} else {
#if __has_feature(ptrauth_calls)
				/*
				 * Remap any section of the shared library that
				 * has authenticated pointers into private memory.
				 */
				if (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS) {
					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
					    "check_np(0x%llx) "
					    "vm_shared_region_auth_remap() failed\n",
					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
					    proc_getpid(p), p->p_comm,
					    (uint64_t)uap->start_address));
					error = ENOMEM;
				}
#endif /* __has_feature(ptrauth_calls) */

				/* ... and give it to the caller */
				if (error == 0) {
					error = copyout(&start_address,
					    (user_addr_t) uap->start_address,
					    sizeof(start_address));
					if (error != 0) {
						SHARED_REGION_TRACE_ERROR(
							("shared_region: %p [%d(%s)] "
							"check_np(0x%llx) "
							"copyout(0x%llx) error %d\n",
							(void *)VM_KERNEL_ADDRPERM(current_thread()),
							proc_getpid(p), p->p_comm,
							(uint64_t)uap->start_address, (uint64_t)start_address,
							error));
					}
				}
			}
		}
		vm_shared_region_deallocate(shared_region);
	} else {
		/* no shared region ! */
		error = EINVAL;
	}

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address, (uint64_t)start_address, error));

	return error;
}


static int
shared_region_copyin(
	struct proc  *p,
	user_addr_t  user_addr,
	unsigned int count,
	unsigned int element_size,
	void         *kernel_data)
{
	int             error = 0;
	vm_size_t       size = count * element_size;

	error = copyin(user_addr, kernel_data, size);
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"copyin(0x%llx, %ld) failed (error=%d)\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm,
			(uint64_t)user_addr, (long)size, error));
	}
	return error;
}

/*
 * A reasonable upper limit to prevent overflow of allocation/copyin.
 */
#define _SR_FILE_MAPPINGS_MAX_FILES 256
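
/*
 * Editorial note: with the file count bounded by this limit, the
 * "count * element_size" computation in shared_region_copyin() and the
 * kalloc_type() in shared_region_map_and_slide_setup() stay far below
 * any vm_size_t overflow for the element sizes used here.
 */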

/* forward declaration */
__attribute__((noinline))
static void shared_region_map_and_slide_cleanup(
	struct proc              *p,
	uint32_t                 files_count,
	struct _sr_file_mappings *sr_file_mappings,
	struct vm_shared_region  *shared_region);

/*
 * Setup part of _shared_region_map_and_slide().
 * It had to be broken out of _shared_region_map_and_slide() to
 * prevent compiler inlining from blowing out the stack.
 */
__attribute__((noinline))
static int
shared_region_map_and_slide_setup(
	struct proc                         *p,
	uint32_t                            files_count,
	struct shared_file_np               *files,
	uint32_t                            mappings_count,
	struct shared_file_mapping_slide_np *mappings,
	struct _sr_file_mappings            **sr_file_mappings,
	struct vm_shared_region             **shared_region_ptr,
	struct vnode                        *rdir_vp)
{
	int                             error = 0;
	struct _sr_file_mappings        *srfmp;
	uint32_t                        mappings_next;
	struct vnode_attr               va;
	off_t                           fs;
#if CONFIG_MACF
	vm_prot_t                       maxprot = VM_PROT_ALL;
#endif
	uint32_t                        i;
	struct vm_shared_region         *shared_region = NULL;
	boolean_t                       is_driverkit = task_is_driver(current_task());

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> map\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm));

	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
		error = E2BIG;
		goto done;
	}
	if (files_count == 0) {
		error = EINVAL;
		goto done;
	}
	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
	    Z_WAITOK | Z_ZERO);
	if (*sr_file_mappings == NULL) {
		error = ENOMEM;
		goto done;
	}
	mappings_next = 0;
	for (i = 0; i < files_count; i++) {
		srfmp = &(*sr_file_mappings)[i];
		srfmp->fd = files[i].sf_fd;
		srfmp->mappings_count = files[i].sf_mappings_count;
		srfmp->mappings = &mappings[mappings_next];
		mappings_next += srfmp->mappings_count;
		if (mappings_next > mappings_count) {
			error = EINVAL;
			goto done;
		}
		srfmp->slide = files[i].sf_slide;
	}

	/* get the process's shared region (setup in vm_map_exec()) */
	shared_region = vm_shared_region_trim_and_get(current_task());
	*shared_region_ptr = shared_region;
	if (shared_region == NULL) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"no shared region\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		error = EINVAL;
		goto done;
	}

	/*
	 * Check that the shared region matches the current root
	 * directory of this process.  Deny the mapping to
	 * avoid tainting the shared region with something that
	 * doesn't quite belong in it.
	 */
	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
	if (sr_vnode != NULL ? rdir_vp != sr_vnode : rdir_vp != rootvnode) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: map(%p) root_dir mismatch\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread())));
		error = EPERM;
		goto done;
	}


	for (srfmp = &(*sr_file_mappings)[0];
	    srfmp < &(*sr_file_mappings)[files_count];
	    srfmp++) {
		if (srfmp->mappings_count == 0) {
			/* no mappings here... */
			continue;
		}

		/*
		 * A file descriptor of -1 is used to indicate that the data
		 * to be put in the shared region for this mapping comes directly
		 * from the process's address space. Ensure we have proper alignments.
		 */
		if (srfmp->fd == -1) {
			/* only allow one mapping per fd */
			if (srfmp->mappings_count > 1) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data >1 mapping\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm));
				error = EINVAL;
				goto done;
			}

			/*
			 * The destination address and size must be page aligned.
			 */
			struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
			mach_vm_address_t dest_addr = mapping->sms_address;
			mach_vm_size_t    map_size = mapping->sms_size;
			if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm, dest_addr));
				error = EINVAL;
				goto done;
			}
			if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm, map_size));
				error = EINVAL;
				goto done;
			}
			continue;
		}

		/* get file structure from file descriptor */
		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d lookup failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd, error));
			goto done;
		}

		/* we need at least read permission on the file */
		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d not readable\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd));
			error = EPERM;
			goto done;
		}

		/* get vnode from file structure */
		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d getwithref failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd, error));
			goto done;
		}
		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);

		/* make sure the vnode is a regular file */
		if (srfmp->vp->v_type != VREG) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"not a file (type=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name, srfmp->vp->v_type));
			error = EINVAL;
			goto done;
		}

#if CONFIG_MACF
		/*
		 * Pass in 0 for the offset argument because AMFI does not
		 * need the offset of the shared cache.
		 */
		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
		if (error) {
			goto done;
		}
#endif /* MAC */

#if XNU_TARGET_OS_OSX && defined(__arm64__)
		/*
		 * Check if the shared cache is in the trust cache;
		 * if so, we can skip the root ownership check.
		 */
#if DEVELOPMENT || DEBUG
		/*
		 * Skip both root ownership and trust cache check if
		 * enforcement is disabled.
		 */
		if (!cs_system_enforcement()) {
			goto after_root_check;
		}
#endif /* DEVELOPMENT || DEBUG */
		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
		if (blob == NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"missing CS blob\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}
		const uint8_t *cdhash = csblob_get_cdhash(blob);
		if (cdhash == NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"missing cdhash\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}

		bool in_trust_cache = false;
		TrustCacheQueryToken_t qt;
		if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
			TCType_t tc_type = kTCTypeInvalid;
			TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
			in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
			    (tc_type == kTCTypeCryptex1BootOS ||
			    tc_type == kTCTypeStatic ||
			    tc_type == kTCTypeEngineering));
		}
		if (!in_trust_cache) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"not in trust cache\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}
		goto after_root_check;
root_check:
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */

		/* The shared cache file must be owned by root */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"vnode_getattr(%p) failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				error));
			goto done;
		}
		if (va.va_uid != 0) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"owned by uid=%d instead of 0\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name, va.va_uid));
			error = EPERM;
			goto done;
		}

#if XNU_TARGET_OS_OSX && defined(__arm64__)
after_root_check:
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */

#if CONFIG_CSR
		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_flags);
			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
			if (error) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"vnode_getattr(%p) failed (error=%d)\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					error));
				goto done;
			}

			if (!(va.va_flags & SF_RESTRICTED)) {
				/*
				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
				 * the shared cache file is NOT SIP-protected, so reject the
				 * mapping request
				 */
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'), "
					"vnode is not SIP-protected. \n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name));
				error = EPERM;
				goto done;
			}
		}
#else /* CONFIG_CSR */

		/*
		 * Devices without SIP/ROSP need to make sure that the shared cache
		 * is either on the root volume or in the preboot cryptex volume.
		 */
		assert(rdir_vp != NULL);
		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
			vnode_t preboot_vp = NULL;
#if XNU_TARGET_OS_OSX
#define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
#else
#define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
#endif
			error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
			if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"not on process' root volume nor preboot volume\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name));
				error = EPERM;
				if (preboot_vp) {
					(void)vnode_put(preboot_vp);
				}
				goto done;
			} else if (preboot_vp) {
				(void)vnode_put(preboot_vp);
			}
		}
#endif /* CONFIG_CSR */

		if (scdir_enforce) {
			char **expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
			struct vnode *scdir_vp = NULL;
			for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
			    *expected_scdir_path != NULL;
			    expected_scdir_path++) {
				/* get vnode for expected_scdir_path */
				error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
				if (error) {
					SHARED_REGION_TRACE_ERROR(
						("shared_region: %p [%d(%s)]: "
						"vnode_lookup(%s) failed (error=%d)\n",
						(void *)VM_KERNEL_ADDRPERM(current_thread()),
						proc_getpid(p), p->p_comm,
						*expected_scdir_path, error));
					continue;
				}

				/* check if parent is scdir_vp */
				assert(scdir_vp != NULL);
				if (vnode_parent(srfmp->vp) == scdir_vp) {
					(void)vnode_put(scdir_vp);
					scdir_vp = NULL;
					goto scdir_ok;
				}
				(void)vnode_put(scdir_vp);
				scdir_vp = NULL;
			}
			/* nothing matches */
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"shared cache file not in expected directory\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			error = EPERM;
			goto done;
		}
scdir_ok:

		/* get vnode size */
		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"vnode_size(%p) failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
			goto done;
		}
		srfmp->file_size = fs;

		/* get the file's memory object handle */
		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"no memory object\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			error = EINVAL;
			goto done;
		}

		/* check that the mappings are properly covered by code signatures */
		if (!cs_system_enforcement()) {
			/* code signing is not enforced: no need to check */
		} else {
			for (i = 0; i < srfmp->mappings_count; i++) {
				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
					/* zero-filled mapping: not backed by the file */
					continue;
				}
				if (ubc_cs_is_range_codesigned(srfmp->vp,
				    srfmp->mappings[i].sms_file_offset,
				    srfmp->mappings[i].sms_size)) {
					/* this mapping is fully covered by code signatures */
					continue;
				}
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
					"is not code-signed\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name,
					i, srfmp->mappings_count,
					srfmp->mappings[i].sms_address,
					srfmp->mappings[i].sms_size,
					srfmp->mappings[i].sms_file_offset,
					srfmp->mappings[i].sms_max_prot,
					srfmp->mappings[i].sms_init_prot));
				error = EINVAL;
				goto done;
			}
		}
	}
done:
	if (error != 0) {
		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
		*sr_file_mappings = NULL;
		*shared_region_ptr = NULL;
	}
	return error;
}
1529 
1530 /*
1531  * shared_region_map_np()
1532  *
1533  * This system call is intended for dyld.
1534  *
1535  * dyld uses this to map a shared cache file into a shared region.
1536  * This is usually done only the first time a shared cache is needed.
1537  * Subsequent processes will just use the populated shared region without
1538  * requiring any further setup.
1539  */
1540 static int
_shared_region_map_and_slide(struct proc * p,uint32_t files_count,struct shared_file_np * files,uint32_t mappings_count,struct shared_file_mapping_slide_np * mappings)1541 _shared_region_map_and_slide(
1542 	struct proc                         *p,
1543 	uint32_t                            files_count,
1544 	struct shared_file_np               *files,
1545 	uint32_t                            mappings_count,
1546 	struct shared_file_mapping_slide_np *mappings)
1547 {
1548 	int                             error = 0;
1549 	kern_return_t                   kr = KERN_SUCCESS;
1550 	struct _sr_file_mappings        *sr_file_mappings = NULL;
1551 	struct vnode                    *rdir_vp = NULL;
1552 	struct vm_shared_region         *shared_region = NULL;
1553 
1554 	/*
1555 	 * Get a reference to the current proc's root dir.
1556 	 * Need this to prevent racing with chroot.
1557 	 */
1558 	proc_fdlock(p);
1559 	rdir_vp = p->p_fd.fd_rdir;
1560 	if (rdir_vp == NULL) {
1561 		rdir_vp = rootvnode;
1562 	}
1563 	assert(rdir_vp != NULL);
1564 	vnode_get(rdir_vp);
1565 	proc_fdunlock(p);
1566 
1567 	/*
1568 	 * Turn files, mappings into sr_file_mappings and other setup.
1569 	 */
1570 	error = shared_region_map_and_slide_setup(p, files_count,
1571 	    files, mappings_count, mappings,
1572 	    &sr_file_mappings, &shared_region, rdir_vp);
1573 	if (error != 0) {
1574 		vnode_put(rdir_vp);
1575 		return error;
1576 	}
1577 
1578 	/* map the file(s) into that shared region's submap */
1579 	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
1580 	if (kr != KERN_SUCCESS) {
1581 		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
1582 		    "vm_shared_region_map_file() failed kr=0x%x\n",
1583 		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1584 		    proc_getpid(p), p->p_comm, kr));
1585 	}
1586 
1587 	/* convert kern_return_t to errno */
1588 	switch (kr) {
1589 	case KERN_SUCCESS:
1590 		error = 0;
1591 		break;
1592 	case KERN_INVALID_ADDRESS:
1593 		error = EFAULT;
1594 		break;
1595 	case KERN_PROTECTION_FAILURE:
1596 		error = EPERM;
1597 		break;
1598 	case KERN_NO_SPACE:
1599 		error = ENOMEM;
1600 		break;
1601 	case KERN_FAILURE:
1602 	case KERN_INVALID_ARGUMENT:
1603 	default:
1604 		error = EINVAL;
1605 		break;
1606 	}
1607 
1608 	/*
1609 	 * Mark that this process is now using split libraries.
1610 	 */
1611 	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
1612 		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
1613 	}
1614 
1615 	vnode_put(rdir_vp);
1616 	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);
1617 
1618 	SHARED_REGION_TRACE_DEBUG(
1619 		("shared_region: %p [%d(%s)] <- map\n",
1620 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1621 		proc_getpid(p), p->p_comm));
1622 
1623 	return error;
1624 }
1625 
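/*
 * For reference, a hedged sketch (hypothetical values; the sf_* field names
 * for struct shared_file_np are an assumption, while the sms_* fields match
 * the code above) of the argument shapes dyld passes down here through
 * shared_region_map_and_slide_2_np(): one shared_file_np per cache file,
 * plus an array of shared_file_mapping_slide_np entries, one per segment.
 *
 *	struct shared_file_np file = {
 *		.sf_fd             = cache_fd,     // assumed: fd of the open shared cache
 *		.sf_mappings_count = 3,            // e.g. TEXT, DATA, LINKEDIT
 *		.sf_slide          = 0x4000000,    // max slide; kernel randomizes below it
 *	};
 *	struct shared_file_mapping_slide_np text_map = {
 *		.sms_address     = 0x180044000ULL, // illustrative only
 *		.sms_size        = 0x20000000ULL,
 *		.sms_file_offset = 0,
 *		.sms_max_prot    = VM_PROT_READ | VM_PROT_EXECUTE,
 *		.sms_init_prot   = VM_PROT_READ | VM_PROT_EXECUTE,
 *	};
 */
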
1626 /*
1627  * Clean up part of _shared_region_map_and_slide()
1628  * It had to be broken out of _shared_region_map_and_slide() to
1629  * prevent compiler inlining from blowing out the stack.
1630  */
1631 __attribute__((noinline))
1632 static void
1633 shared_region_map_and_slide_cleanup(
1634 	struct proc              *p,
1635 	uint32_t                 files_count,
1636 	struct _sr_file_mappings *sr_file_mappings,
1637 	struct vm_shared_region  *shared_region)
1638 {
1639 	struct _sr_file_mappings *srfmp;
1640 	struct vnode_attr        va;
1641 
1642 	if (sr_file_mappings != NULL) {
1643 		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1644 			if (srfmp->vp != NULL) {
1645 				vnode_lock_spin(srfmp->vp);
1646 				srfmp->vp->v_flag |= VSHARED_DYLD;
1647 				vnode_unlock(srfmp->vp);
1648 
1649 				/* update the vnode's access time */
1650 				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1651 					VATTR_INIT(&va);
1652 					nanotime(&va.va_access_time);
1653 					VATTR_SET_ACTIVE(&va, va_access_time);
1654 					vnode_setattr(srfmp->vp, &va, vfs_context_current());
1655 				}
1656 
1657 #if NAMEDSTREAMS
1658 				/*
1659 				 * If the shared cache is compressed, it may
1660 				 * have a namedstream vnode instantiated
1661 				 * for it. That namedstream vnode will also
1662 				 * have to be marked with VSHARED_DYLD.
1663 				 */
1664 				if (vnode_hasnamedstreams(srfmp->vp)) {
1665 					vnode_t svp;
1666 					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1667 					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
1668 						vnode_lock_spin(svp);
1669 						svp->v_flag |= VSHARED_DYLD;
1670 						vnode_unlock(svp);
1671 						vnode_put(svp);
1672 					}
1673 				}
1674 #endif /* NAMEDSTREAMS */
1675 				/*
1676 				 * release the vnode...
1677 				 * ubc_map() still holds it for us in the non-error case
1678 				 */
1679 				(void) vnode_put(srfmp->vp);
1680 				srfmp->vp = NULL;
1681 			}
1682 			if (srfmp->fp != NULL) {
1683 				/* release the file descriptor */
1684 				fp_drop(p, srfmp->fd, srfmp->fp, 0);
1685 				srfmp->fp = NULL;
1686 			}
1687 		}
1688 		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1689 	}
1690 
1691 	if (shared_region != NULL) {
1692 		vm_shared_region_deallocate(shared_region);
1693 	}
1694 }
1695 
1696 /*
1697  * For each file mapped, we may have mappings for:
1698  *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1699  * so let's round up to 8 mappings per file.
1700  */
1701 #define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to pass in */
1702 
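/*
 * Worked example (illustrative): a cache split across three files would
 * contribute 3 * 8 == 24 mapping structs toward the cap, which is always
 * _SR_FILE_MAPPINGS_MAX_FILES * 8 == SFM_MAX.
 */
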
1703 /*
1704  * This is the new interface for setting up shared region mappings.
1705  *
1706  * The slide used for shared regions setup using this interface is done differently
1707  * from the old interface. The slide value passed in the shared_files_np represents
1708  * a max value. The kernel will choose a random value based on that, then use it
1709  * for all shared regions.
1710  */
1711 #if defined (__x86_64__)
1712 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1713 #else
1714 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1715 #endif
1716 
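/*
 * Worked example (illustrative values): with a caller-supplied maximum of
 * max_slide == 0x1000000 and a random draw of 0x01234567, the code below
 * computes
 *
 *	(0x01234567 % 0x1000000) & SLIDE_AMOUNT_MASK
 *	    == 0x00234567 & ~SIXTEENK_PAGE_MASK	(on 16K-page targets)
 *	    == 0x00234000
 *
 * i.e. a slide strictly below the caller's maximum, rounded down to page
 * granularity so every mapping stays page-aligned after sliding.
 */
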
1717 static inline __result_use_check kern_return_t
1718 shared_region_map_and_slide_2_np_sanitize(
1719 	struct proc                         *p,
1720 	user_addr_t                         mappings_userspace_addr,
1721 	unsigned int                        count,
1722 	shared_file_mapping_slide_np_t      *mappings)
1723 {
1724 	kern_return_t kr;
1725 	vm_map_t map = current_map();
1726 	mach_vm_address_t addr, end;
1727 	mach_vm_offset_t offset, offset_end;
1728 	mach_vm_size_t size, offset_size;
1729 	user_addr_t slide_start, slide_end, slide_size;
1730 	vm_prot_t cur;
1731 	vm_prot_t max;
1732 
1733 	user_addr_t user_addr = mappings_userspace_addr;
1734 
1735 	for (size_t i = 0; i < count; i++) {
1736 		shared_file_mapping_slide_np_ut mapping_u;
1737 		/*
1738 		 * First we bring each mapping struct into our kernel stack to
1739 		 * avoid TOCTOU.
1740 		 */
1741 		kr = shared_region_copyin(
1742 			p,
1743 			user_addr,
1744 			1, // copy 1 element at a time
1745 			sizeof(shared_file_mapping_slide_np_ut),
1746 			&mapping_u);
1747 		if (__improbable(kr != KERN_SUCCESS)) {
1748 			return kr;
1749 		}
1750 
1751 		/*
1752 		 * Then, we sanitize the data on the kernel stack.
1753 		 */
1754 		kr = vm_sanitize_addr_size(
1755 			mapping_u.sms_address_u,
1756 			mapping_u.sms_size_u,
1757 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1758 			map,
1759 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1760 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
1761 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
1762 			&addr,
1763 			&end,
1764 			&size);
1765 		if (__improbable(kr != KERN_SUCCESS)) {
1766 			return kr;
1767 		}
1768 
1769 		kr = vm_sanitize_addr_size(
1770 			mapping_u.sms_file_offset_u,
1771 			mapping_u.sms_size_u,
1772 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1773 			PAGE_MASK,
1774 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1775 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1776 			&offset,
1777 			&offset_end,
1778 			&offset_size);
1779 		if (__improbable(kr != KERN_SUCCESS)) {
1780 			return kr;
1781 		}
1782 		if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
1783 			return KERN_INVALID_ARGUMENT;
1784 		}
1785 
1786 		/*
1787 		 * Unsafe access is immediately followed by wrap to
1788 		 * convert from addr to size.
1789 		 */
1790 		mach_vm_size_ut sms_slide_size_u =
1791 		    vm_sanitize_wrap_size(
1792 			VM_SANITIZE_UNSAFE_UNWRAP(
1793 				mapping_u.sms_slide_size_u));
1794 
1795 		kr = vm_sanitize_addr_size(
1796 			mapping_u.sms_slide_start_u,
1797 			sms_slide_size_u,
1798 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1799 			map,
1800 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1801 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1802 			&slide_start,
1803 			&slide_end,
1804 			&slide_size);
1805 		if (__improbable(kr != KERN_SUCCESS)) {
1806 			return kr;
1807 		}
1808 
1809 		kr = vm_sanitize_cur_and_max_prots(
1810 			mapping_u.sms_init_prot_u,
1811 			mapping_u.sms_max_prot_u,
1812 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1813 			map,
1814 			VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
1815 			&cur,
1816 			&max);
1817 		if (__improbable(kr != KERN_SUCCESS)) {
1818 			return kr;
1819 		}
1820 
1821 		/*
1822 		 * Finally, we move the data from the kernel stack to our
1823 		 * caller-allocated kernel heap buffer.
1824 		 */
1825 		mappings[i].sms_address = addr;
1826 		mappings[i].sms_size = size;
1827 		mappings[i].sms_file_offset = offset;
1828 		mappings[i].sms_slide_size = slide_size;
1829 		mappings[i].sms_slide_start = slide_start;
1830 		mappings[i].sms_max_prot = max;
1831 		mappings[i].sms_init_prot = cur;
1832 
1833 		if (__improbable(os_add_overflow(
1834 			    user_addr,
1835 			    sizeof(shared_file_mapping_slide_np_ut),
1836 			    &user_addr))) {
1837 			return KERN_INVALID_ARGUMENT;
1838 		}
1839 	}
1840 
1841 	return KERN_SUCCESS;
1842 }
1843 
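/*
 * The sanitizer above is an instance of the usual copy-then-validate
 * defense against TOCTOU races. A minimal generic sketch of the pattern
 * (hypothetical "struct desc" and validate()/use() helpers):
 *
 *	struct desc d;                       // private kernel-stack copy
 *	if (copyin(uaddr, &d, sizeof(d))) {  // snapshot userspace data once
 *		return EFAULT;
 *	}
 *	if (!validate(&d)) {                 // validate the private copy...
 *		return EINVAL;
 *	}
 *	use(&d);                             // ...and only ever use that copy,
 *	                                     // so later writes to uaddr are moot
 */
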
1844 int
1845 shared_region_map_and_slide_2_np(
1846 	struct proc                                  *p,
1847 	struct shared_region_map_and_slide_2_np_args *uap,
1848 	__unused int                                 *retvalp)
1849 {
1850 	unsigned int                  files_count;
1851 	struct shared_file_np         *shared_files = NULL;
1852 	unsigned int                  mappings_count;
1853 	struct shared_file_mapping_slide_np *mappings = NULL;
1854 	kern_return_t                 kr = KERN_SUCCESS;
1855 
1856 	files_count = uap->files_count;
1857 	mappings_count = uap->mappings_count;
1858 
1859 	if (files_count == 0) {
1860 		SHARED_REGION_TRACE_INFO(
1861 			("shared_region: %p [%d(%s)] map(): "
1862 			"no files\n",
1863 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1864 			proc_getpid(p), p->p_comm));
1865 		kr = 0; /* no files to map: we're done! */
1866 		goto done;
1867 	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1868 		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1869 		if (shared_files == NULL) {
1870 			kr = KERN_RESOURCE_SHORTAGE;
1871 			goto done;
1872 		}
1873 	} else {
1874 		SHARED_REGION_TRACE_ERROR(
1875 			("shared_region: %p [%d(%s)] map(): "
1876 			"too many files (%d) max %d\n",
1877 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1878 			proc_getpid(p), p->p_comm,
1879 			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1880 		kr = KERN_FAILURE;
1881 		goto done;
1882 	}
1883 
1884 	if (mappings_count == 0) {
1885 		SHARED_REGION_TRACE_INFO(
1886 			("shared_region: %p [%d(%s)] map(): "
1887 			"no mappings\n",
1888 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1889 			proc_getpid(p), p->p_comm));
1890 		kr = 0; /* no mappings: we're done! */
1891 		goto done;
1892 	} else if (mappings_count <= SFM_MAX) {
1893 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
1894 		if (mappings == NULL) {
1895 			kr = KERN_RESOURCE_SHORTAGE;
1896 			goto done;
1897 		}
1898 	} else {
1899 		SHARED_REGION_TRACE_ERROR(
1900 			("shared_region: %p [%d(%s)] map(): "
1901 			"too many mappings (%d) max %d\n",
1902 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1903 			proc_getpid(p), p->p_comm,
1904 			mappings_count, SFM_MAX));
1905 		kr = KERN_FAILURE;
1906 		goto done;
1907 	}
1908 
1909 	/*
1910 	 * struct shared_file_np does not have fields that are subject to
1911 	 * sanitization, so it is copied from userspace as-is.
1912 	 */
1913 	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
1914 	if (kr != KERN_SUCCESS) {
1915 		goto done;
1916 	}
1917 
1918 	kr = shared_region_map_and_slide_2_np_sanitize(
1919 		p,
1920 		uap->mappings_u,
1921 		mappings_count,
1922 		mappings);
1923 	if (__improbable(kr != KERN_SUCCESS)) {
1924 		kr = vm_sanitize_get_kr(kr);
1925 		goto done;
1926 	}
1927 
1928 	uint32_t max_slide = shared_files[0].sf_slide;
1929 	uint32_t random_val;
1930 	uint32_t slide_amount;
1931 
1932 	if (max_slide != 0) {
1933 		read_random(&random_val, sizeof random_val);
1934 		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
1935 	} else {
1936 		slide_amount = 0;
1937 	}
1938 #if DEVELOPMENT || DEBUG
1939 	extern bool bootarg_disable_aslr;
1940 	if (bootarg_disable_aslr) {
1941 		slide_amount = 0;
1942 	}
1943 #endif /* DEVELOPMENT || DEBUG */
1944 
1945 	/*
1946 	 * Fix up the mappings to reflect the desired slide.
1947 	 */
1948 	unsigned int f;
1949 	unsigned int m = 0;
1950 	unsigned int i;
1951 	for (f = 0; f < files_count; ++f) {
1952 		shared_files[f].sf_slide = slide_amount;
1953 		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
1954 			if (m >= mappings_count) {
1955 				SHARED_REGION_TRACE_ERROR(
1956 					("shared_region: %p [%d(%s)] map(): "
1957 					"mapping count argument was too small\n",
1958 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1959 					proc_getpid(p), p->p_comm));
1960 				kr = KERN_FAILURE;
1961 				goto done;
1962 			}
1963 			if (__improbable(
1964 				    os_add_overflow(
1965 					    mappings[m].sms_address,
1966 					    slide_amount,
1967 					    &mappings[m].sms_address))) {
1968 				kr = KERN_INVALID_ARGUMENT;
1969 				goto done;
1970 			}
1971 			if (mappings[m].sms_slide_size != 0) {
1972 				mach_vm_address_t discard;
1973 				/* Slide and check that new start/size pairs do not overflow. */
1974 				if (__improbable(
1975 					    os_add_overflow(
1976 						    mappings[m].sms_slide_start,
1977 						    slide_amount,
1978 						    &mappings[m].sms_slide_start) ||
1979 					    os_add_overflow(
1980 						    mappings[m].sms_slide_start,
1981 						    mappings[m].sms_slide_size,
1982 						    &discard))) {
1983 					kr = KERN_INVALID_ARGUMENT;
1984 					goto done;
1985 				}
1986 			}
1987 		}
1988 	}
1989 
1990 	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
1991 done:
1992 	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
1993 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
1994 	return kr;
1995 }
1996 
1997 /*
1998  * A syscall for dyld to use to map data pages that need load time relocation fixups.
1999  * The fixups are performed by a custom pager during page-in, so the pages still appear
2000  * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2001  * on demand later, all w/o using the compressor.
2002  *
2003  * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages while
2004  * running, they are COW'd as normal.
2005  */
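/*
 * A hedged sketch of the link_info blob layout, as inferred from the bounds
 * checks below (not a normative description): the blob begins with a
 * struct mwl_info_hdr, followed by the bind targets and the chained-start
 * info at header-relative offsets.
 *
 *	+----------------------------+  offset 0
 *	| struct mwl_info_hdr        |  mwli_version, mwli_page_size, ...
 *	+----------------------------+  mwli_binds_offset
 *	| bind pointers              |  mwli_binds_count * (4 or 8) bytes
 *	+----------------------------+  mwli_chains_offset
 *	| dyld_chained_starts_in_    |  mwli_chains_size bytes
 *	| image (+ per-seg starts)   |
 *	+----------------------------+  link_info_size
 */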
2006 int
2007 map_with_linking_np(
2008 	struct proc                     *p,
2009 	struct map_with_linking_np_args *uap,
2010 	__unused int                    *retvalp)
2011 {
2012 	uint32_t                        region_count;
2013 	uint32_t                        r;
2014 	struct mwl_region               *regions = NULL;
2015 	struct mwl_region               *rp;
2016 	uint32_t                        link_info_size;
2017 	void                            *link_info = NULL;      /* starts with a struct mwl_info_hdr */
2018 	struct mwl_info_hdr             *info_hdr = NULL;
2019 	uint64_t                        binds_size;
2020 	int                             fd;
2021 	struct fileproc                 *fp = NULL;
2022 	struct vnode                    *vp = NULL;
2023 	size_t                          file_size;
2024 	off_t                           fs;
2025 	struct vnode_attr               va;
2026 	memory_object_control_t         file_control = NULL;
2027 	int                             error;
2028 	kern_return_t                   kr = KERN_SUCCESS;
2029 
2030 	/*
2031 	 * Check if dyld has told us it finished with this call.
2032 	 */
2033 	if (p->p_disallow_map_with_linking) {
2034 		printf("%s: [%d(%s)]: map_with_linking() was disabled\n",
2035 		    __func__, proc_getpid(p), p->p_comm);
2036 		kr = KERN_FAILURE;
2037 		goto done;
2038 	}
2039 
2040 	/*
2041 	 * First we do some sanity checking on what dyld has passed us.
2042 	 */
2043 	region_count = uap->region_count;
2044 	link_info_size = uap->link_info_size;
2045 	if (region_count == 0) {
2046 		printf("%s: [%d(%s)]: region_count == 0\n",
2047 		    __func__, proc_getpid(p), p->p_comm);
2048 		kr = KERN_FAILURE;
2049 		goto done;
2050 	}
2051 	if (region_count > MWL_MAX_REGION_COUNT) {
2052 		printf("%s: [%d(%s)]: region_count too big %d\n",
2053 		    __func__, proc_getpid(p), p->p_comm, region_count);
2054 		kr = KERN_FAILURE;
2055 		goto done;
2056 	}
2057 
2058 	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2059 		printf("%s: [%d(%s)]: link_info_size too small\n",
2060 		    __func__, proc_getpid(p), p->p_comm);
2061 		kr = KERN_FAILURE;
2062 		goto done;
2063 	}
2064 	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2065 		printf("%s: [%d(%s)]: link_info_size too big %d\n",
2066 		    __func__, proc_getpid(p), p->p_comm, link_info_size);
2067 		kr = KERN_FAILURE;
2068 		goto done;
2069 	}
2070 
2071 	/*
2072 	 * Allocate and copyin the regions and link info
2073 	 */
2074 	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2075 	if (regions == NULL) {
2076 		printf("%s: [%d(%s)]: failed to allocate regions\n",
2077 		    __func__, proc_getpid(p), p->p_comm);
2078 		kr = KERN_RESOURCE_SHORTAGE;
2079 		goto done;
2080 	}
2081 	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2082 	if (kr != KERN_SUCCESS) {
2083 		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2084 		    __func__, proc_getpid(p), p->p_comm, kr);
2085 		goto done;
2086 	}
2087 
2088 	link_info = kalloc_data(link_info_size, Z_WAITOK);
2089 	if (link_info == NULL) {
2090 		printf("%s: [%d(%s)]: failed to allocate link_info\n",
2091 		    __func__, proc_getpid(p), p->p_comm);
2092 		kr = KERN_RESOURCE_SHORTAGE;
2093 		goto done;
2094 	}
2095 	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2096 	if (kr != KERN_SUCCESS) {
2097 		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2098 		    __func__, proc_getpid(p), p->p_comm, kr);
2099 		goto done;
2100 	}
2101 
2102 	/*
2103 	 * Do some verification of the data structures.
2104 	 */
2105 	info_hdr = (struct mwl_info_hdr *)link_info;
2106 	if (info_hdr->mwli_version != MWL_INFO_VERS) {
2107 		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2108 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2109 		kr = KERN_FAILURE;
2110 		goto done;
2111 	}
2112 
2113 	if (info_hdr->mwli_binds_offset > link_info_size) {
2114 		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2115 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2116 		kr = KERN_FAILURE;
2117 		goto done;
2118 	}
2119 
2120 	/* some older devices have s/w page size > h/w page size; no need to support them */
2121 	if (info_hdr->mwli_page_size != PAGE_SIZE) {
2122 		/* no printf, since this is expected on some devices */
2123 		kr = KERN_INVALID_ARGUMENT;
2124 		goto done;
2125 	}
2126 
2127 	binds_size = (uint64_t)info_hdr->mwli_binds_count *
2128 	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2129 	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2130 		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2131 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2132 		kr = KERN_FAILURE;
2133 		goto done;
2134 	}
2135 
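	/*
	 * Worked example (illustrative): 1,000 binds in a 64-bit pointer
	 * format occupy 1,000 * 8 == 8,000 bytes, which must fit in the
	 * blob's tail: mwli_binds_offset + 8,000 <= link_info_size.
	 * Computing binds_size in 64 bits and comparing it against
	 * (link_info_size - mwli_binds_offset), already known to be
	 * non-negative from the check above, avoids integer overflow
	 * along the way.
	 */
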
2136 	if (info_hdr->mwli_chains_offset > link_info_size) {
2137 		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2138 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2139 		kr = KERN_FAILURE;
2140 		goto done;
2141 	}
2142 
2143 
2144 	/*
2145 	 * Ensure the chained-starts info fits in the link info and make sure the
2146 	 * segment info offsets are within bounds.
2147 	 */
2148 	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2149 		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2150 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2151 		kr = KERN_FAILURE;
2152 		goto done;
2153 	}
2154 	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2155 		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2156 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2157 		kr = KERN_FAILURE;
2158 		goto done;
2159 	}
2160 
2161 	/* Note that more verification of offsets is done in the pager itself */
2162 
2163 	/*
2164 	 * Ensure we've only been given one FD and verify valid protections.
2165 	 */
2166 	fd = regions[0].mwlr_fd;
2167 	for (r = 0; r < region_count; ++r) {
2168 		if (regions[r].mwlr_fd != fd) {
2169 			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2170 			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2171 			kr = KERN_FAILURE;
2172 			goto done;
2173 		}
2174 
2175 		/*
2176 		 * Only allow data mappings and not zero fill. Permit TPRO
2177 		 * mappings only when VM_PROT_READ | VM_PROT_WRITE.
2178 		 */
2179 		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2180 			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2181 			    __func__, proc_getpid(p), p->p_comm);
2182 			kr = KERN_FAILURE;
2183 			goto done;
2184 		}
2185 		if (regions[r].mwlr_protections & VM_PROT_ZF) {
2186 			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2187 			    __func__, proc_getpid(p), p->p_comm, r);
2188 			kr = KERN_FAILURE;
2189 			goto done;
2190 		}
2191 		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2192 		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2193 			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2194 			    __func__, proc_getpid(p), p->p_comm, r);
2195 			kr = KERN_FAILURE;
2196 			goto done;
2197 		}
2198 	}
2199 
2200 
2201 	/* get file structure from file descriptor */
2202 	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2203 	if (error) {
2204 		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2205 		    __func__, proc_getpid(p), p->p_comm, error);
2206 		kr = KERN_FAILURE;
2207 		goto done;
2208 	}
2209 
2210 	/* We need at least read permission on the file */
2211 	if (!(fp->fp_glob->fg_flag & FREAD)) {
2212 		printf("%s: [%d(%s)]: not readable\n",
2213 		    __func__, proc_getpid(p), p->p_comm);
2214 		kr = KERN_FAILURE;
2215 		goto done;
2216 	}
2217 
2218 	/* Get the vnode from file structure */
2219 	vp = (struct vnode *)fp_get_data(fp);
2220 	error = vnode_getwithref(vp);
2221 	if (error) {
2222 		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2223 		    __func__, proc_getpid(p), p->p_comm, error);
2224 		kr = KERN_FAILURE;
2225 		vp = NULL; /* just to be sure */
2226 		goto done;
2227 	}
2228 
2229 	/* Make sure the vnode is a regular file */
2230 	if (vp->v_type != VREG) {
2231 		printf("%s: [%d(%s)]: vnode not VREG\n",
2232 		    __func__, proc_getpid(p), p->p_comm);
2233 		kr = KERN_FAILURE;
2234 		goto done;
2235 	}
2236 
2237 	/* get vnode size */
2238 	error = vnode_size(vp, &fs, vfs_context_current());
2239 	if (error) {
2240 		goto done;
2241 	}
2242 	file_size = fs;
2243 
2244 	/* get the file's memory object handle */
2245 	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2246 	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2247 		printf("%s: [%d(%s)]: no memory object\n",
2248 		    __func__, proc_getpid(p), p->p_comm);
2249 		kr = KERN_FAILURE;
2250 		goto done;
2251 	}
2252 
2253 	for (r = 0; r < region_count; ++r) {
2254 		rp = &regions[r];
2255 
2256 #if CONFIG_MACF
2257 		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2258 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2259 		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2260 		if (error) {
2261 			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2262 			    __func__, proc_getpid(p), p->p_comm, r, error);
2263 			kr = KERN_FAILURE;
2264 			goto done;
2265 		}
2266 #endif /* MAC */
2267 
2268 		/* check that the mappings are properly covered by code signatures */
2269 		if (cs_system_enforcement()) {
2270 			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2271 				printf("%s: [%d(%s)]: region %d, not code signed\n",
2272 				    __func__, proc_getpid(p), p->p_comm, r);
2273 				kr = KERN_FAILURE;
2274 				goto done;
2275 			}
2276 		}
2277 	}
2278 
2279 	/* update the vnode's access time */
2280 	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2281 		VATTR_INIT(&va);
2282 		nanotime(&va.va_access_time);
2283 		VATTR_SET_ACTIVE(&va, va_access_time);
2284 		vnode_setattr(vp, &va, vfs_context_current());
2285 	}
2286 
2287 	/* get the VM to do the work */
2288 	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2289 
2290 done:
2291 	if (fp != NULL) {
2292 		/* release the file descriptor */
2293 		fp_drop(p, fd, fp, 0);
2294 	}
2295 	if (vp != NULL) {
2296 		(void)vnode_put(vp);
2297 	}
2298 	if (regions != NULL) {
2299 		kfree_data(regions, region_count * sizeof(regions[0]));
2300 	}
2301 	/* link_info was set to NULL if the pager consumed it, i.e. if things worked */
2302 	if (link_info != NULL) {
2303 		kfree_data(link_info, link_info_size);
2304 	}
2305 
2306 	switch (kr) {
2307 	case KERN_SUCCESS:
2308 		return 0;
2309 	case KERN_RESOURCE_SHORTAGE:
2310 		return ENOMEM;
2311 	default:
2312 		return EINVAL;
2313 	}
2314 }
2315 
2316 #if DEBUG || DEVELOPMENT
2317 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2318     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2319 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2320     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2321 #endif /* DEBUG || DEVELOPMENT */
2322 
2323 /* sysctl overflow room */
2324 
2325 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2326     (int *) &page_size, 0, "vm page size");
2327 
2328 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2329  *       allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2330  *       reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2331 extern unsigned int     vm_page_free_target;
2332 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2333     &vm_page_free_target, 0, "Pageout daemon free target");
2334 
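/*
 * A minimal userspace sketch of that calculation (illustrative only),
 * reading this sysctl and vm.page_free_count (exported further below)
 * via sysctlbyname(3):
 *
 *	#include <sys/sysctl.h>
 *
 *	unsigned int free_count, free_target;
 *	size_t len = sizeof(unsigned int);
 *	if (sysctlbyname("vm.page_free_count", &free_count, &len, NULL, 0) == 0 &&
 *	    sysctlbyname("vm.vm_page_free_target", &free_target, &len, NULL, 0) == 0) {
 *		// pages available beyond the pageout daemon's free target
 *		long surplus = (long)free_count - (long)free_target;
 *	}
 */
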
2335 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2336     &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2337 
2338 static int
2339 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2340 {
2341 #pragma unused(oidp, arg1, arg2)
2342 	unsigned int page_free_wanted;
2343 
2344 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2345 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2346 }
2347 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2348     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2349     0, 0, vm_ctl_page_free_wanted, "I", "");
2350 
2351 extern unsigned int     vm_page_purgeable_count;
2352 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2353     &vm_page_purgeable_count, 0, "Purgeable page count");
2354 
2355 extern unsigned int     vm_page_purgeable_wired_count;
2356 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2357     &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2358 
2359 extern unsigned int vm_page_kern_lpage_count;
2360 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2361     &vm_page_kern_lpage_count, 0, "kernel used large pages");
2362 
2363 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2364 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2365 
2366 #if DEVELOPMENT || DEBUG
2367 #if __ARM_MIXED_PAGE_SIZE__
2368 static int vm_mixed_pagesize_supported = 1;
2369 #else
2370 static int vm_mixed_pagesize_supported = 0;
2371 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2372 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2373     &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2374 
2375 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2376     &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2377 
2378 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2379     &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2380 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2381     &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2382 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2383     &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2384 
2385 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2386     &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2387 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2388     &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2389 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2390     &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get re-DEactivated) */
2391 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2392     &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2393 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2394     &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2395 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2396     &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");         /* sum of next two */
2397 #endif /* DEVELOPMENT || DEBUG */
2398 
2399 extern int madvise_free_debug;
2400 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2401     &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2402 extern int madvise_free_debug_sometimes;
2403 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2404     &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
2405 
2406 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2407     &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2408 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2409     &vm_page_stats_reusable.reusable_pages_success, "");
2410 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2411     &vm_page_stats_reusable.reusable_pages_failure, "");
2412 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2413     &vm_page_stats_reusable.reusable_pages_shared, "");
2414 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2415     &vm_page_stats_reusable.all_reusable_calls, "");
2416 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2417     &vm_page_stats_reusable.partial_reusable_calls, "");
2418 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2419     &vm_page_stats_reusable.reuse_pages_success, "");
2420 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2421     &vm_page_stats_reusable.reuse_pages_failure, "");
2422 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2423     &vm_page_stats_reusable.all_reuse_calls, "");
2424 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2425     &vm_page_stats_reusable.partial_reuse_calls, "");
2426 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2427     &vm_page_stats_reusable.can_reuse_success, "");
2428 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2429     &vm_page_stats_reusable.can_reuse_failure, "");
2430 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2431     &vm_page_stats_reusable.reusable_reclaimed, "");
2432 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2433     &vm_page_stats_reusable.reusable_nonwritable, "");
2434 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2435     &vm_page_stats_reusable.reusable_shared, "");
2436 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2437     &vm_page_stats_reusable.free_shared, "");
2438 
2439 
2440 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2441 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2442 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2443 
2444 extern unsigned int vm_page_cleaned_count;
2445 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2446 
2447 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2448 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2449 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2450 
2451 /* pageout counts */
2452 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2453 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2454 
2455 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2456 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2457 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2458 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2459 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2460 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2461 
2462 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2463 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2464 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2465 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2466 extern unsigned int vm_page_realtime_count;
2467 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2468 extern int vm_pageout_protect_realtime;
2469 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2470 
2471 /* counts of pages prefaulted when entering a memory object */
2472 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2473 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2474 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2475 
2476 #if defined (__x86_64__)
2477 extern unsigned int vm_clump_promote_threshold;
2478 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2479 #if DEVELOPMENT || DEBUG
2480 extern unsigned long vm_clump_stats[];
2481 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2482 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2483 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2484 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2485 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2486 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2487 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2488 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2489 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2490 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2491 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2492 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2493 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2494 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2495 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2496 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2497 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2498 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2499 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2500 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2501 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2502 #endif  /* if DEVELOPMENT || DEBUG */
2503 #endif  /* #if defined (__x86_64__) */
2504 
2505 #if CONFIG_SECLUDED_MEMORY
2506 
2507 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2508 extern unsigned int vm_page_secluded_target;
2509 extern unsigned int vm_page_secluded_count;
2510 extern unsigned int vm_page_secluded_count_free;
2511 extern unsigned int vm_page_secluded_count_inuse;
2512 extern unsigned int vm_page_secluded_count_over_target;
2513 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2514 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2515 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2516 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2517 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2518 
2519 extern struct vm_page_secluded_data vm_page_secluded;
2520 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2521 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2522 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2523 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2524 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2525 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2526 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2527 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2528 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2529 
2530 #endif /* CONFIG_SECLUDED_MEMORY */
2531 
2532 #pragma mark Deferred Reclaim
2533 
2534 #if CONFIG_DEFERRED_RECLAIM
2535 
2536 #if DEVELOPMENT || DEBUG
2537 /*
2538  * VM reclaim testing
2539  */
2540 extern bool vm_deferred_reclamation_block_until_pid_has_been_reclaimed(pid_t pid);
2541 
2542 static int
2543 sysctl_vm_reclaim_drain_async_queue SYSCTL_HANDLER_ARGS
2544 {
2545 #pragma unused(arg1, arg2)
2546 	int error = EINVAL, pid = 0;
2547 	/*
2548 	 * Only act on write
2549 	 */
2550 	error = sysctl_handle_int(oidp, &pid, 0, req);
2551 	if (error || !req->newptr) {
2552 		return error;
2553 	}
2554 
2555 	bool success = vm_deferred_reclamation_block_until_pid_has_been_reclaimed(pid);
2556 	if (success) {
2557 		error = 0;
2558 	}
2559 
2560 	return error;
2561 }
2562 
2563 SYSCTL_PROC(_vm, OID_AUTO, reclaim_drain_async_queue,
2564     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2565     &sysctl_vm_reclaim_drain_async_queue, "I", "");
2566 
2567 static int
2568 sysctl_vm_reclaim_from_pid SYSCTL_HANDLER_ARGS
2569 {
2570 	int error = EINVAL;
2571 	pid_t pid;
2572 	error = sysctl_handle_int(oidp, &pid, 0, req);
2573 	/* Only reclaim on write */
2574 	if (error || !req->newptr) {
2575 		return error;
2576 	}
2577 	if (pid <= 0) {
2578 		return EINVAL;
2579 	}
2580 	proc_t p = proc_find(pid);
2581 	if (p == PROC_NULL) {
2582 		return ESRCH;
2583 	}
2584 	task_t t = proc_task(p);
2585 	if (t == TASK_NULL) {
2586 		proc_rele(p);
2587 		return ESRCH;
2588 	}
2589 	task_reference(t);
2590 	proc_rele(p);
2591 	vm_deferred_reclamation_reclaim_from_task_sync(t, UINT64_MAX);
2592 	task_deallocate(t);
2593 	return 0;
2594 }
2595 
2596 SYSCTL_PROC(_vm, OID_AUTO, reclaim_from_pid,
2597     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2598     &sysctl_vm_reclaim_from_pid, "I",
2599     "Drain the deferred reclamation buffer for a pid");
2600 
2601 static int
2602 sysctl_vm_reclaim_drain_all_buffers SYSCTL_HANDLER_ARGS
2603 {
2604 	/* Only reclaim on write */
2605 	if (!req->newptr) {
2606 		return EINVAL;
2607 	}
2608 	vm_deferred_reclamation_reclaim_all_memory(RECLAIM_OPTIONS_NONE);
2609 	return 0;
2610 }
2611 
2612 SYSCTL_PROC(_vm, OID_AUTO, reclaim_drain_all_buffers,
2613     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2614     &sysctl_vm_reclaim_drain_all_buffers, "I",
2615     "Drain all system-wide deferred reclamation buffers");
2616 
2617 
2618 extern uint64_t vm_reclaim_max_threshold;
2619 extern uint64_t vm_reclaim_trim_divisor;
2620 
2621 SYSCTL_ULONG(_vm, OID_AUTO, reclaim_max_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_max_threshold, "");
2622 SYSCTL_ULONG(_vm, OID_AUTO, reclaim_trim_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_trim_divisor, "");
2623 #endif /* DEVELOPMENT || DEBUG */
2624 
2625 #endif /* CONFIG_DEFERRED_RECLAIM */
2626 
2627 #include <kern/thread.h>
2628 #include <sys/user.h>
2629 
2630 void vm_pageout_io_throttle(void);
2631 
2632 void
2633 vm_pageout_io_throttle(void)
2634 {
2635 	struct uthread *uthread = current_uthread();
2636 
2637 	/*
2638 	 * If this thread is marked as a low priority I/O type
2639 	 * and the I/O we issued while in this cleaning operation
2640 	 * collided with normal I/O operations, we'll
2641 	 * delay in order to mitigate the impact of this
2642 	 * task on the normal operation of the system.
2643 	 */
2644 
2645 	if (uthread->uu_lowpri_window) {
2646 		throttle_lowpri_io(1);
2647 	}
2648 }
2649 
2650 int
2651 vm_pressure_monitor(
2652 	__unused struct proc *p,
2653 	struct vm_pressure_monitor_args *uap,
2654 	int *retval)
2655 {
2656 	kern_return_t   kr;
2657 	uint32_t        pages_reclaimed;
2658 	uint32_t        pages_wanted;
2659 
2660 	kr = mach_vm_pressure_monitor(
2661 		(boolean_t) uap->wait_for_pressure,
2662 		uap->nsecs_monitored,
2663 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2664 		&pages_wanted);
2665 
2666 	switch (kr) {
2667 	case KERN_SUCCESS:
2668 		break;
2669 	case KERN_ABORTED:
2670 		return EINTR;
2671 	default:
2672 		return EINVAL;
2673 	}
2674 
2675 	if (uap->pages_reclaimed) {
2676 		if (copyout((void *)&pages_reclaimed,
2677 		    uap->pages_reclaimed,
2678 		    sizeof(pages_reclaimed)) != 0) {
2679 			return EFAULT;
2680 		}
2681 	}
2682 
2683 	*retval = (int) pages_wanted;
2684 	return 0;
2685 }
2686 
2687 int
2688 kas_info(struct proc *p,
2689     struct kas_info_args *uap,
2690     int *retval __unused)
2691 {
2692 #ifndef CONFIG_KAS_INFO
2693 	(void)p;
2694 	(void)uap;
2695 	return ENOTSUP;
2696 #else /* CONFIG_KAS_INFO */
2697 	int                     selector = uap->selector;
2698 	user_addr_t     valuep = uap->value;
2699 	user_addr_t     sizep = uap->size;
2700 	user_size_t size, rsize;
2701 	int                     error;
2702 
2703 	if (!kauth_cred_issuser(kauth_cred_get())) {
2704 		return EPERM;
2705 	}
2706 
2707 #if CONFIG_MACF
2708 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
2709 	if (error) {
2710 		return error;
2711 	}
2712 #endif
2713 
2714 	if (IS_64BIT_PROCESS(p)) {
2715 		user64_size_t size64;
2716 		error = copyin(sizep, &size64, sizeof(size64));
2717 		size = (user_size_t)size64;
2718 	} else {
2719 		user32_size_t size32;
2720 		error = copyin(sizep, &size32, sizeof(size32));
2721 		size = (user_size_t)size32;
2722 	}
2723 	if (error) {
2724 		return error;
2725 	}
2726 
2727 	switch (selector) {
2728 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2729 	{
2730 		uint64_t slide = vm_kernel_slide;
2731 
2732 		if (sizeof(slide) != size) {
2733 			return EINVAL;
2734 		}
2735 
2736 		error = copyout(&slide, valuep, sizeof(slide));
2737 		if (error) {
2738 			return error;
2739 		}
2740 		rsize = size;
2741 	}
2742 	break;
2743 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2744 	{
2745 		uint32_t i;
2746 		kernel_mach_header_t *mh = &_mh_execute_header;
2747 		struct load_command *cmd;
2748 		cmd = (struct load_command*) &mh[1];
2749 		uint64_t *bases;
2750 		rsize = mh->ncmds * sizeof(uint64_t);
2751 
2752 		/*
2753 		 * Return the size if no data was passed
2754 		 */
2755 		if (valuep == 0) {
2756 			break;
2757 		}
2758 
2759 		if (rsize > size) {
2760 			return EINVAL;
2761 		}
2762 
2763 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
		if (bases == NULL) {
			return ENOMEM;
		}
2764 
2765 		for (i = 0; i < mh->ncmds; i++) {
2766 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
2767 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
2768 				bases[i] = (uint64_t)sg->vmaddr;
2769 			}
2770 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
2771 		}
2772 
2773 		error = copyout(bases, valuep, rsize);
2774 
2775 		kfree_data(bases, rsize);
2776 
2777 		if (error) {
2778 			return error;
2779 		}
2780 	}
2781 	break;
2782 	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
2783 	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
2784 	{
2785 #if CONFIG_SPTM
2786 		const uint64_t slide =
2787 		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
2788 #else
2789 		const uint64_t slide = 0;
2790 #endif
2791 
2792 		if (sizeof(slide) != size) {
2793 			return EINVAL;
2794 		}
2795 
2796 		error = copyout(&slide, valuep, sizeof(slide));
2797 		if (error) {
2798 			return error;
2799 		}
2800 		rsize = size;
2801 	}
2802 	break;
2803 	default:
2804 		return EINVAL;
2805 	}
2806 
2807 	if (IS_64BIT_PROCESS(p)) {
2808 		user64_size_t size64 = (user64_size_t)rsize;
2809 		error = copyout(&size64, sizep, sizeof(size64));
2810 	} else {
2811 		user32_size_t size32 = (user32_size_t)rsize;
2812 		error = copyout(&size32, sizep, sizeof(size32));
2813 	}
2814 
2815 	return error;
2816 #endif /* CONFIG_KAS_INFO */
2817 }
2818 
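/*
 * A minimal userspace sketch (root only; the wrapper and selectors are
 * declared in <sys/kas_info.h>) querying the kernel text slide:
 *
 *	#include <sys/kas_info.h>
 *
 *	uint64_t slide = 0;
 *	size_t size = sizeof(slide);
 *	if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) == 0) {
 *		// slide == vm_kernel_slide
 *	}
 *
 * For KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR, passing value == NULL
 * first returns the required buffer size in *size, as implemented above.
 */
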
2819 #pragma clang diagnostic push
2820 #pragma clang diagnostic ignored "-Wcast-qual"
2821 #pragma clang diagnostic ignored "-Wunused-function"
2822 
2823 static void
2824 asserts()
2825 {
2826 	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
2827 	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
2828 }
2829 
2830 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
2831 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
2832 #pragma clang diagnostic pop
2833 
2834 extern uint32_t vm_page_pages;
2835 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
2836 
2837 extern uint32_t vm_page_busy_absent_skipped;
2838 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
2839 
2840 extern uint32_t vm_page_upl_tainted;
2841 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
2842 
2843 extern uint32_t vm_page_iopl_tainted;
2844 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
2845 
2846 #if __arm64__ && (DEVELOPMENT || DEBUG)
2847 extern int vm_footprint_suspend_allowed;
2848 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
2849 
2850 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
2851 static int
2852 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
2853 {
2854 #pragma unused(oidp, arg1, arg2)
2855 	int error = 0;
2856 	int new_value;
2857 
2858 	if (req->newptr == USER_ADDR_NULL) {
2859 		return 0;
2860 	}
2861 	error = SYSCTL_IN(req, &new_value, sizeof(int));
2862 	if (error) {
2863 		return error;
2864 	}
2865 	if (!vm_footprint_suspend_allowed) {
2866 		if (new_value != 0) {
2867 			/* suspends are not allowed... */
2868 			return 0;
2869 		}
2870 		/* ... but let resumes proceed */
2871 	}
2872 	DTRACE_VM2(footprint_suspend,
2873 	    vm_map_t, current_map(),
2874 	    int, new_value);
2875 
2876 	pmap_footprint_suspend(current_map(), new_value);
2877 
2878 	return 0;
2879 }
2880 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
2881     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
2882     0, 0, &sysctl_vm_footprint_suspend, "I", "");
2883 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
2884 
2885 extern uint64_t vm_map_corpse_footprint_count;
2886 extern uint64_t vm_map_corpse_footprint_size_avg;
2887 extern uint64_t vm_map_corpse_footprint_size_max;
2888 extern uint64_t vm_map_corpse_footprint_full;
2889 extern uint64_t vm_map_corpse_footprint_no_buf;
2890 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
2891     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
2892 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
2893     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
2894 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
2895     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
2896 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
2897     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
2898 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
2899     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
2900 
2901 #if CODE_SIGNING_MONITOR
2902 extern uint64_t vm_cs_defer_to_csm;
2903 extern uint64_t vm_cs_defer_to_csm_not;
2904 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
2905     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
2906 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
2907     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
2908 #endif /* CODE_SIGNING_MONITOR */
2909 
2910 extern uint64_t shared_region_pager_copied;
2911 extern uint64_t shared_region_pager_slid;
2912 extern uint64_t shared_region_pager_slid_error;
2913 extern uint64_t shared_region_pager_reclaimed;
2914 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
2915     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
2916 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
2917     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
2918 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
2919     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
2920 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
2921     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
2922 extern int shared_region_destroy_delay;
2923 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
2924     CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
2925 
2926 #if MACH_ASSERT
2927 extern int pmap_ledgers_panic_leeway;
2928 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
2929 #endif /* MACH_ASSERT */
2930 
2931 
2932 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
2933 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
2934 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
2935 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
2936 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
2937 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
2938 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
2939 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
2940 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
2941 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
2942 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
2943 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
2944 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
2945 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
2946     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
2947 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
2948     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
2949 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
2950     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
2951 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
2952     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
2953 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
2954     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
2955 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
2956     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
2957 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
2958     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
2959 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
2960     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
2961 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
2962     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
2963 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
2964     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
2965 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
2966     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
2967 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
2968     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
2969 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
2970     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
2971 
2972 extern int vm_protect_privileged_from_untrusted;
2973 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
2974     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
2975 extern uint64_t vm_copied_on_read;
2976 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
2977     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
2978 
2979 extern int vm_shared_region_count;
2980 extern int vm_shared_region_peak;
2981 SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
2982     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
2983 SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
2984     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
2985 #if DEVELOPMENT || DEBUG
2986 extern unsigned int shared_region_pagers_resident_count;
2987 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
2988     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
2989 extern unsigned int shared_region_pagers_resident_peak;
2990 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
2991     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
2992 extern int shared_region_pager_count;
2993 SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
2994     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
2995 #if __has_feature(ptrauth_calls)
2996 extern int shared_region_key_count;
2997 SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
2998     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
2999 extern int vm_shared_region_reslide_count;
3000 SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
3001     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
3002 #endif /* __has_feature(ptrauth_calls) */
3003 #endif /* DEVELOPMENT || DEBUG */
3004 
3005 #if MACH_ASSERT
3006 extern int debug4k_filter;
3007 SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
3008 extern int debug4k_panic_on_terminate;
3009 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
3010 extern int debug4k_panic_on_exception;
3011 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
3012 extern int debug4k_panic_on_misaligned_sharing;
3013 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
3014 #endif /* MACH_ASSERT */
3015 
3016 extern uint64_t vm_map_set_size_limit_count;
3017 extern uint64_t vm_map_set_data_limit_count;
3018 extern uint64_t vm_map_enter_RLIMIT_AS_count;
3019 extern uint64_t vm_map_enter_RLIMIT_DATA_count;
3020 SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
3021 SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
3022 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
3023 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
3024 
3025 extern uint64_t vm_fault_resilient_media_initiate;
3026 extern uint64_t vm_fault_resilient_media_retry;
3027 extern uint64_t vm_fault_resilient_media_proceed;
3028 extern uint64_t vm_fault_resilient_media_release;
3029 extern uint64_t vm_fault_resilient_media_abort1;
3030 extern uint64_t vm_fault_resilient_media_abort2;
3031 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
3032 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
3033 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
3034 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
3035 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
3036 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
3037 #if MACH_ASSERT
3038 extern int vm_fault_resilient_media_inject_error1_rate;
3039 extern int vm_fault_resilient_media_inject_error1;
3040 extern int vm_fault_resilient_media_inject_error2_rate;
3041 extern int vm_fault_resilient_media_inject_error2;
3042 extern int vm_fault_resilient_media_inject_error3_rate;
3043 extern int vm_fault_resilient_media_inject_error3;
3044 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
3045 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
3046 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
3047 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
3048 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
3049 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
3050 #endif /* MACH_ASSERT */
3051 
3052 extern uint64_t pmap_query_page_info_retries;
3053 SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
3054 
3055 /*
3056  * A sysctl which causes all existing shared regions to become stale. They
3057  * will no longer be used by anything new and will be torn down as soon as
3058  * the last existing user exits. A write of non-zero value causes that to happen.
3059  * This should only be used by launchd, so we check that this is initproc.
3060  */
3061 static int
3062 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3063 {
3064 	unsigned int value = 0;
3065 	int changed = 0;
3066 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3067 	if (error || !changed) {
3068 		return error;
3069 	}
3070 	if (current_proc() != initproc) {
3071 		return EPERM;
3072 	}
3073 
3074 	vm_shared_region_pivot();
3075 
3076 	return 0;
3077 }
3078 
3079 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
3080     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
3081     0, 0, shared_region_pivot, "I", "");
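
/*
 * Hedged usage sketch (userspace, not part of this file): the pivot is
 * triggered by writing any non-zero int; callers other than launchd
 * (initproc) get EPERM.  Assumes the OID resolves by name as
 * "vm.shared_region_pivot".
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	pivot_shared_regions(void)
 *	{
 *		int one = 1;
 *
 *		// Write-only: supply a new value, read nothing back.
 *		if (sysctlbyname("vm.shared_region_pivot", NULL, NULL,
 *		    &one, sizeof(one)) == -1) {
 *			perror("vm.shared_region_pivot");	// EPERM unless initproc
 *			return -1;
 *		}
 *		return 0;
 *	}
 */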
3082 
3083 extern uint64_t vm_object_shadow_forced;
3084 extern uint64_t vm_object_shadow_skipped;
3085 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
3086     &vm_object_shadow_forced, "");
3087 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
3088     &vm_object_shadow_skipped, "");
3089 
3090 SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
3091     &vmtc_total, 0, "total text page corruptions detected");
3092 
3093 
3094 #if DEBUG || DEVELOPMENT
3095 /*
3096  * A sysctl that can be used to corrupt a text page with an illegal instruction.
3097  * Used for testing text page self healing.
3098  */
3099 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
3100 static int
3101 corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3102 {
3103 	uint64_t value = 0;
3104 	int error = sysctl_handle_quad(oidp, &value, 0, req);
3105 	if (error || !req->newptr) {
3106 		return error;
3107 	}
3108 
3109 	if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
3110 		return 0;
3111 	} else {
3112 		return EINVAL;
3113 	}
3114 }
3115 
3116 SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
3117     CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3118     0, 0, corrupt_text_addr, "-", "");
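
/*
 * Hedged usage sketch (userspace, DEBUG/DEVELOPMENT kernels only): the
 * handler takes a 64-bit code address as its new value and plants an
 * illegal instruction there, so the text-page self-healing path can be
 * exercised.  Assumes the OID resolves as "vm.corrupt_text_addr".
 *
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	corrupt_text(uint64_t addr)
 *	{
 *		// CTLTYPE_QUAD: the new value must be exactly 8 bytes.
 *		if (sysctlbyname("vm.corrupt_text_addr", NULL, NULL,
 *		    &addr, sizeof(addr)) == -1) {
 *			perror("vm.corrupt_text_addr");	// EINVAL on a bad address
 *			return -1;
 *		}
 *		return 0;
 *	}
 */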
3119 #endif /* DEBUG || DEVELOPMENT */
3120 
3121 #if CONFIG_MAP_RANGES
3122 /*
3123  * vm.malloc_ranges
3124  *
3125  * space-separated list of <left:right> hexadecimal addresses.
3126  */
3127 static int
3128 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
3129 {
3130 	vm_map_t map = current_map();
3131 	struct mach_vm_range r1, r2;
3132 	char str[20 * 4];
3133 	int len;
3134 	mach_vm_offset_t right_hole_max;
3135 
3136 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
3137 		return ENOENT;
3138 	}
3139 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
3140 		return ENOENT;
3141 	}
3142 
3143 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
3144 	right_hole_max = MACH_VM_JUMBO_ADDRESS;
3145 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
3146 	right_hole_max = get_map_max(map);
3147 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
3148 
3149 	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
3150 	    r1.max_address, r2.min_address,
3151 	    r2.max_address, right_hole_max);
3152 
3153 	return SYSCTL_OUT(req, str, len);
3154 }
3155 
3156 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
3157     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3158     0, 0, &vm_map_malloc_ranges, "A", "");
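
/*
 * Hedged usage sketch (userspace, CONFIG_MAP_RANGES kernels only): the
 * handler above emits the two holes as "0x<left>:0x<right> 0x<left>:0x<right>"
 * without a trailing NUL, so a reader terminates the buffer itself and can
 * pull the four boundaries out with sscanf().
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	print_malloc_ranges(void)
 *	{
 *		char str[128];
 *		size_t len = sizeof(str) - 1;
 *		unsigned long long lo1, hi1, lo2, hi2;
 *
 *		if (sysctlbyname("vm.malloc_ranges", str, &len, NULL, 0) == -1) {
 *			return -1;	// ENOENT if the map has no ranges
 *		}
 *		str[len] = '\0';	// SYSCTL_OUT copied raw bytes, no NUL
 *		if (sscanf(str, "0x%llx:0x%llx 0x%llx:0x%llx",
 *		    &lo1, &hi1, &lo2, &hi2) != 4) {
 *			return -1;
 *		}
 *		printf("holes: [0x%llx,0x%llx) [0x%llx,0x%llx)\n", lo1, hi1, lo2, hi2);
 *		return 0;
 *	}
 */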
3159 
3160 #if DEBUG || DEVELOPMENT
3161 static int
3162 vm_map_user_range_default SYSCTL_HANDLER_ARGS
3163 {
3164 #pragma unused(arg1, arg2, oidp)
3165 	struct mach_vm_range range;
3166 
3167 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
3168 	    != KERN_SUCCESS) {
3169 		return EINVAL;
3170 	}
3171 
3172 	return SYSCTL_OUT(req, &range, sizeof(range));
3173 }
3174 
3175 static int
3176 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3177 {
3178 #pragma unused(arg1, arg2, oidp)
3179 	struct mach_vm_range range;
3180 
3181 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3182 	    != KERN_SUCCESS) {
3183 		return EINVAL;
3184 	}
3185 
3186 	return SYSCTL_OUT(req, &range, sizeof(range));
3187 }
3188 
3189 static int
3190 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3191 {
3192 #pragma unused(arg1, arg2, oidp)
3193 	struct mach_vm_range range;
3194 
3195 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3196 	    != KERN_SUCCESS) {
3197 		return EINVAL;
3198 	}
3199 
3200 	return SYSCTL_OUT(req, &range, sizeof(range));
3201 }
3202 
3203 /*
3204  * A sysctl that can be used to return ranges for the current VM map.
3205  * Used for testing VM ranges.
3206  */
3207 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3208     0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
3209 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3210     0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
3211 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3212     0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
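
/*
 * Hedged usage sketch (userspace, DEBUG/DEVELOPMENT kernels only): each of
 * the three sysctls above copies out one struct mach_vm_range for the
 * calling process's own map.  The record is assumed to be two 64-bit
 * addresses, min then max, mirrored locally here rather than pulled from
 * a kernel header.
 *
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	struct my_vm_range {	// assumed layout of struct mach_vm_range
 *		uint64_t min_address;
 *		uint64_t max_address;
 *	};
 *
 *	int
 *	print_heap_range(void)
 *	{
 *		struct my_vm_range r;
 *		size_t len = sizeof(r);
 *
 *		if (sysctlbyname("vm.vm_map_user_range_heap", &r, &len,
 *		    NULL, 0) == -1 || len != sizeof(r)) {
 *			return -1;	// EINVAL if the map has no ranges
 *		}
 *		printf("heap: 0x%llx-0x%llx\n", r.min_address, r.max_address);
 *		return 0;
 *	}
 */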
3213 
3214 #endif /* DEBUG || DEVELOPMENT */
3215 #endif /* CONFIG_MAP_RANGES */
3216 
3220 extern uint64_t vm_map_range_overflows_count;
3221 SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
3222 extern boolean_t vm_map_range_overflows_log;
3223 SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3224 
3225 extern uint64_t c_seg_filled_no_contention;
3226 extern uint64_t c_seg_filled_contention;
3227 extern clock_sec_t c_seg_filled_contention_sec_max;
3228 extern clock_nsec_t c_seg_filled_contention_nsec_max;
3229 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
3230 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
3231 SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
3232 SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
3233 #if (XNU_TARGET_OS_OSX && __arm64__)
3234 extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
3235 extern int c_process_major_yield_after; /* yield after moving ? segments */
3236 extern uint64_t c_process_major_reports;
3237 extern clock_sec_t c_process_major_max_sec;
3238 extern clock_nsec_t c_process_major_max_nsec;
3239 extern uint32_t c_process_major_peak_segcount;
3240 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
3241 SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
3242 SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
3243 SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
3244 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
3245 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
3246 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3247 
3248 #if DEVELOPMENT || DEBUG
3249 extern int panic_object_not_alive;
3250 SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
3251 #endif /* DEVELOPMENT || DEBUG */
3252 
3253 #if FBDP_DEBUG_OBJECT_NO_PAGER
3254 extern int fbdp_no_panic;
3255 SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
3256 #endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
3257 
3258 
3259 #if DEVELOPMENT || DEBUG
3260 
3261 
3262 /* The largest possible single segment plus its slots is (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)), so this should be enough. */
3263 #define SYSCTL_SEG_BUF_SIZE (8 * 1024)
3264 
3265 extern uint32_t c_segments_available;
3266 
3267 struct sysctl_buf_header {
3268 	uint32_t magic;
3269 } __attribute__((packed));
3270 
3271 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3272  * Instead of doing everything here, it calls a serialization helper in vm_compressor.c. */
3273 static int
3274 sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3275 {
3276 	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3277 	if (!buf) {
3278 		return ENOMEM;
3279 	}
3280 	size_t offset = 0;
3281 	int error = 0;
3282 	uint32_t segno = 0;
3283 	/* 4-byte header identifying the version of the data's formatting.
3284 	 * It should be bumped whenever c_segment_info or c_slot_info changes. */
3285 	((struct sysctl_buf_header*)buf)->magic = VM_C_SEGMENT_INFO_MAGIC;
3286 	offset += sizeof(uint32_t);
3287 
3288 	while (segno < c_segments_available) {
3289 		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3290 		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz);
3291 		if (kr == KERN_NO_SPACE) {
3292 			/* failed to add another segment, push the current buffer out and try again */
3293 			if (offset == 0) {
3294 				error = EINVAL; /* the segment doesn't fit even in an empty buffer; shouldn't really happen */
3295 				goto out;
3296 			}
3297 			/* write out chunk */
3298 			error = SYSCTL_OUT(req, buf, offset);
3299 			if (error) {
3300 				goto out;
3301 			}
3302 			offset = 0;
3303 			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3304 			/* don't increment segno, need to try again saving the current one */
3305 		} else if (kr != KERN_SUCCESS) {
3306 			error = EINVAL;
3307 			goto out;
3308 		} else {
3309 			offset += left_sz;
3310 			++segno;
3311 		}
3312 	}
3313 
3314 	if (offset > 0) { /* write last chunk */
3315 		error = SYSCTL_OUT(req, buf, offset);
3316 	}
3317 
3318 out:
3319 	kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
3320 	return error;
3321 }
3322 
3323 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
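
/*
 * Hedged usage sketch (userspace, DEBUG/DEVELOPMENT kernels only): the
 * handler streams the 4-byte magic followed by back-to-back serialized
 * segments, so a reader hands in one large buffer and validates the
 * leading magic before decoding.  The buffer size below is an arbitrary
 * guess, and VM_C_SEGMENT_INFO_MAGIC's value must come from the kernel
 * headers.
 *
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *	#include <stdlib.h>
 *	#include <string.h>
 *
 *	int
 *	read_compressor_segments(uint32_t expected_magic)
 *	{
 *		size_t len = 16 * 1024 * 1024;	// guess; retry larger on failure
 *		char *buf = malloc(len);
 *		uint32_t magic;
 *
 *		if (buf == NULL) {
 *			return -1;
 *		}
 *		if (sysctlbyname("vm.compressor_segments", buf, &len,
 *		    NULL, 0) == -1 || len < sizeof(magic)) {
 *			free(buf);
 *			return -1;
 *		}
 *		memcpy(&magic, buf, sizeof(magic));	// header written first
 *		// ... decode "len - 4" bytes of segment records here ...
 *		free(buf);
 *		return magic == expected_magic ? 0 : -1;
 *	}
 */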
3324 
3325 
3326 extern uint32_t vm_compressor_fragmentation_level(void);
3327 
3328 static int
3329 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3330 {
3331 	uint32_t value = vm_compressor_fragmentation_level();
3332 	return SYSCTL_OUT(req, &value, sizeof(value));
3333 }
3334 
3335 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3336 
3337 
3338 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3339 
3340 
3341 /* This sysctl iterates over all the entries of the vm_map of a given process and writes some info about the vm_object pointed to by each entry.
3342  * This can be used to map where all of a process's pages live in the compressor.
3343  */
3344 static int
3345 sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3346 {
3347 	int error = 0;
3348 	char *buf = NULL;
3349 	proc_t p = PROC_NULL;
3350 	task_t task = TASK_NULL;
3351 	vm_map_t map = VM_MAP_NULL;
3352 	__block size_t offset = 0;
3353 
3354 	/* go from pid to proc to task to vm_map. see sysctl_procargsx() for another example of this progression */
3355 	int *name = arg1;
3356 	int namelen = arg2;
3357 	if (namelen < 1) {
3358 		return EINVAL;
3359 	}
3360 	int pid = name[0];
3361 	p = proc_find(pid);  /* this increments a reference to the proc */
3362 	if (p == PROC_NULL) {
3363 		return EINVAL;
3364 	}
3365 	task = proc_task(p);
3366 	if (task == TASK_NULL) {
3367 		proc_rele(p);
3368 		return EINVAL;
3369 	}
3370 	/* take a task reference while the proc reference still pins the task */
3371 	task_reference(task);
3372 	proc_rele(p);  /* safe to drop the proc reference now */
3373 	/* task reference to map reference */
3374 	map = get_task_map_reference(task);
3375 	task_deallocate(task);
3376 
3377 	if (map == VM_MAP_NULL) {
3378 		return EINVAL;  /* nothing allocated yet */
3379 	}
3380 
3381 	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
3382 	if (!buf) {
3383 		error = ENOMEM;
3384 		goto out;
3385 	}
3386 
3387 	/* 4-byte header identifying the version of the data's formatting.
3388 	 * It should be bumped whenever the serialized map-entry/pager records change. */
3389 	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
3390 	offset += sizeof(uint32_t);
3391 
3392 	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
3393 		/* write the header, happens only once at the beginning so we should have enough space */
3394 		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3395 		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
3396 		out_hdr->vmi_nentries = nentries;
3397 		offset += sizeof(struct vm_map_info_hdr);
3398 		return KERN_SUCCESS;
3399 	};
3400 
3401 	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
3402 		while (true) { /* try up to 2 times: first write into the current buffer; if it is full, flush and retry into an empty one */
3403 			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
3404 			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
3405 			if (kr == KERN_NO_SPACE) {
3406 				/* failed to write anything, flush the current buffer and try again */
3407 				if (offset == 0) {
3408 					return KERN_FAILURE; /* the entry doesn't fit even in an empty buffer; shouldn't really happen */
3409 				}
3410 				/* write out chunk */
3411 				int out_error = SYSCTL_OUT(req, buf, offset);
3412 				if (out_error) {
3413 					return KERN_FAILURE;
3414 				}
3415 				offset = 0;
3416 				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3417 				continue; /* need to retry the entry dump again with the cleaned buffer */
3418 			} else if (kr != KERN_SUCCESS) {
3419 				return kr;
3420 			}
3421 			offset += left_sz;
3422 			break;
3423 		}
3424 		return KERN_SUCCESS;
3425 	};
3426 
3427 	/* this foreach first invokes write_header with the number of entries, then write_entry for every entry;
3428 	 * whenever the buffer is exhausted, it is flushed to the sysctl and refilled */
3429 	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);
3430 
3431 	if (kr != KERN_SUCCESS) {
3432 		error = EINVAL; /* surface the failure; don't fall through and return success */
3433 		goto out;
3434 	}
3434 
3435 	if (offset > 0) { /* last chunk */
3436 		error = SYSCTL_OUT(req, buf, offset);
3437 	}
3438 
3439 out:
3440 	if (buf != NULL) {
3441 		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3442 	}
3443 	if (map != VM_MAP_NULL) {
3444 		vm_map_deallocate(map);
3445 	}
3446 	return error;
3447 }
3448 
3449 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
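
/*
 * Hedged usage sketch (userspace, DEBUG/DEVELOPMENT kernels only): this is
 * a CTLTYPE_NODE sysctl, so the target pid travels as an extra MIB
 * component (the handler reads it as name[0]) rather than as input data.
 * Resolve the node with sysctlnametomib(), append the pid, then read the
 * VM_MAP_ENTRY_INFO_MAGIC-prefixed stream.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	int
 *	read_slotmap(pid_t pid, void *buf, size_t *lenp)
 *	{
 *		int mib[CTL_MAXNAME];
 *		size_t miblen = CTL_MAXNAME - 1;	// leave room for the pid
 *
 *		if (sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen) == -1) {
 *			return -1;
 *		}
 *		mib[miblen] = (int)pid;	// becomes name[0] in the handler
 *		return sysctl(mib, (u_int)(miblen + 1), buf, lenp, NULL, 0);
 *	}
 */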
3450 
3451 
3452 
3453 #endif /* DEVELOPMENT || DEBUG */
3454