xref: /xnu-11417.121.6/bsd/vm/vm_unix.c (revision a1e26a70f38d1d7daa7b49b258e2f8538ad81650)
/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1987 Carnegie-Mellon University
 * All rights reserved.  The CMU software License Agreement specifies
 * the terms and conditions for use and redistribution.
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <vm/vm_options.h>

#include <kern/ecc.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/debug.h>
#include <kern/extmod_statistics.h>
#include <mach/mach_traps.h>
#include <mach/port.h>
#include <mach/sdt.h>
#include <mach/task.h>
#include <mach/task_access.h>
#include <mach/task_special_ports.h>
#include <mach/time_value.h>
#include <mach/vm_map.h>
#include <mach/vm_param.h>
#include <mach/vm_prot.h>
#include <machine/machine_routines.h>

#include <sys/file_internal.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dir.h>
#include <sys/namei.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/vnode_internal.h>
#include <sys/mount.h>
#include <sys/xattr.h>
#include <sys/trace.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/user.h>
#include <sys/syslog.h>
#include <sys/stat.h>
#include <sys/sysproto.h>
#include <sys/mman.h>
#include <sys/sysctl.h>
#include <sys/cprotect.h>
#include <sys/kpi_socket.h>
#include <sys/kas_info.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/random.h>
#include <sys/code_signing.h>
#if NECP
#include <net/necp.h>
#endif /* NECP */
#if SKYWALK
#include <skywalk/os_channel.h>
#endif /* SKYWALK */

#include <security/audit/audit.h>
#include <security/mac.h>
#include <bsm/audit_kevents.h>

#include <kern/kalloc.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_kern_xnu.h>
#include <vm/vm_pageout_xnu.h>

#include <mach/shared_region.h>
#include <vm/vm_shared_region_internal.h>

#include <vm/vm_dyld_pager_internal.h>
#include <vm/vm_protos_internal.h>
#if DEVELOPMENT || DEBUG
#include <vm/vm_compressor_info.h>         /* for c_segment_info */
#include <vm/vm_compressor_xnu.h>          /* for vm_compressor_serialize_segment_debug_info() */
#endif
#include <vm/vm_reclaim_xnu.h>

#include <sys/kern_memorystatus.h>
#include <sys/kern_memorystatus_freeze.h>
#include <sys/proc_internal.h>

#include <mach-o/fixup-chains.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <kern/bits.h>

#if CONFIG_CSR
#include <sys/csr.h>
#endif /* CONFIG_CSR */
#include <sys/trust_caches.h>
#include <libkern/amfi/amfi.h>
#include <IOKit/IOBSD.h>

#if VM_MAP_DEBUG_APPLE_PROTECT
SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

#if DEVELOPMENT || DEBUG

extern int vm_object_cache_evict_all(void);
static int
sysctl_vm_object_cache_evict SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, req)
	(void) vm_object_cache_evict_all();
	return 0;
}

SYSCTL_PROC(_vm, OID_AUTO, object_cache_evict, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_vm_object_cache_evict, "I", "");

static int
sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	vm_offset_t     kaddr;
	kern_return_t   kr;
	int     error = 0;
	int     size = 0;

	error = sysctl_handle_int(oidp, &size, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);

	if (kr == KERN_SUCCESS) {
		kmem_free(kernel_map, kaddr, size);
	}

	return error;
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_alloc_contig, "I", "");

extern int vm_region_footprint;
SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");

static int
sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	kmem_gobj_stats stats = kmem_get_gobj_stats();

	return SYSCTL_OUT(req, &stats, sizeof(stats));
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");

#endif /* DEVELOPMENT || DEBUG */

static int
sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;

	value = task_self_region_footprint();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}
	task_self_region_footprint_set(value);
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");

static int
sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;

	value = (1 << thread_self_region_page_shift());
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (value != 0 && value != 4096 && value != 16384) {
		return EINVAL;
	}

#if !__ARM_MIXED_PAGE_SIZE__
	if (value != vm_map_page_size(current_map())) {
		return EINVAL;
	}
#endif /* !__ARM_MIXED_PAGE_SIZE__ */

	thread_self_region_page_shift_set(bit_first(value));
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");

static int
sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;
	kern_return_t kr;

	value = task_self_region_info_flags();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	kr = task_self_region_info_flags_set(value);
	if (kr != KERN_SUCCESS) {
		return EINVAL;
	}

	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");


#if DEVELOPMENT || DEBUG
extern int panic_on_unsigned_execute;
SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");

extern int vm_log_xnu_user_debug;
SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
#endif /* DEVELOPMENT || DEBUG */

extern int vm_log_map_delete_permanent_prot_none;
SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");

extern int cs_executable_create_upl;
extern int cs_executable_wire;
SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");

extern int apple_protect_pager_count;
extern int apple_protect_pager_count_mapped;
extern unsigned int apple_protect_pager_cache_limit;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");

#if DEVELOPMENT || DEBUG
extern int radar_20146450;
SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");

extern int macho_printf;
SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");

extern int apple_protect_pager_data_request_debug;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");

extern unsigned int vm_object_copy_delayed_paging_wait_disable;
EXPERIMENT_FACTOR_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");

#if __arm64__
/* These are meant to support the page table accounting unit test. */
extern unsigned int arm_hardware_page_size;
extern unsigned int arm_pt_desc_size;
extern unsigned int arm_pt_root_size;
extern unsigned int inuse_user_tteroot_count;
extern unsigned int inuse_kernel_tteroot_count;
extern unsigned int inuse_user_ttepages_count;
extern unsigned int inuse_kernel_ttepages_count;
extern unsigned int inuse_user_ptepages_count;
extern unsigned int inuse_kernel_ptepages_count;
SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
#if !CONFIG_SPTM
extern unsigned int free_page_size_tt_count;
extern unsigned int free_tt_count;
SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
#endif
#if DEVELOPMENT || DEBUG
extern unsigned long pmap_asid_flushes;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
extern unsigned long pmap_asid_hits;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
extern unsigned long pmap_asid_misses;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
extern unsigned long pmap_speculation_restrictions;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
#endif
#endif /* __arm64__ */
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
#if VM_SCAN_FOR_SHADOW_CHAIN
static int vm_shadow_max_enabled = 0;    /* Disabled by default */
extern int proc_shadow_max(void);
static int
vm_shadow_max SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int value = 0;

	if (vm_shadow_max_enabled) {
		value = proc_shadow_max();
	}

	return SYSCTL_OUT(req, &value, sizeof(value));
}
SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_shadow_max, "I", "");

SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");

#endif /* VM_SCAN_FOR_SHADOW_CHAIN */

SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");

#if PAGE_SLEEP_WITH_INHERITOR
#if DEVELOPMENT || DEBUG
extern uint32_t page_worker_table_size;
SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
#endif /* DEVELOPMENT || DEBUG */
#endif /* PAGE_SLEEP_WITH_INHERITOR */

/*
 * Sysctls related to data/stack execution.  See osfmk/vm/vm_map.c
 */

#if DEVELOPMENT || DEBUG
extern int allow_stack_exec, allow_data_exec;

SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");

#endif /* DEVELOPMENT || DEBUG */

static const char *prot_values[] = {
	"none",
	"read-only",
	"write-only",
	"read-write",
	"execute-only",
	"read-execute",
	"write-execute",
	"read-write-execute"
};

void
log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
{
	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
}

/*
 * shared_region_unnest_logging: level of logging of unnesting events
 * 0	- no logging
 * 1	- throttled logging of unexpected unnesting events (default)
 * 2	- unthrottled logging of unexpected unnesting events
 * 3+	- unthrottled logging of all unnesting events
 */
int shared_region_unnest_logging = 1;

SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_unnest_logging, 0, "");

int vm_shared_region_unnest_log_interval = 10;
int shared_region_unnest_log_count_threshold = 5;
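
/*
 * Illustrative userspace sketch (not part of this file, not compiled):
 * the logging level above is published in the "vm" sysctl namespace, so
 * a diagnostic tool can read or raise it with sysctlbyname(3).  Only the
 * OID name and the 0..3 level semantics come from the code above;
 * everything else is a plain userspace example.
 */
#if 0
#include <stdio.h>
#include <sys/sysctl.h>

int
main(void)
{
	int level = 0;
	size_t len = sizeof(level);

	/* read the current unnest logging level (see levels 0..3 above) */
	if (sysctlbyname("vm.shared_region_unnest_logging", &level, &len, NULL, 0) != 0) {
		perror("sysctlbyname");
		return 1;
	}
	printf("current level: %d\n", level);

	/* switch to unthrottled logging of unexpected unnesting events */
	level = 2;
	if (sysctlbyname("vm.shared_region_unnest_logging", NULL, NULL, &level, sizeof(level)) != 0) {
		perror("sysctlbyname");
		return 1;
	}
	return 0;
}
#endif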


#if XNU_TARGET_OS_OSX

#if defined (__x86_64__)
static int scdir_enforce = 1;
#else /* defined (__x86_64__) */
static int scdir_enforce = 0;   /* AOT caches live elsewhere */
#endif /* defined (__x86_64__) */

static char *scdir_path[] = {
	"/System/Library/dyld/",
	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
	"/System/Cryptexes/OS/System/Library/dyld",
	NULL
};

#else /* XNU_TARGET_OS_OSX */

static int scdir_enforce = 0;
static char *scdir_path[] = {
	"/System/Library/Caches/com.apple.dyld/",
	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	NULL
};

#endif /* XNU_TARGET_OS_OSX */

static char *driverkit_scdir_path[] = {
	"/System/DriverKit/System/Library/dyld/",
#if XNU_TARGET_OS_OSX
	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#else
	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#endif /* XNU_TARGET_OS_OSX */
	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
	NULL
};

#ifndef SECURE_KERNEL
static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
{
#if CONFIG_CSR
	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
		return EPERM;
	}
#endif /* CONFIG_CSR */
	return sysctl_handle_int(oidp, arg1, arg2, req);
}

SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
#endif

/* These log rate throttling state variables aren't thread safe, but
 * are sufficient unto the task.
 */
static int64_t last_unnest_log_time = 0;
static int shared_region_unnest_log_count = 0;

void
log_unnest_badness(
	vm_map_t        m,
	vm_map_offset_t s,
	vm_map_offset_t e,
	boolean_t       is_nested_map,
	vm_map_offset_t lowest_unnestable_addr)
{
	struct timeval  tv;

	if (shared_region_unnest_logging == 0) {
		return;
	}

	if (shared_region_unnest_logging <= 2 &&
	    is_nested_map &&
	    s >= lowest_unnestable_addr) {
		/*
		 * Unnesting of writable map entries is fine.
		 */
		return;
	}

	if (shared_region_unnest_logging <= 1) {
		microtime(&tv);
		if ((tv.tv_sec - last_unnest_log_time) <
		    vm_shared_region_unnest_log_interval) {
			if (shared_region_unnest_log_count++ >
			    shared_region_unnest_log_count_threshold) {
				return;
			}
		} else {
			last_unnest_log_time = tv.tv_sec;
			shared_region_unnest_log_count = 0;
		}
	}

	DTRACE_VM4(log_unnest_badness,
	    vm_map_t, m,
	    vm_map_offset_t, s,
	    vm_map_offset_t, e,
	    vm_map_offset_t, lowest_unnestable_addr);
	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
}

uint64_t
vm_purge_filebacked_pagers(void)
{
	uint64_t pages_purged;

	pages_purged = 0;
	pages_purged += apple_protect_pager_purge_all();
	pages_purged += shared_region_pager_purge_all();
	pages_purged += dyld_pager_purge_all();
#if DEVELOPMENT || DEBUG
	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
#endif /* DEVELOPMENT || DEBUG */
	return pages_purged;
}

int
useracc(
	user_addr_ut    addr_u,
	user_size_ut    len_u,
	int             prot)
{
	vm_map_t        map;
	vm_prot_t       vm_prot = VM_PROT_WRITE;

	map = current_map();

	if (prot == B_READ) {
		vm_prot = VM_PROT_READ;
	}

	return vm_map_check_protection(map, addr_u,
	           vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
	           VM_SANITIZE_CALLER_USERACC);
}

#if XNU_PLATFORM_MacOSX
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vslock_sanitize(
	vm_map_t                map,
	user_addr_ut            addr_u,
	user_size_ut            len_u,
	vm_sanitize_caller_t    vm_sanitize_caller,
	vm_map_offset_t        *start,
	vm_map_offset_t        *end,
	vm_map_size_t          *size)
{
	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
	           map,
	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
	           size);
}
#endif /* XNU_PLATFORM_MacOSX */

int
vslock(user_addr_ut addr, user_size_ut len)
{
	kern_return_t kret;

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows, for binary
	 * compatibility: i.e. return success for overflows without doing
	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
	 * for overflow errors, which gets converted to KERN_SUCCESS by
	 * vm_sanitize_get_kr.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(current_map(),
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_wire_kernel(current_map(), addr,
	    vm_sanitize_compute_ut_end(addr, len),
	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
	    VM_KERN_MEMORY_BSD,
	    FALSE);

	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}

int
vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
{
#if FIXME  /* [ */
	pmap_t          pmap;
	vm_page_t       pg;
	vm_map_offset_t vaddr;
	ppnum_t         paddr;
#endif  /* FIXME ] */
	kern_return_t   kret;
	vm_map_t        map;

	map = current_map();

#if FIXME  /* [ */
	if (dirtied) {
		pmap = get_task_pmap(current_task());
		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
		    vaddr += PAGE_SIZE) {
			paddr = pmap_find_phys(pmap, vaddr);
			pg = PHYS_TO_VM_PAGE(paddr);
			vm_page_set_modified(pg);
		}
	}
#endif  /* FIXME ] */
#ifdef  lint
	dirtied++;
#endif  /* lint */

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows, for binary
	 * compatibility: i.e. return success for overflows without doing
	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
	 * for overflow errors, which gets converted to KERN_SUCCESS by
	 * vm_sanitize_get_kr.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(map,
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSUNLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_unwire(map, addr,
	    vm_sanitize_compute_ut_end(addr, len), false);
	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}

int
subyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
suibyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
fubyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &byte, sizeof(char))) {
		return -1;
	}
	return byte;
}

int
fuibyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &(byte), sizeof(char))) {
		return -1;
	}
	return byte;
}

int
suword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/* suiword and fuiword are the same as suword and fuword, respectively */

int
suiword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuiword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/*
 * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
 * fetching and setting of process-sized size_t and pointer values.
 */
int
sulong(user_addr_t addr, int64_t word)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (long)word);
	}
}

int64_t
fulong(user_addr_t addr)
{
	int64_t longword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
			return -1;
		}
		return longword;
	} else {
		return (int64_t)fuiword(addr);
	}
}

int
suulong(user_addr_t addr, uint64_t uword)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (uint32_t)uword);
	}
}

uint64_t
fuulong(user_addr_t addr)
{
	uint64_t ulongword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
			return -1ULL;
		}
		return ulongword;
	} else {
		return (uint64_t)fuiword(addr);
	}
}
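
/*
 * Minimal usage sketch (illustrative only; this helper is hypothetical):
 * the sulong()/fulong()/suulong()/fuulong() family above sizes its copy
 * from the current process's bitness, so a caller storing into a user
 * pointer-sized slot needs no 32/64-bit switch of its own.  Note that
 * -1 / -1ULL doubles as the error return, as in the functions above.
 */
#if 0
static int
store_user_pointer_slot(user_addr_t slot, uint64_t value)
{
	/* copies 8 bytes for a 64-bit process, 4 (truncated) otherwise */
	if (suulong(slot, value) != 0) {
		return EFAULT;  /* copyout to user space failed */
	}
	/* read it back through the same width-aware path */
	return (fuulong(slot) == value) ? 0 : EFAULT;
}
#endif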

int
swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
{
	return ENOTSUP;
}

#if defined(SECURE_KERNEL)
static int kern_secure_kernel = 1;
#else
static int kern_secure_kernel = 0;
#endif

SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_trace_level, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
    &shared_region_version, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_persistence, 0, "");

/*
 * shared_region_check_np:
 *
 * This system call is intended for dyld.
 *
 * dyld calls this when any process starts to see if the process's shared
 * region is already set up and ready to use.
 * This call returns the base address of the process's shared region's
 * first mapping.
 * dyld will then check what's mapped at that address.
 *
 * If the shared region is empty, dyld will then attempt to map the shared
 * cache file in the shared region via the shared_region_map_np() system call.
 *
 * If something's already mapped in the shared region, dyld will check if it
 * matches the shared cache it would like to use for that process.
 * If it matches, everything's ready and the process can proceed and use the
 * shared region.
 * If it doesn't match, dyld will unmap the shared region and map the shared
 * cache into the process's address space via mmap().
 *
 * A NULL pointer argument can be used by dyld to indicate it has unmapped
 * the shared region. We will remove the shared_region reference from the task.
 *
 * ERROR VALUES
 * EINVAL	no shared region
 * ENOMEM	shared region is empty
 * EFAULT	bad address for "start_address"
 */
int
shared_region_check_np(
	__unused struct proc                    *p,
	struct shared_region_check_np_args      *uap,
	__unused int                            *retvalp)
{
	vm_shared_region_t      shared_region;
	mach_vm_offset_t        start_address = 0;
	int                     error = 0;
	kern_return_t           kr;
	task_t                  task = current_task();

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address));

	/*
	 * Special value of start_address used to indicate that map_with_linking() should
	 * no longer be allowed in this process
	 */
	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
		p->p_disallow_map_with_linking = TRUE;
		return 0;
	}

	/* retrieve the current task's shared region */
	shared_region = vm_shared_region_get(task);
	if (shared_region != NULL) {
		/*
		 * A NULL argument is used by dyld to indicate the task
		 * has unmapped its shared region.
		 */
		if (uap->start_address == 0) {
			/* unmap it first */
			vm_shared_region_remove(task, shared_region);
			vm_shared_region_set(task, NULL);
		} else {
			/* retrieve address of its first mapping... */
			kr = vm_shared_region_start_address(shared_region, &start_address, task);
			if (kr != KERN_SUCCESS) {
				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
				    "check_np(0x%llx) "
				    "vm_shared_region_start_address() failed\n",
				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
				    proc_getpid(p), p->p_comm,
				    (uint64_t)uap->start_address));
				error = ENOMEM;
			} else {
#if __has_feature(ptrauth_calls)
				/*
				 * Remap any section of the shared library that
				 * has authenticated pointers into private memory.
				 */
				if (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS) {
					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
					    "check_np(0x%llx) "
					    "vm_shared_region_auth_remap() failed\n",
					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
					    proc_getpid(p), p->p_comm,
					    (uint64_t)uap->start_address));
					error = ENOMEM;
				}
#endif /* __has_feature(ptrauth_calls) */

				/* ... and give it to the caller */
				if (error == 0) {
					error = copyout(&start_address,
					    (user_addr_t) uap->start_address,
					    sizeof(start_address));
					if (error != 0) {
						SHARED_REGION_TRACE_ERROR(
							("shared_region: %p [%d(%s)] "
							"check_np(0x%llx) "
							"copyout(0x%llx) error %d\n",
							(void *)VM_KERNEL_ADDRPERM(current_thread()),
							proc_getpid(p), p->p_comm,
							(uint64_t)uap->start_address, (uint64_t)start_address,
							error));
					}
				}
			}
		}
		vm_shared_region_deallocate(shared_region);
	} else {
		/* no shared region ! */
		error = EINVAL;
	}

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address, (uint64_t)start_address, error));

	return error;
}
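
/*
 * Sketch of the dyld-side protocol described in the block comment above
 * (illustrative only; this is a private dyld/kernel interface, and the
 * "__shared_region_check_np" stub name is an assumption based on
 * Libsyscall conventions, not something third-party code should call).
 */
#if 0
extern int __shared_region_check_np(uint64_t *start_address);

static void
dyld_like_startup(void)
{
	uint64_t sr_base = 0;

	if (__shared_region_check_np(&sr_base) == 0) {
		/*
		 * Something is mapped: compare the cache header at sr_base
		 * with the shared cache this process wants; if it matches,
		 * the shared region is ready to use as is.
		 */
	} else {
		/*
		 * EINVAL: no shared region; ENOMEM: region set up but empty
		 * (see the error table above).  Either way, fall back to
		 * shared_region_map_np() or a private mmap() of the cache.
		 */
	}
}
#endif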


static int
shared_region_copyin(
	struct proc  *p,
	user_addr_t  user_addr,
	unsigned int count,
	unsigned int element_size,
	void         *kernel_data)
{
	int             error = 0;
	vm_size_t       size = count * element_size;

	error = copyin(user_addr, kernel_data, size);
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"copyin(0x%llx, %ld) failed (error=%d)\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm,
			(uint64_t)user_addr, (long)size, error));
	}
	return error;
}

/*
 * A reasonable upper limit to prevent overflow of allocation/copyin.
 */
#define _SR_FILE_MAPPINGS_MAX_FILES 256
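
/*
 * Why the cap above is enough: shared_region_copyin() computes
 * "count * element_size" in plain vm_size_t arithmetic, so callers must
 * bound "count" before that multiply can wrap.  For comparison, a sketch
 * of the equivalent explicit check with XNU's <os/overflow.h> (the cap,
 * not this hypothetical helper, is what the code here actually relies on):
 */
#if 0
#include <os/overflow.h>

static int
checked_copyin_size(unsigned int count, unsigned int element_size,
    vm_size_t *out_size)
{
	vm_size_t size;

	if (os_mul_overflow((vm_size_t)count, (vm_size_t)element_size, &size)) {
		return ERANGE;  /* count * element_size would overflow */
	}
	*out_size = size;
	return 0;
}
#endif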

/* forward declaration */
__attribute__((noinline))
static void shared_region_map_and_slide_cleanup(
	struct proc              *p,
	uint32_t                 files_count,
	struct _sr_file_mappings *sr_file_mappings,
	struct vm_shared_region  *shared_region);

/*
 * Setup part of _shared_region_map_and_slide().
 * It had to be broken out of _shared_region_map_and_slide() to
 * prevent compiler inlining from blowing out the stack.
 */
__attribute__((noinline))
static int
shared_region_map_and_slide_setup(
	struct proc                         *p,
	uint32_t                            files_count,
	struct shared_file_np               *files,
	uint32_t                            mappings_count,
	struct shared_file_mapping_slide_np *mappings,
	struct _sr_file_mappings            **sr_file_mappings,
	struct vm_shared_region             **shared_region_ptr,
	struct vnode                        *rdir_vp)
{
	int                             error = 0;
	struct _sr_file_mappings        *srfmp;
	uint32_t                        mappings_next;
	struct vnode_attr               va;
	off_t                           fs;
#if CONFIG_MACF
	vm_prot_t                       maxprot = VM_PROT_ALL;
#endif
	uint32_t                        i;
	struct vm_shared_region         *shared_region = NULL;
	boolean_t                       is_driverkit = task_is_driver(current_task());

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> map\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm));

	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
		error = E2BIG;
		goto done;
	}
	if (files_count == 0) {
		error = EINVAL;
		goto done;
	}
	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
	    Z_WAITOK | Z_ZERO);
	if (*sr_file_mappings == NULL) {
		error = ENOMEM;
		goto done;
	}
	mappings_next = 0;
	for (i = 0; i < files_count; i++) {
		srfmp = &(*sr_file_mappings)[i];
		srfmp->fd = files[i].sf_fd;
		srfmp->mappings_count = files[i].sf_mappings_count;
		srfmp->mappings = &mappings[mappings_next];
		mappings_next += srfmp->mappings_count;
		if (mappings_next > mappings_count) {
			error = EINVAL;
			goto done;
		}
		srfmp->slide = files[i].sf_slide;
	}

	/* get the process's shared region (setup in vm_map_exec()) */
	shared_region = vm_shared_region_trim_and_get(current_task());
	*shared_region_ptr = shared_region;
	if (shared_region == NULL) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"no shared region\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		error = EINVAL;
		goto done;
	}

	/*
	 * Check that the shared region matches the current root
	 * directory of this process.  Deny the mapping otherwise, to
	 * avoid tainting the shared region with something that
	 * doesn't quite belong in it.
	 */
	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
	if (sr_vnode != NULL ?  rdir_vp != sr_vnode : rdir_vp != rootvnode) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: map(%p) root_dir mismatch\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread())));
		error = EPERM;
		goto done;
	}


	for (srfmp = &(*sr_file_mappings)[0];
	    srfmp < &(*sr_file_mappings)[files_count];
	    srfmp++) {
		if (srfmp->mappings_count == 0) {
			/* no mappings here... */
			continue;
		}

		/*
		 * A file descriptor of -1 is used to indicate that the data
		 * to be put in the shared region for this mapping comes directly
		 * from the process's address space. Ensure we have proper alignments.
		 */
		if (srfmp->fd == -1) {
			/* only allow one mapping per fd */
			if (srfmp->mappings_count > 1) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data >1 mapping\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm));
				error = EINVAL;
				goto done;
			}

			/*
			 * The destination address and size must be page aligned.
			 */
			struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
			mach_vm_address_t dest_addr = mapping->sms_address;
			mach_vm_size_t    map_size = mapping->sms_size;
			if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm, dest_addr));
				error = EINVAL;
				goto done;
			}
			if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm, map_size));
				error = EINVAL;
				goto done;
			}
			continue;
		}

		/* get file structure from file descriptor */
		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d lookup failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd, error));
			goto done;
		}

		/* we need at least read permission on the file */
		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d not readable\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd));
			error = EPERM;
			goto done;
		}

		/* get vnode from file structure */
		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d getwithref failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd, error));
			goto done;
		}
		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);

		/* make sure the vnode is a regular file */
		if (srfmp->vp->v_type != VREG) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"not a file (type=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name, srfmp->vp->v_type));
			error = EINVAL;
			goto done;
		}

#if CONFIG_MACF
		/* pass in 0 for the offset argument because AMFI does not need the offset
		 *       of the shared cache */
		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
		if (error) {
			goto done;
		}
#endif /* MAC */

#if XNU_TARGET_OS_OSX && defined(__arm64__)
		/*
		 * Check if the shared cache is in the trust cache;
		 * if so, we can skip the root ownership check.
		 */
#if DEVELOPMENT || DEBUG
		/*
		 * Skip both root ownership and trust cache check if
		 * enforcement is disabled.
		 */
		if (!cs_system_enforcement()) {
			goto after_root_check;
		}
#endif /* DEVELOPMENT || DEBUG */
		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
		if (blob == NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"missing CS blob\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}
		const uint8_t *cdhash = csblob_get_cdhash(blob);
		if (cdhash == NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"missing cdhash\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}

		bool in_trust_cache = false;
		TrustCacheQueryToken_t qt;
		if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
			TCType_t tc_type = kTCTypeInvalid;
			TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
			in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
			    (tc_type == kTCTypeCryptex1BootOS ||
			    tc_type == kTCTypeStatic ||
			    tc_type == kTCTypeEngineering));
		}
		if (!in_trust_cache) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"not in trust cache\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}
		goto after_root_check;
root_check:
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */

		/* The shared cache file must be owned by root */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"vnode_getattr(%p) failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				error));
			goto done;
		}
		if (va.va_uid != 0) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"owned by uid=%d instead of 0\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name, va.va_uid));
			error = EPERM;
			goto done;
		}

#if XNU_TARGET_OS_OSX && defined(__arm64__)
after_root_check:
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */

#if CONFIG_CSR
		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_flags);
			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
			if (error) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"vnode_getattr(%p) failed (error=%d)\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					error));
				goto done;
			}

			if (!(va.va_flags & SF_RESTRICTED)) {
				/*
				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
				 * the shared cache file is NOT SIP-protected, so reject the
				 * mapping request
				 */
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'), "
					"vnode is not SIP-protected. \n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name));
				error = EPERM;
				goto done;
			}
		}
#else /* CONFIG_CSR */

		/*
		 * Devices without SIP/ROSP need to make sure that the shared cache
		 * is either on the root volume or in the preboot cryptex volume.
		 */
		assert(rdir_vp != NULL);
		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
			vnode_t preboot_vp = NULL;
#if XNU_TARGET_OS_OSX
#define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
#else
#define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
#endif
			error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
			if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"not on process' root volume nor preboot volume\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name));
				error = EPERM;
				if (preboot_vp) {
					(void)vnode_put(preboot_vp);
				}
				goto done;
			} else if (preboot_vp) {
				(void)vnode_put(preboot_vp);
			}
		}
#endif /* CONFIG_CSR */

		if (scdir_enforce) {
			char **expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
			struct vnode *scdir_vp = NULL;
			for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
			    *expected_scdir_path != NULL;
			    expected_scdir_path++) {
				/* get vnode for expected_scdir_path */
				error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
				if (error) {
					SHARED_REGION_TRACE_ERROR(
						("shared_region: %p [%d(%s)]: "
						"vnode_lookup(%s) failed (error=%d)\n",
						(void *)VM_KERNEL_ADDRPERM(current_thread()),
						proc_getpid(p), p->p_comm,
						*expected_scdir_path, error));
					continue;
				}

				/* check if parent is scdir_vp */
				assert(scdir_vp != NULL);
				if (vnode_parent(srfmp->vp) == scdir_vp) {
					(void)vnode_put(scdir_vp);
					scdir_vp = NULL;
					goto scdir_ok;
				}
				(void)vnode_put(scdir_vp);
				scdir_vp = NULL;
			}
			/* nothing matches */
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"shared cache file not in expected directory\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			error = EPERM;
			goto done;
		}
scdir_ok:

		/* get vnode size */
		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"vnode_size(%p) failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
			goto done;
		}
		srfmp->file_size = fs;

		/* get the file's memory object handle */
		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"no memory object\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			error = EINVAL;
			goto done;
		}

		/* check that the mappings are properly covered by code signatures */
		if (!cs_system_enforcement()) {
			/* code signing is not enforced: no need to check */
		} else {
			for (i = 0; i < srfmp->mappings_count; i++) {
				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
					/* zero-filled mapping: not backed by the file */
					continue;
				}
				if (ubc_cs_is_range_codesigned(srfmp->vp,
				    srfmp->mappings[i].sms_file_offset,
				    srfmp->mappings[i].sms_size)) {
					/* this mapping is fully covered by code signatures */
					continue;
				}
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
					"is not code-signed\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name,
					i, srfmp->mappings_count,
					srfmp->mappings[i].sms_address,
					srfmp->mappings[i].sms_size,
					srfmp->mappings[i].sms_file_offset,
					srfmp->mappings[i].sms_max_prot,
					srfmp->mappings[i].sms_init_prot));
				error = EINVAL;
				goto done;
			}
		}
	}
done:
	if (error != 0) {
		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
		*sr_file_mappings = NULL;
		*shared_region_ptr = NULL;
	}
	return error;
}

/*
 * shared_region_map_np()
 *
 * This system call is intended for dyld.
 *
 * dyld uses this to map a shared cache file into a shared region.
 * This is usually done only the first time a shared cache is needed.
 * Subsequent processes will just use the populated shared region without
 * requiring any further setup.
 */
static int
_shared_region_map_and_slide(
	struct proc                         *p,
	uint32_t                            files_count,
	struct shared_file_np               *files,
	uint32_t                            mappings_count,
	struct shared_file_mapping_slide_np *mappings)
{
	int                             error = 0;
	kern_return_t                   kr = KERN_SUCCESS;
	struct _sr_file_mappings        *sr_file_mappings = NULL;
	struct vnode                    *rdir_vp = NULL;
	struct vm_shared_region         *shared_region = NULL;

	/*
	 * Get a reference to the current proc's root dir.
	 * Need this to prevent racing with chroot.
	 */
	proc_fdlock(p);
	rdir_vp = p->p_fd.fd_rdir;
	if (rdir_vp == NULL) {
		rdir_vp = rootvnode;
	}
	assert(rdir_vp != NULL);
	vnode_get(rdir_vp);
	proc_fdunlock(p);

	/*
	 * Turn files, mappings into sr_file_mappings and other setup.
	 */
	error = shared_region_map_and_slide_setup(p, files_count,
	    files, mappings_count, mappings,
	    &sr_file_mappings, &shared_region, rdir_vp);
	if (error != 0) {
		vnode_put(rdir_vp);
		return error;
	}

	/* map the file(s) into that shared region's submap */
	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
	if (kr != KERN_SUCCESS) {
		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
		    "vm_shared_region_map_file() failed kr=0x%x\n",
		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
		    proc_getpid(p), p->p_comm, kr));
	}

	/* convert kern_return_t to errno */
	switch (kr) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_INVALID_ADDRESS:
		error = EFAULT;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EPERM;
		break;
	case KERN_NO_SPACE:
		error = ENOMEM;
		break;
	case KERN_FAILURE:
	case KERN_INVALID_ARGUMENT:
	default:
		error = EINVAL;
		break;
	}

	/*
	 * Mark that this process is now using split libraries.
	 */
	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
	}

	vnode_put(rdir_vp);
	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] <- map\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm));

	return error;
}
1640 
1641 /*
1642  * Clean up part of _shared_region_map_and_slide()
1643  * It had to be broken out of _shared_region_map_and_slide() to
1644  * prevent compiler inlining from blowing out the stack.
1645  */
1646 __attribute__((noinline))
1647 static void
1648 shared_region_map_and_slide_cleanup(
1649 	struct proc              *p,
1650 	uint32_t                 files_count,
1651 	struct _sr_file_mappings *sr_file_mappings,
1652 	struct vm_shared_region  *shared_region)
1653 {
1654 	struct _sr_file_mappings *srfmp;
1655 	struct vnode_attr        va;
1656 
1657 	if (sr_file_mappings != NULL) {
1658 		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1659 			if (srfmp->vp != NULL) {
1660 				vnode_lock_spin(srfmp->vp);
1661 				srfmp->vp->v_flag |= VSHARED_DYLD;
1662 				vnode_unlock(srfmp->vp);
1663 
1664 				/* update the vnode's access time */
1665 				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1666 					VATTR_INIT(&va);
1667 					nanotime(&va.va_access_time);
1668 					VATTR_SET_ACTIVE(&va, va_access_time);
1669 					vnode_setattr(srfmp->vp, &va, vfs_context_current());
1670 				}
1671 
1672 #if NAMEDSTREAMS
1673 				/*
1674 				 * If the shared cache is compressed, it may
1675 				 * have a namedstream vnode instantiated
1676 				 * for it. That namedstream vnode will also
1677 				 * have to be marked with VSHARED_DYLD.
1678 				 */
1679 				if (vnode_hasnamedstreams(srfmp->vp)) {
1680 					vnode_t svp;
1681 					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1682 					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
1683 						vnode_lock_spin(svp);
1684 						svp->v_flag |= VSHARED_DYLD;
1685 						vnode_unlock(svp);
1686 						vnode_put(svp);
1687 					}
1688 				}
1689 #endif /* NAMEDSTREAMS */
1690 				/*
1691 				 * release the vnode...
1692 				 * ubc_map() still holds it for us in the non-error case
1693 				 */
1694 				(void) vnode_put(srfmp->vp);
1695 				srfmp->vp = NULL;
1696 			}
1697 			if (srfmp->fp != NULL) {
1698 				/* release the file descriptor */
1699 				fp_drop(p, srfmp->fd, srfmp->fp, 0);
1700 				srfmp->fp = NULL;
1701 			}
1702 		}
1703 		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1704 	}
1705 
1706 	if (shared_region != NULL) {
1707 		vm_shared_region_deallocate(shared_region);
1708 	}
1709 }
1710 
1711 /*
1712  * For each file mapped, we may have mappings for:
1713  *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1714  * so let's round up to 8 mappings per file.
1715  */
1716 #define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to pass in */
1717 
1718 /*
1719  * This is the new interface for setting up shared region mappings.
1720  *
1721  * The slide used for shared regions setup using this interface is done differently
1722  * from the old interface. The slide value passed in the shared_files_np represents
1723  * a max value. The kernel will choose a random value based on that, then use it
1724  * for all shared regions.
1725  */
1726 #if defined (__x86_64__)
1727 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1728 #else
1729 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1730 #endif
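/*
 * Worked example of the slide selection done below: on a 16KB-page device,
 * SLIDE_AMOUNT_MASK == ~0x3FFF. If dyld passes sf_slide == 0x1000000 and
 * the kernel draws random_val == 0x12345678, then
 *	random_val % 0x1000000  == 0x345678
 *	0x345678 & ~0x3FFF      == 0x344000
 * i.e. a page-aligned slide strictly less than the requested maximum, and
 * that single value is then applied to every mapping of every file.
 */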
1731 
1732 static inline __result_use_check kern_return_t
1733 shared_region_map_and_slide_2_np_sanitize(
1734 	struct proc                         *p,
1735 	user_addr_t                         mappings_userspace_addr,
1736 	unsigned int                        count,
1737 	shared_file_mapping_slide_np_t      *mappings)
1738 {
1739 	kern_return_t kr;
1740 	vm_map_t map = current_map();
1741 	mach_vm_address_t addr, end;
1742 	mach_vm_offset_t offset, offset_end;
1743 	mach_vm_size_t size, offset_size;
1744 	user_addr_t slide_start, slide_end, slide_size;
1745 	vm_prot_t cur;
1746 	vm_prot_t max;
1747 
1748 	user_addr_t user_addr = mappings_userspace_addr;
1749 
1750 	for (size_t i = 0; i < count; i++) {
1751 		shared_file_mapping_slide_np_ut mapping_u;
1752 		/*
1753 		 * First we bring each mapping struct into our kernel stack to
1754 		 * avoid TOCTOU.
1755 		 */
1756 		kr = shared_region_copyin(
1757 			p,
1758 			user_addr,
1759 			1, // copy 1 element at a time
1760 			sizeof(shared_file_mapping_slide_np_ut),
1761 			&mapping_u);
1762 		if (__improbable(kr != KERN_SUCCESS)) {
1763 			return kr;
1764 		}
1765 
1766 		/*
1767 		 * Then, we sanitize the data on the kernel stack.
1768 		 */
1769 		kr = vm_sanitize_addr_size(
1770 			mapping_u.sms_address_u,
1771 			mapping_u.sms_size_u,
1772 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1773 			map,
1774 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1775 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
1776 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
1777 			&addr,
1778 			&end,
1779 			&size);
1780 		if (__improbable(kr != KERN_SUCCESS)) {
1781 			return kr;
1782 		}
1783 
1784 		kr = vm_sanitize_addr_size(
1785 			mapping_u.sms_file_offset_u,
1786 			mapping_u.sms_size_u,
1787 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1788 			PAGE_MASK,
1789 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1790 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1791 			&offset,
1792 			&offset_end,
1793 			&offset_size);
1794 		if (__improbable(kr != KERN_SUCCESS)) {
1795 			return kr;
1796 		}
1797 		if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
1798 			return KERN_INVALID_ARGUMENT;
1799 		}
1800 
1801 		/*
1802 		 * The unsafe value is unwrapped and immediately re-wrapped
1803 		 * to convert it from an address to a size.
1804 		 */
1805 		mach_vm_size_ut sms_slide_size_u =
1806 		    vm_sanitize_wrap_size(
1807 			VM_SANITIZE_UNSAFE_UNWRAP(
1808 				mapping_u.sms_slide_size_u));
1809 
1810 		kr = vm_sanitize_addr_size(
1811 			mapping_u.sms_slide_start_u,
1812 			sms_slide_size_u,
1813 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1814 			map,
1815 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1816 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1817 			&slide_start,
1818 			&slide_end,
1819 			&slide_size);
1820 		if (__improbable(kr != KERN_SUCCESS)) {
1821 			return kr;
1822 		}
1823 
1824 		kr = vm_sanitize_cur_and_max_prots(
1825 			mapping_u.sms_init_prot_u,
1826 			mapping_u.sms_max_prot_u,
1827 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1828 			map,
1829 			VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
1830 			&cur,
1831 			&max);
1832 		if (__improbable(kr != KERN_SUCCESS)) {
1833 			return kr;
1834 		}
1835 
1836 		/*
1837 		 * Finally, we move the data from the kernel stack to our
1838 		 * caller-allocated kernel heap buffer.
1839 		 */
1840 		mappings[i].sms_address = addr;
1841 		mappings[i].sms_size = size;
1842 		mappings[i].sms_file_offset = offset;
1843 		mappings[i].sms_slide_size = slide_size;
1844 		mappings[i].sms_slide_start = slide_start;
1845 		mappings[i].sms_max_prot = max;
1846 		mappings[i].sms_init_prot = cur;
1847 
1848 		if (__improbable(os_add_overflow(
1849 			    user_addr,
1850 			    sizeof(shared_file_mapping_slide_np_ut),
1851 			    &user_addr))) {
1852 			return KERN_INVALID_ARGUMENT;
1853 		}
1854 	}
1855 
1856 	return KERN_SUCCESS;
1857 }
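/*
 * The stack copy made by the sanitizer above closes a classic double-fetch
 * (TOCTOU) window. A minimal sketch of the unsafe pattern it avoids:
 *
 *	// BROKEN: userspace can change *u_mapping between the two reads
 *	if (u_mapping->sms_size <= limit)      // fetch #1: value checked
 *		use(u_mapping->sms_size);      // fetch #2: value used may differ
 *
 * By copying each struct into kernel memory exactly once, then validating
 * and using only that copy, the checked value and the used value are the
 * same value by construction.
 */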
1858 
1859 int
1860 shared_region_map_and_slide_2_np(
1861 	struct proc                                  *p,
1862 	struct shared_region_map_and_slide_2_np_args *uap,
1863 	__unused int                                 *retvalp)
1864 {
1865 	unsigned int                  files_count;
1866 	struct shared_file_np         *shared_files = NULL;
1867 	unsigned int                  mappings_count;
1868 	struct shared_file_mapping_slide_np *mappings = NULL;
1869 	kern_return_t                 kr = KERN_SUCCESS;
1870 
1871 	files_count = uap->files_count;
1872 	mappings_count = uap->mappings_count;
1873 
1874 	if (files_count == 0) {
1875 		SHARED_REGION_TRACE_INFO(
1876 			("shared_region: %p [%d(%s)] map(): "
1877 			"no files\n",
1878 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1879 			proc_getpid(p), p->p_comm));
1880 		kr = 0; /* no files to map: we're done! */
1881 		goto done;
1882 	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1883 		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1884 		if (shared_files == NULL) {
1885 			kr = KERN_RESOURCE_SHORTAGE;
1886 			goto done;
1887 		}
1888 	} else {
1889 		SHARED_REGION_TRACE_ERROR(
1890 			("shared_region: %p [%d(%s)] map(): "
1891 			"too many files (%d) max %d\n",
1892 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1893 			proc_getpid(p), p->p_comm,
1894 			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1895 		kr = KERN_FAILURE;
1896 		goto done;
1897 	}
1898 
1899 	if (mappings_count == 0) {
1900 		SHARED_REGION_TRACE_INFO(
1901 			("shared_region: %p [%d(%s)] map(): "
1902 			"no mappings\n",
1903 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1904 			proc_getpid(p), p->p_comm));
1905 		kr = 0; /* no mappings: we're done! */
1906 		goto done;
1907 	} else if (mappings_count <= SFM_MAX) {
1908 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
1909 		if (mappings == NULL) {
1910 			kr = KERN_RESOURCE_SHORTAGE;
1911 			goto done;
1912 		}
1913 	} else {
1914 		SHARED_REGION_TRACE_ERROR(
1915 			("shared_region: %p [%d(%s)] map(): "
1916 			"too many mappings (%d) max %d\n",
1917 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1918 			proc_getpid(p), p->p_comm,
1919 			mappings_count, SFM_MAX));
1920 		kr = KERN_FAILURE;
1921 		goto done;
1922 	}
1923 
1924 	/*
1925 	 * struct shared_file_np does not have fields that are subject to
1926 	 * sanitization, it is thus copied from userspace as is.
1927 	 */
1928 	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
1929 	if (kr != KERN_SUCCESS) {
1930 		goto done;
1931 	}
1932 
1933 	kr = shared_region_map_and_slide_2_np_sanitize(
1934 		p,
1935 		uap->mappings_u,
1936 		mappings_count,
1937 		mappings);
1938 	if (__improbable(kr != KERN_SUCCESS)) {
1939 		kr = vm_sanitize_get_kr(kr);
1940 		goto done;
1941 	}
1942 
1943 	uint32_t max_slide = shared_files[0].sf_slide;
1944 	uint32_t random_val;
1945 	uint32_t slide_amount;
1946 
1947 	if (max_slide != 0) {
1948 		read_random(&random_val, sizeof random_val);
1949 		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
1950 	} else {
1951 		slide_amount = 0;
1952 	}
1953 #if DEVELOPMENT || DEBUG
1954 	extern bool bootarg_disable_aslr;
1955 	if (bootarg_disable_aslr) {
1956 		slide_amount = 0;
1957 	}
1958 #endif /* DEVELOPMENT || DEBUG */
1959 
1960 	/*
1961 	 * Fix up the mappings to reflect the desired slide.
1962 	 */
1963 	unsigned int f;
1964 	unsigned int m = 0;
1965 	unsigned int i;
1966 	for (f = 0; f < files_count; ++f) {
1967 		shared_files[f].sf_slide = slide_amount;
1968 		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
1969 			if (m >= mappings_count) {
1970 				SHARED_REGION_TRACE_ERROR(
1971 					("shared_region: %p [%d(%s)] map(): "
1972 					"mapping count argument was too small\n",
1973 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1974 					proc_getpid(p), p->p_comm));
1975 				kr = KERN_FAILURE;
1976 				goto done;
1977 			}
1978 			if (__improbable(
1979 				    os_add_overflow(
1980 					    mappings[m].sms_address,
1981 					    slide_amount,
1982 					    &mappings[m].sms_address))) {
1983 				kr = KERN_INVALID_ARGUMENT;
1984 				goto done;
1985 			}
1986 			if (mappings[m].sms_slide_size != 0) {
1987 				mach_vm_address_t discard;
1988 				/* Slide and check that new start/size pairs do not overflow. */
1989 				if (__improbable(
1990 					    os_add_overflow(
1991 						    mappings[m].sms_slide_start,
1992 						    slide_amount,
1993 						    &mappings[m].sms_slide_start) ||
1994 					    os_add_overflow(
1995 						    mappings[m].sms_slide_start,
1996 						    mappings[m].sms_slide_size,
1997 						    &discard))) {
1998 					kr = KERN_INVALID_ARGUMENT;
1999 					goto done;
2000 				}
2001 			}
2002 		}
2003 	}
2004 
2005 	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
2006 done:
2007 	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
2008 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
2009 	return kr;
2010 }
2011 
2012 /*
2013  * A syscall for dyld to use to map data pages that need load time relocation fixups.
2014  * The fixups are performed by a custom pager during page-in, so the pages still appear
2015  * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2016  * on demand later, all w/o using the compressor.
2017  *
2018  * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages while
2019  * running, they are COW'd as normal.
2020  */
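/*
 * Illustrative caller-side sketch (not kernel code; the userspace wrapper
 * name is an assumption). Only the region fields validated below are shown:
 *
 *	struct mwl_region regions[1] = {{
 *		.mwlr_fd          = image_fd,
 *		.mwlr_protections = VM_PROT_READ | VM_PROT_WRITE,
 *		.mwlr_file_offset = data_seg_file_offset,
 *		.mwlr_size        = data_seg_size,
 *		// target-address field omitted here; consumed by vm_map_with_linking()
 *	}};
 *	int ret = __map_with_linking_np(regions, 1, link_info, link_info_size);
 *
 * link_info is a blob that starts with a struct mwl_info_hdr, followed by
 * the bind targets and dyld chained-fixup starts that the custom pager
 * applies during page-in.
 */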
2021 int
2022 map_with_linking_np(
2023 	struct proc                     *p,
2024 	struct map_with_linking_np_args *uap,
2025 	__unused int                    *retvalp)
2026 {
2027 	uint32_t                        region_count;
2028 	uint32_t                        r;
2029 	struct mwl_region               *regions = NULL;
2030 	struct mwl_region               *rp;
2031 	uint32_t                        link_info_size;
2032 	void                            *link_info = NULL;      /* starts with a struct mwl_info_hdr */
2033 	struct mwl_info_hdr             *info_hdr = NULL;
2034 	uint64_t                        binds_size;
2035 	int                             fd;
2036 	struct fileproc                 *fp = NULL;
2037 	struct vnode                    *vp = NULL;
2038 	size_t                          file_size;
2039 	off_t                           fs;
2040 	struct vnode_attr               va;
2041 	memory_object_control_t         file_control = NULL;
2042 	int                             error;
2043 	kern_return_t                   kr = KERN_SUCCESS;
2044 
2045 	/*
2046 	 * Check if dyld has told us it finished with this call.
2047 	 */
2048 	if (p->p_disallow_map_with_linking) {
2049 		printf("%s: [%d(%s)]: map_with_linking() was disabled\n",
2050 		    __func__, proc_getpid(p), p->p_comm);
2051 		kr = KERN_FAILURE;
2052 		goto done;
2053 	}
2054 
2055 	/*
2056 	 * First we do some sanity checking on what dyld has passed us.
2057 	 */
2058 	region_count = uap->region_count;
2059 	link_info_size = uap->link_info_size;
2060 	if (region_count == 0) {
2061 		printf("%s: [%d(%s)]: region_count == 0\n",
2062 		    __func__, proc_getpid(p), p->p_comm);
2063 		kr = KERN_FAILURE;
2064 		goto done;
2065 	}
2066 	if (region_count > MWL_MAX_REGION_COUNT) {
2067 		printf("%s: [%d(%s)]: region_count too big %d\n",
2068 		    __func__, proc_getpid(p), p->p_comm, region_count);
2069 		kr = KERN_FAILURE;
2070 		goto done;
2071 	}
2072 
2073 	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2074 		printf("%s: [%d(%s)]: link_info_size too small\n",
2075 		    __func__, proc_getpid(p), p->p_comm);
2076 		kr = KERN_FAILURE;
2077 		goto done;
2078 	}
2079 	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2080 		printf("%s: [%d(%s)]: link_info_size too big %d\n",
2081 		    __func__, proc_getpid(p), p->p_comm, link_info_size);
2082 		kr = KERN_FAILURE;
2083 		goto done;
2084 	}
2085 
2086 	/*
2087 	 * Allocate and copyin the regions and link info
2088 	 */
2089 	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2090 	if (regions == NULL) {
2091 		printf("%s: [%d(%s)]: failed to allocate regions\n",
2092 		    __func__, proc_getpid(p), p->p_comm);
2093 		kr = KERN_RESOURCE_SHORTAGE;
2094 		goto done;
2095 	}
2096 	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2097 	if (kr != KERN_SUCCESS) {
2098 		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2099 		    __func__, proc_getpid(p), p->p_comm, kr);
2100 		goto done;
2101 	}
2102 
2103 	link_info = kalloc_data(link_info_size, Z_WAITOK);
2104 	if (link_info == NULL) {
2105 		printf("%s: [%d(%s)]: failed to allocate link_info\n",
2106 		    __func__, proc_getpid(p), p->p_comm);
2107 		kr = KERN_RESOURCE_SHORTAGE;
2108 		goto done;
2109 	}
2110 	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2111 	if (kr != KERN_SUCCESS) {
2112 		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2113 		    __func__, proc_getpid(p), p->p_comm, kr);
2114 		goto done;
2115 	}
2116 
2117 	/*
2118 	 * Do some verification of the data structures.
2119 	 */
2120 	info_hdr = (struct mwl_info_hdr *)link_info;
2121 	if (info_hdr->mwli_version != MWL_INFO_VERS) {
2122 		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2123 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2124 		kr = KERN_FAILURE;
2125 		goto done;
2126 	}
2127 
2128 	if (info_hdr->mwli_binds_offset > link_info_size) {
2129 		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2130 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2131 		kr = KERN_FAILURE;
2132 		goto done;
2133 	}
2134 
2135 	/* some older devices have s/w page size > h/w page size, no need to support them */
2136 	if (info_hdr->mwli_page_size != PAGE_SIZE) {
2137 		/* no printf, since this is expected on some devices */
2138 		kr = KERN_INVALID_ARGUMENT;
2139 		goto done;
2140 	}
2141 
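	/*
	 * Note: mwli_binds_offset <= link_info_size was verified above, so
	 * the subtraction in the bounds check below cannot underflow; the
	 * product is computed in 64 bits, so a 32-bit mwli_binds_count
	 * cannot overflow it either.
	 */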
2142 	binds_size = (uint64_t)info_hdr->mwli_binds_count *
2143 	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2144 	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2145 		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2146 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2147 		kr = KERN_FAILURE;
2148 		goto done;
2149 	}
2150 
2151 	if (info_hdr->mwli_chains_offset > link_info_size) {
2152 		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2153 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2154 		kr = KERN_FAILURE;
2155 		goto done;
2156 	}
2157 
2158 
2159 	/*
2160 	 * Ensure the chained-starts info fits in the link info and make
2161 	 * sure the segment info offsets are within bounds.
2162 	 */
2163 	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2164 		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2165 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2166 		kr = KERN_FAILURE;
2167 		goto done;
2168 	}
2169 	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2170 		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2171 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2172 		kr = KERN_FAILURE;
2173 		goto done;
2174 	}
2175 
2176 	/* Note that more verification of offsets is done in the pager itself */
2177 
2178 	/*
2179 	 * Ensure we've only been given one FD and verify valid protections.
2180 	 */
2181 	fd = regions[0].mwlr_fd;
2182 	for (r = 0; r < region_count; ++r) {
2183 		if (regions[r].mwlr_fd != fd) {
2184 			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2185 			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2186 			kr = KERN_FAILURE;
2187 			goto done;
2188 		}
2189 
2190 		/*
2191 		 * Only allow data mappings and not zero fill. Permit TPRO
2192 		 * mappings only when VM_PROT_READ | VM_PROT_WRITE.
2193 		 */
2194 		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2195 			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2196 			    __func__, proc_getpid(p), p->p_comm);
2197 			kr = KERN_FAILURE;
2198 			goto done;
2199 		}
2200 		if (regions[r].mwlr_protections & VM_PROT_ZF) {
2201 			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2202 			    __func__, proc_getpid(p), p->p_comm, r);
2203 			kr = KERN_FAILURE;
2204 			goto done;
2205 		}
2206 		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2207 		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2208 			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2209 			    __func__, proc_getpid(p), p->p_comm, r);
2210 			kr = KERN_FAILURE;
2211 			goto done;
2212 		}
2213 	}
2214 
2215 
2216 	/* get file structure from file descriptor */
2217 	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2218 	if (error) {
2219 		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2220 		    __func__, proc_getpid(p), p->p_comm, error);
2221 		kr = KERN_FAILURE;
2222 		goto done;
2223 	}
2224 
2225 	/* We need at least read permission on the file */
2226 	if (!(fp->fp_glob->fg_flag & FREAD)) {
2227 		printf("%s: [%d(%s)]: not readable\n",
2228 		    __func__, proc_getpid(p), p->p_comm);
2229 		kr = KERN_FAILURE;
2230 		goto done;
2231 	}
2232 
2233 	/* Get the vnode from file structure */
2234 	vp = (struct vnode *)fp_get_data(fp);
2235 	error = vnode_getwithref(vp);
2236 	if (error) {
2237 		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2238 		    __func__, proc_getpid(p), p->p_comm, error);
2239 		kr = KERN_FAILURE;
2240 		vp = NULL; /* just to be sure */
2241 		goto done;
2242 	}
2243 
2244 	/* Make sure the vnode is a regular file */
2245 	if (vp->v_type != VREG) {
2246 		printf("%s: [%d(%s)]: vnode not VREG\n",
2247 		    __func__, proc_getpid(p), p->p_comm);
2248 		kr = KERN_FAILURE;
2249 		goto done;
2250 	}
2251 
2252 	/* get vnode size */
2253 	error = vnode_size(vp, &fs, vfs_context_current());
2254 	if (error) {
2255 		goto done;
2256 	}
2257 	file_size = fs;
2258 
2259 	/* get the file's memory object handle */
2260 	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2261 	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2262 		printf("%s: [%d(%s)]: no memory object\n",
2263 		    __func__, proc_getpid(p), p->p_comm);
2264 		kr = KERN_FAILURE;
2265 		goto done;
2266 	}
2267 
2268 	for (r = 0; r < region_count; ++r) {
2269 		rp = &regions[r];
2270 
2271 #if CONFIG_MACF
2272 		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2273 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2274 		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2275 		if (error) {
2276 			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2277 			    __func__, proc_getpid(p), p->p_comm, r, error);
2278 			kr = KERN_FAILURE;
2279 			goto done;
2280 		}
2281 #endif /* MAC */
2282 
2283 		/* check that the mappings are properly covered by code signatures */
2284 		if (cs_system_enforcement()) {
2285 			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2286 				printf("%s: [%d(%s)]: region %d, not code signed\n",
2287 				    __func__, proc_getpid(p), p->p_comm, r);
2288 				kr = KERN_FAILURE;
2289 				goto done;
2290 			}
2291 		}
2292 	}
2293 
2294 	/* update the vnode's access time */
2295 	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2296 		VATTR_INIT(&va);
2297 		nanotime(&va.va_access_time);
2298 		VATTR_SET_ACTIVE(&va, va_access_time);
2299 		vnode_setattr(vp, &va, vfs_context_current());
2300 	}
2301 
2302 	/* get the VM to do the work */
2303 	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2304 
2305 done:
2306 	if (fp != NULL) {
2307 		/* release the file descriptor */
2308 		fp_drop(p, fd, fp, 0);
2309 	}
2310 	if (vp != NULL) {
2311 		(void)vnode_put(vp);
2312 	}
2313 	if (regions != NULL) {
2314 		kfree_data(regions, region_count * sizeof(regions[0]));
2315 	}
2316 	/* on success, the pager took ownership of link_info and set it to NULL */
2317 	if (link_info != NULL) {
2318 		kfree_data(link_info, link_info_size);
2319 	}
2320 
2321 	switch (kr) {
2322 	case KERN_SUCCESS:
2323 		return 0;
2324 	case KERN_RESOURCE_SHORTAGE:
2325 		return ENOMEM;
2326 	default:
2327 		return EINVAL;
2328 	}
2329 }
2330 
2331 #if DEBUG || DEVELOPMENT
2332 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2333     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2334 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2335     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2336 #endif /* DEBUG || DEVELOPMENT */
2337 
2338 /* sysctl overflow room */
2339 
2340 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2341     (int *) &page_size, 0, "vm page size");
2342 
2343 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2344  * allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2345  * reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2346 extern unsigned int     vm_page_free_target;
2347 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2348     &vm_page_free_target, 0, "Pageout daemon free target");
2349 
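/*
 * Userspace sketch of the calculation described above, using standard
 * sysctlbyname(3) (error handling elided; "vm.page_free_count" is published
 * further below in this file):
 *
 *	unsigned int free_count = 0, free_target = 0;
 *	size_t len = sizeof(free_count);
 *	sysctlbyname("vm.page_free_count", &free_count, &len, NULL, 0);
 *	len = sizeof(free_target);
 *	sysctlbyname("vm.vm_page_free_target", &free_target, &len, NULL, 0);
 *	long headroom_pages = (long)free_count - (long)free_target;
 */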
2350 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2351     &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2352 
2353 static int
2354 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2355 {
2356 #pragma unused(oidp, arg1, arg2)
2357 	unsigned int page_free_wanted;
2358 
2359 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2360 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2361 }
2362 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2363     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2364     0, 0, vm_ctl_page_free_wanted, "I", "");
2365 
2366 extern unsigned int     vm_page_purgeable_count;
2367 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2368     &vm_page_purgeable_count, 0, "Purgeable page count");
2369 
2370 extern unsigned int     vm_page_purgeable_wired_count;
2371 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2372     &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2373 
2374 extern unsigned int vm_page_kern_lpage_count;
2375 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2376     &vm_page_kern_lpage_count, 0, "kernel used large pages");
2377 
2378 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2379 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2380 
2381 #if DEVELOPMENT || DEBUG
2382 #if __ARM_MIXED_PAGE_SIZE__
2383 static int vm_mixed_pagesize_supported = 1;
2384 #else
2385 static int vm_mixed_pagesize_supported = 0;
2386 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2387 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2388     &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2389 
2390 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2391     &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2392 
2393 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2394     &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2395 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2396     &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2397 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2398     &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2399 
2400 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2401     &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2402 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2403     &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2404 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2405     &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated) */
2406 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2407     &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2408 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2409     &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2410 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2411     &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");         /* sum of next two */
2412 #endif /* DEVELOPMENT || DEBUG */
2413 
2414 extern int madvise_free_debug;
2415 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2416     &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2417 extern int madvise_free_debug_sometimes;
2418 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2419     &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
2420 
2421 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2422     &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2423 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2424     &vm_page_stats_reusable.reusable_pages_success, "");
2425 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2426     &vm_page_stats_reusable.reusable_pages_failure, "");
2427 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2428     &vm_page_stats_reusable.reusable_pages_shared, "");
2429 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2430     &vm_page_stats_reusable.all_reusable_calls, "");
2431 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2432     &vm_page_stats_reusable.partial_reusable_calls, "");
2433 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2434     &vm_page_stats_reusable.reuse_pages_success, "");
2435 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2436     &vm_page_stats_reusable.reuse_pages_failure, "");
2437 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2438     &vm_page_stats_reusable.all_reuse_calls, "");
2439 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2440     &vm_page_stats_reusable.partial_reuse_calls, "");
2441 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2442     &vm_page_stats_reusable.can_reuse_success, "");
2443 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2444     &vm_page_stats_reusable.can_reuse_failure, "");
2445 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2446     &vm_page_stats_reusable.reusable_reclaimed, "");
2447 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2448     &vm_page_stats_reusable.reusable_nonwritable, "");
2449 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2450     &vm_page_stats_reusable.reusable_shared, "");
2451 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2452     &vm_page_stats_reusable.free_shared, "");
2453 
2454 
2455 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2456 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2457 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2458 
2459 extern unsigned int vm_page_cleaned_count;
2460 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2461 
2462 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2463 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2464 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2465 
2466 /* pageout counts */
2467 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2468 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2469 
2470 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2471 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2472 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2473 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2474 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2475 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2476 
2477 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2478 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2479 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2480 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2481 extern unsigned int vm_page_realtime_count;
2482 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2483 extern int vm_pageout_protect_realtime;
2484 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2485 
2486 /* counts of pages prefaulted when entering a memory object */
2487 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2488 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2489 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2490 
2491 #if defined (__x86_64__)
2492 extern unsigned int vm_clump_promote_threshold;
2493 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2494 #if DEVELOPMENT || DEBUG
2495 extern unsigned long vm_clump_stats[];
2496 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2497 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2498 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2499 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2500 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2501 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2502 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2503 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2504 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2505 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2506 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2507 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2508 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2509 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2510 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2511 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2512 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2513 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2514 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2515 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2516 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2517 #endif  /* if DEVELOPMENT || DEBUG */
2518 #endif  /* #if defined (__x86_64__) */
2519 
2520 #if CONFIG_SECLUDED_MEMORY
2521 
2522 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2523 extern unsigned int vm_page_secluded_target;
2524 extern unsigned int vm_page_secluded_count;
2525 extern unsigned int vm_page_secluded_count_free;
2526 extern unsigned int vm_page_secluded_count_inuse;
2527 extern unsigned int vm_page_secluded_count_over_target;
2528 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2529 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2530 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2531 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2532 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2533 
2534 extern struct vm_page_secluded_data vm_page_secluded;
2535 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2536 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2537 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2538 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2539 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2540 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2541 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2542 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2543 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2544 
2545 #endif /* CONFIG_SECLUDED_MEMORY */
2546 
2547 #if CONFIG_DEFERRED_RECLAIM
2548 #pragma mark Deferred Reclaim
2549 SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
2550 #if DEVELOPMENT || DEBUG
2551 /*
2552  * VM reclaim testing
2553  */
2554 extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);
2555 
2556 static int
2557 sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
2558 {
2559 	int error = EINVAL, pid = 0;
2560 	/*
2561 	 * Only wait on write
2562 	 */
2563 	error = sysctl_handle_int(oidp, &pid, 0, req);
2564 	if (error || !req->newptr) {
2565 		return error;
2566 	}
2567 	if (pid <= 0) {
2568 		return EINVAL;
2569 	}
2570 	proc_t p = proc_find(pid);
2571 	if (p == PROC_NULL) {
2572 		return ESRCH;
2573 	}
2574 	task_t t = proc_task(p);
2575 	if (t == TASK_NULL) {
2576 		proc_rele(p);
2577 		return ESRCH;
2578 	}
2579 	task_reference(t);
2580 	proc_rele(p);
2581 
2582 	bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
2583 	if (success) {
2584 		error = 0;
2585 	}
2586 	task_deallocate(t);
2587 
2588 	return error;
2589 }
2590 
2591 SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
2592     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2593     &sysctl_vm_reclaim_wait_for_pid, "I",
2594     "Block until the given pid has been drained by kernel GC");
2595 
2596 static int
2597 sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
2598 {
2599 	int error = EINVAL;
2600 	kern_return_t kr;
2601 	pid_t pid;
2602 	error = sysctl_handle_int(oidp, &pid, 0, req);
2603 	/* Only reclaim on write */
2604 	if (error || !req->newptr) {
2605 		return error;
2606 	}
2607 	if (pid <= 0) {
2608 		return EINVAL;
2609 	}
2610 	proc_t p = proc_find(pid);
2611 	if (p == PROC_NULL) {
2612 		return ESRCH;
2613 	}
2614 	task_t t = proc_task(p);
2615 	if (t == TASK_NULL) {
2616 		proc_rele(p);
2617 		return ESRCH;
2618 	}
2619 	task_reference(t);
2620 	proc_rele(p);
2621 	kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
2622 	task_deallocate(t);
2623 	return mach_to_bsd_errno(kr);
2624 }
2625 
2626 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
2627     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2628     &sysctl_vm_reclaim_drain_pid, "I",
2629     "Drain the deferred reclamation buffer for a pid");
2630 
2631 static int
2632 proc_filter_reclaimable(proc_t p, __unused void *arg)
2633 {
2634 	task_t task = proc_task(p);
2635 	return vm_deferred_reclamation_task_has_ring(task);
2636 }
2637 
2638 static int
2639 proc_reclaim_drain(proc_t p, __unused void *arg)
2640 {
2641 	kern_return_t kr;
2642 	task_t task = proc_task(p);
2643 	kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
2644 	return mach_to_bsd_errno(kr);
2645 }
2646 
2647 static int
2648 sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
2649 {
2650 	int error;
2651 	int val;
2652 	if (!req->newptr) {
2653 		return EINVAL;
2654 	}
2655 	error = sysctl_handle_int(oidp, &val, 0, req);
2656 	if (error || val == FALSE) {
2657 		return error;
2658 	}
2659 	proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
2660 	    proc_filter_reclaimable, NULL);
2661 	return 0;
2662 }
2663 
2664 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
2665     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2666     &sysctl_vm_reclaim_drain_all, "I",
2667     "Fully reclaim from every deferred reclamation buffer on the system");
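/*
 * Example usage of the three debug knobs above (DEVELOPMENT/DEBUG kernels;
 * writes require privilege since CTLFLAG_ANYBODY is not set):
 *	sysctl vm.reclaim.wait_for_pid=1234   # block until pid 1234 is reclaimed
 *	sysctl vm.reclaim.drain_pid=1234      # synchronously drain one ring
 *	sysctl vm.reclaim.drain_all=1         # drain every reclaimable task
 */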
2668 
2669 extern uint32_t vm_reclaim_buffer_count;
2670 extern uint64_t vm_reclaim_gc_epoch;
2671 extern uint64_t vm_reclaim_gc_reclaim_count;
2672 #if XNU_TARGET_OS_IOS
2673 extern uint64_t vm_reclaim_max_threshold;
2674 #else /* !XNU_TARGET_OS_IOS */
2675 extern bool vm_reclaim_debug;
2676 extern bool vm_reclaim_enabled;
2677 extern uint64_t vm_reclaim_sampling_period_ns;
2678 extern uint64_t vm_reclaim_sampling_period_abs;
2679 extern uint32_t vm_reclaim_autotrim_pct_normal;
2680 extern uint32_t vm_reclaim_autotrim_pct_pressure;
2681 extern uint32_t vm_reclaim_autotrim_pct_critical;
2682 extern uint32_t vm_reclaim_wma_weight_base;
2683 extern uint32_t vm_reclaim_wma_weight_cur;
2684 extern uint32_t vm_reclaim_wma_denom;
2685 extern uint64_t vm_reclaim_abandonment_threshold;
2686 #endif /* XNU_TARGET_OS_IOS */
2687 
2688 SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
2689     CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
2690     "The number of deferred memory buffers currently alive");
2691 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
2692     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
2693     "Number of times the global GC thread has run");
2694 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
2695     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
2696     "Number of times the global GC thread has reclaimed from a buffer");
2697 #if XNU_TARGET_OS_IOS
2698 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, max_threshold,
2699     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_max_threshold,
2700     "Maximum amount of virtual memory (in B) that may be deferred without "
2701     "synchronous reclamation");
2702 #else /* !XNU_TARGET_OS_IOS */
2703 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
2704     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
2705     "Whether deferred memory reclamation is enabled on this system");
2706 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
2707     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
2708     "Whether vm.reclaim debug logs are enabled");
2709 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
2710     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
2711     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2712     "to engage auto-trim when the system is operating normally");
2713 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
2714     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
2715     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2716     "to engage auto-trim when the system is under memory pressure");
2717 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
2718     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
2719     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2720     "to engage auto-trim when the system is under critical memory pressure");
2721 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
2722     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
2723     "Weight applied to historical minimum buffer size samples");
2724 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
2725     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
2726     "Weight applied to current sampled minimum buffer size");
2727 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
2728     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
2729     "Denominator for weighted moving average calculation");
2730 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
2731     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
2732     "The number of sampling periods between accounting updates that may elapse "
2733     "before the buffer is considered \"abandoned\"");
2734 
2735 static int
2736 sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
2737 {
2738 	uint64_t new_val_ns;
2739 	uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
2740 	int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
2741 	    sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
2742 	if (err || !req->newptr) {
2743 		return err;
2744 	}
2745 	if (new_val_ns != old_val_ns) {
2746 		vm_reclaim_sampling_period_ns = new_val_ns;
2747 		nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
2748 	}
2749 	return 0;
2750 }
2751 
2752 SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
2753     CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "I",
2754     "Interval (nanoseconds) at which to sample the minimum buffer size and "
2755     "consider trimming excess");
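/*
 * Example: lower the trim sampling interval to 100ms on a non-iOS
 * DEVELOPMENT/DEBUG kernel:
 *	sysctl vm.reclaim.sampling_period_ns=100000000
 * The handler above converts the new period to mach absolute time once, at
 * set time, so the sampling paths never pay for nanoseconds_to_absolutetime().
 */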
2756 #endif /* XNU_TARGET_OS_IOS */
2757 #endif /* DEVELOPMENT || DEBUG */
2758 #endif /* CONFIG_DEFERRED_RECLAIM */
2759 
2760 #include <kern/thread.h>
2761 #include <sys/user.h>
2762 
2763 void vm_pageout_io_throttle(void);
2764 
2765 void
2766 vm_pageout_io_throttle(void)
2767 {
2768 	struct uthread *uthread = current_uthread();
2769 
2770 	/*
2771 	 * The thread is marked as a low-priority I/O type,
2772 	 * and the I/O issued during this cleaning operation
2773 	 * collided with normal I/O operations... delay here
2774 	 * to mitigate this task's impact on the normal
2775 	 * operation of the system.
2776 	 */
2777 
2778 	if (uthread->uu_lowpri_window) {
2779 		throttle_lowpri_io(1);
2780 	}
2781 }
2782 
2783 int
2784 vm_pressure_monitor(
2785 	__unused struct proc *p,
2786 	struct vm_pressure_monitor_args *uap,
2787 	int *retval)
2788 {
2789 	kern_return_t   kr;
2790 	uint32_t        pages_reclaimed;
2791 	uint32_t        pages_wanted;
2792 
2793 	kr = mach_vm_pressure_monitor(
2794 		(boolean_t) uap->wait_for_pressure,
2795 		uap->nsecs_monitored,
2796 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2797 		&pages_wanted);
2798 
2799 	switch (kr) {
2800 	case KERN_SUCCESS:
2801 		break;
2802 	case KERN_ABORTED:
2803 		return EINTR;
2804 	default:
2805 		return EINVAL;
2806 	}
2807 
2808 	if (uap->pages_reclaimed) {
2809 		if (copyout((void *)&pages_reclaimed,
2810 		    uap->pages_reclaimed,
2811 		    sizeof(pages_reclaimed)) != 0) {
2812 			return EFAULT;
2813 		}
2814 	}
2815 
2816 	*retval = (int) pages_wanted;
2817 	return 0;
2818 }
2819 
2820 int
2821 kas_info(struct proc *p,
2822     struct kas_info_args *uap,
2823     int *retval __unused)
2824 {
2825 #ifndef CONFIG_KAS_INFO
2826 	(void)p;
2827 	(void)uap;
2828 	return ENOTSUP;
2829 #else /* CONFIG_KAS_INFO */
2830 	int                     selector = uap->selector;
2831 	user_addr_t     valuep = uap->value;
2832 	user_addr_t     sizep = uap->size;
2833 	user_size_t size, rsize;
2834 	int                     error;
2835 
2836 	if (!kauth_cred_issuser(kauth_cred_get())) {
2837 		return EPERM;
2838 	}
2839 
2840 #if CONFIG_MACF
2841 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
2842 	if (error) {
2843 		return error;
2844 	}
2845 #endif
2846 
2847 	if (IS_64BIT_PROCESS(p)) {
2848 		user64_size_t size64;
2849 		error = copyin(sizep, &size64, sizeof(size64));
2850 		size = (user_size_t)size64;
2851 	} else {
2852 		user32_size_t size32;
2853 		error = copyin(sizep, &size32, sizeof(size32));
2854 		size = (user_size_t)size32;
2855 	}
2856 	if (error) {
2857 		return error;
2858 	}
2859 
2860 	switch (selector) {
2861 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2862 	{
2863 		uint64_t slide = vm_kernel_slide;
2864 
2865 		if (sizeof(slide) != size) {
2866 			return EINVAL;
2867 		}
2868 
2869 		error = copyout(&slide, valuep, sizeof(slide));
2870 		if (error) {
2871 			return error;
2872 		}
2873 		rsize = size;
2874 	}
2875 	break;
2876 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2877 	{
2878 		uint32_t i;
2879 		kernel_mach_header_t *mh = &_mh_execute_header;
2880 		struct load_command *cmd;
2881 		cmd = (struct load_command*) &mh[1];
2882 		uint64_t *bases;
2883 		rsize = mh->ncmds * sizeof(uint64_t);
2884 
2885 		/*
2886 		 * Return the size if no data was passed
2887 		 */
2888 		if (valuep == 0) {
2889 			break;
2890 		}
2891 
2892 		if (rsize > size) {
2893 			return EINVAL;
2894 		}
2895 
2896 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
2897 
2898 		for (i = 0; i < mh->ncmds; i++) {
2899 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
2900 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
2901 				bases[i] = (uint64_t)sg->vmaddr;
2902 			}
2903 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
2904 		}
2905 
2906 		error = copyout(bases, valuep, rsize);
2907 
2908 		kfree_data(bases, rsize);
2909 
2910 		if (error) {
2911 			return error;
2912 		}
2913 	}
2914 	break;
2915 	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
2916 	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
2917 	{
2918 #if CONFIG_SPTM
2919 		const uint64_t slide =
2920 		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
2921 #else
2922 		const uint64_t slide = 0;
2923 #endif
2924 
2925 		if (sizeof(slide) != size) {
2926 			return EINVAL;
2927 		}
2928 
2929 		error = copyout(&slide, valuep, sizeof(slide));
2930 		if (error) {
2931 			return error;
2932 		}
2933 		rsize = size;
2934 	}
2935 	break;
2936 	default:
2937 		return EINVAL;
2938 	}
2939 
2940 	if (IS_64BIT_PROCESS(p)) {
2941 		user64_size_t size64 = (user64_size_t)rsize;
2942 		error = copyout(&size64, sizep, sizeof(size64));
2943 	} else {
2944 		user32_size_t size32 = (user32_size_t)rsize;
2945 		error = copyout(&size32, sizep, sizeof(size32));
2946 	}
2947 
2948 	return error;
2949 #endif /* CONFIG_KAS_INFO */
2950 }
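/*
 * Userspace sketch (kas_info(2) is declared in <sys/kas_info.h>; the caller
 * must be root, per the kauth check above):
 *
 *	uint64_t slide = 0;
 *	size_t size = sizeof(slide);
 *	if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) == 0)
 *		printf("kernel text slide: 0x%llx\n", slide);
 */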
2951 
2952 #pragma clang diagnostic push
2953 #pragma clang diagnostic ignored "-Wcast-qual"
2954 #pragma clang diagnostic ignored "-Wunused-function"
2955 
2956 static void
2957 asserts()
2958 {
2959 	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
2960 	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
2961 }
2962 
2963 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
2964 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
2965 #pragma clang diagnostic pop
2966 
2967 extern uint32_t vm_page_pages;
2968 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
2969 
2970 extern uint32_t vm_page_busy_absent_skipped;
2971 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
2972 
2973 extern uint32_t vm_page_upl_tainted;
2974 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
2975 
2976 extern uint32_t vm_page_iopl_tainted;
2977 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
2978 
2979 #if __arm64__ && (DEVELOPMENT || DEBUG)
2980 extern int vm_footprint_suspend_allowed;
2981 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
2982 
2983 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
2984 static int
2985 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
2986 {
2987 #pragma unused(oidp, arg1, arg2)
2988 	int error = 0;
2989 	int new_value;
2990 
2991 	if (req->newptr == USER_ADDR_NULL) {
2992 		return 0;
2993 	}
2994 	error = SYSCTL_IN(req, &new_value, sizeof(int));
2995 	if (error) {
2996 		return error;
2997 	}
2998 	if (!vm_footprint_suspend_allowed) {
2999 		if (new_value != 0) {
3000 			/* suspends are not allowed... */
3001 			return 0;
3002 		}
3003 		/* ... but let resumes proceed */
3004 	}
3005 	DTRACE_VM2(footprint_suspend,
3006 	    vm_map_t, current_map(),
3007 	    int, new_value);
3008 
3009 	pmap_footprint_suspend(current_map(), new_value);
3010 
3011 	return 0;
3012 }
3013 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
3014     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3015     0, 0, &sysctl_vm_footprint_suspend, "I", "");
3016 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
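
/*
 * Hedged usage sketch (userspace test code): the handler above is
 * write-only and CTLFLAG_ANYBODY, so on a DEVELOPMENT/DEBUG arm64 kernel
 * a test can bracket a workload like this:
 *
 *	int on = 1, off = 0;
 *	sysctlbyname("vm.footprint_suspend", NULL, NULL, &on, sizeof(on));
 *	// ... run code whose footprint should not be charged ...
 *	sysctlbyname("vm.footprint_suspend", NULL, NULL, &off, sizeof(off));
 *
 * Note the handler silently ignores suspend requests unless
 * vm_footprint_suspend_allowed is set, while resumes always proceed.
 */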
3017 
3018 extern uint64_t vm_map_corpse_footprint_count;
3019 extern uint64_t vm_map_corpse_footprint_size_avg;
3020 extern uint64_t vm_map_corpse_footprint_size_max;
3021 extern uint64_t vm_map_corpse_footprint_full;
3022 extern uint64_t vm_map_corpse_footprint_no_buf;
3023 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
3024     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
3025 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
3026     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
3027 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
3028     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
3029 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
3030     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
3031 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
3032     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
3033 
3034 #if CODE_SIGNING_MONITOR
3035 extern uint64_t vm_cs_defer_to_csm;
3036 extern uint64_t vm_cs_defer_to_csm_not;
3037 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
3038     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
3039 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
3040     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
3041 #endif /* CODE_SIGNING_MONITOR */
3042 
3043 extern uint64_t shared_region_pager_copied;
3044 extern uint64_t shared_region_pager_slid;
3045 extern uint64_t shared_region_pager_slid_error;
3046 extern uint64_t shared_region_pager_reclaimed;
3047 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
3048     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
3049 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
3050     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
3051 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
3052     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
3053 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
3054     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
3055 extern int shared_region_destroy_delay;
3056 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
3057     CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
3058 
3059 #if MACH_ASSERT
3060 extern int pmap_ledgers_panic_leeway;
3061 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
3062 #endif /* MACH_ASSERT */
3063 
3064 
3065 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
3066 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
3067 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
3068 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
3069 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
3070 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
3071 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
3072 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
3073 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
3074 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
3075 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
3076 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
3077 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
3078 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
3079     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
3080 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
3081     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
3082 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
3083     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
3084 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
3085     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
3086 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
3087     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
3088 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
3089     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
3090 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
3091     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
3092 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
3093     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
3094 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
3095     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
3096 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
3097     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
3098 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
3099     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
3100 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
3101     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
3102 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
3103     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
3104 
3105 extern int vm_protect_privileged_from_untrusted;
3106 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
3107     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
3108 extern uint64_t vm_copied_on_read;
3109 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
3110     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
3111 
3112 extern int vm_shared_region_count;
3113 extern int vm_shared_region_peak;
3114 SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
3115     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
3116 SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
3117     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
3118 #if DEVELOPMENT || DEBUG
3119 extern unsigned int shared_region_pagers_resident_count;
3120 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
3121     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
3122 extern unsigned int shared_region_pagers_resident_peak;
3123 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
3124     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
3125 extern int shared_region_pager_count;
3126 SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
3127     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
3128 #if __has_feature(ptrauth_calls)
3129 extern int shared_region_key_count;
3130 SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
3131     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
3132 extern int vm_shared_region_reslide_count;
3133 SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
3134     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
3135 #endif /* __has_feature(ptrauth_calls) */
3136 #endif /* DEVELOPMENT || DEBUG */
3137 
3138 #if MACH_ASSERT
3139 extern int debug4k_filter;
3140 SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
3141 extern int debug4k_panic_on_terminate;
3142 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
3143 extern int debug4k_panic_on_exception;
3144 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
3145 extern int debug4k_panic_on_misaligned_sharing;
3146 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
3147 #endif /* MACH_ASSERT */
3148 
3149 extern uint64_t vm_map_set_size_limit_count;
3150 extern uint64_t vm_map_set_data_limit_count;
3151 extern uint64_t vm_map_enter_RLIMIT_AS_count;
3152 extern uint64_t vm_map_enter_RLIMIT_DATA_count;
3153 SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
3154 SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
3155 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
3156 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
3157 
3158 extern uint64_t vm_fault_resilient_media_initiate;
3159 extern uint64_t vm_fault_resilient_media_retry;
3160 extern uint64_t vm_fault_resilient_media_proceed;
3161 extern uint64_t vm_fault_resilient_media_release;
3162 extern uint64_t vm_fault_resilient_media_abort1;
3163 extern uint64_t vm_fault_resilient_media_abort2;
3164 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
3165 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
3166 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
3167 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
3168 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
3169 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
3170 #if MACH_ASSERT
3171 extern int vm_fault_resilient_media_inject_error1_rate;
3172 extern int vm_fault_resilient_media_inject_error1;
3173 extern int vm_fault_resilient_media_inject_error2_rate;
3174 extern int vm_fault_resilient_media_inject_error2;
3175 extern int vm_fault_resilient_media_inject_error3_rate;
3176 extern int vm_fault_resilient_media_inject_error3;
3177 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
3178 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
3179 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
3180 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
3181 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
3182 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
3183 #endif /* MACH_ASSERT */
3184 
3185 extern uint64_t pmap_query_page_info_retries;
3186 SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
3187 
3188 /*
3189  * A sysctl which causes all existing shared regions to become stale. They
3190  * will no longer be used by anything new and will be torn down as soon as
3191  * the last existing user exits. A write of non-zero value causes that to happen.
3192  * This should only be used by launchd, so we check that this is initproc.
3193  */
3194 static int
shared_region_pivot(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3195 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3196 {
3197 	unsigned int value = 0;
3198 	int changed = 0;
3199 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3200 	if (error || !changed) {
3201 		return error;
3202 	}
3203 	if (current_proc() != initproc) {
3204 		return EPERM;
3205 	}
3206 
3207 	vm_shared_region_pivot();
3208 
3209 	return 0;
3210 }
3211 
3212 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
3213     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
3214     0, 0, shared_region_pivot, "I", "");
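
/*
 * Hedged usage sketch: only initproc (launchd) may trigger a pivot, and
 * any non-zero int write does it:
 *
 *	int one = 1;
 *	sysctlbyname("vm.shared_region_pivot", NULL, NULL, &one, sizeof(one));
 *
 * Any other process gets EPERM once the handler sees a changed value.
 */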
3215 
3216 extern uint64_t vm_object_shadow_forced;
3217 extern uint64_t vm_object_shadow_skipped;
3218 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
3219     &vm_object_shadow_forced, "");
3220 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
3221     &vm_object_shadow_skipped, "");
3222 
3223 
3224 
3225 SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
3226     &vmtc_total, 0, "total text page corruptions detected");
3227 
3228 
3229 #if DEBUG || DEVELOPMENT
3230 /*
3231  * A sysctl that can be used to corrupt a text page with an illegal instruction.
3232  * Used for testing text page self healing.
3233  */
3234 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
3235 static int
3236 corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3237 {
3238 	uint64_t value = 0;
3239 	int error = sysctl_handle_quad(oidp, &value, 0, req);
3240 	if (error || !req->newptr) {
3241 		return error;
3242 	}
3243 
3244 	if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
3245 		return 0;
3246 	} else {
3247 		return EINVAL;
3248 	}
3249 }
3250 
3251 SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
3252     CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3253     0, 0, corrupt_text_addr, "-", "");
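
/*
 * Hedged usage sketch: the written quad is interpreted as a kernel text
 * address, so a self-healing test would pass an address it obtained
 * elsewhere (left elided here):
 *
 *	uint64_t addr = ...;	// kernel text address under test
 *	sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &addr, sizeof(addr));
 */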
3254 #endif /* DEBUG || DEVELOPMENT */
3255 
3256 #if CONFIG_MAP_RANGES
3257 /*
3258  * vm.malloc_ranges
3259  *
3260  * space-separated list of <left:right> hexadecimal addresses.
3261  */
3262 static int
3263 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
3264 {
3265 	vm_map_t map = current_map();
3266 	struct mach_vm_range r1, r2;
3267 	char str[20 * 4];
3268 	int len;
3269 	mach_vm_offset_t right_hole_max;
3270 
3271 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
3272 		return ENOENT;
3273 	}
3274 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
3275 		return ENOENT;
3276 	}
3277 
3278 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
3279 	right_hole_max = MACH_VM_JUMBO_ADDRESS;
3280 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
3281 	right_hole_max = get_map_max(map);
3282 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
3283 
3284 	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
3285 	    r1.max_address, r2.min_address,
3286 	    r2.max_address, right_hole_max);
3287 
3288 	return SYSCTL_OUT(req, str, len);
3289 }
3290 
3291 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
3292     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3293     0, 0, &vm_map_malloc_ranges, "A", "");
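
/*
 * Hedged usage sketch: the OID is a read-only string, e.g.
 *
 *	char ranges[80];
 *	size_t len = sizeof(ranges);
 *	if (sysctlbyname("vm.malloc_ranges", ranges, &len, NULL, 0) == 0) {
 *		// "left:right left:right" -- the two usable hexadecimal holes
 *	}
 */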
3294 
3295 #if DEBUG || DEVELOPMENT
3296 static int
3297 vm_map_user_range_default SYSCTL_HANDLER_ARGS
3298 {
3299 #pragma unused(arg1, arg2, oidp)
3300 	struct mach_vm_range range;
3301 
3302 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
3303 	    != KERN_SUCCESS) {
3304 		return EINVAL;
3305 	}
3306 
3307 	return SYSCTL_OUT(req, &range, sizeof(range));
3308 }
3309 
3310 static int
3311 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3312 {
3313 #pragma unused(arg1, arg2, oidp)
3314 	struct mach_vm_range range;
3315 
3316 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3317 	    != KERN_SUCCESS) {
3318 		return EINVAL;
3319 	}
3320 
3321 	return SYSCTL_OUT(req, &range, sizeof(range));
3322 }
3323 
3324 static int
3325 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3326 {
3327 #pragma unused(arg1, arg2, oidp)
3328 	struct mach_vm_range range;
3329 
3330 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3331 	    != KERN_SUCCESS) {
3332 		return EINVAL;
3333 	}
3334 
3335 	return SYSCTL_OUT(req, &range, sizeof(range));
3336 }
3337 
3338 /*
3339  * A sysctl that can be used to return ranges for the current VM map.
3340  * Used for testing VM ranges.
3341  */
3342 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3343     0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
3344 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3345     0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
3346 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3347     0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
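
/*
 * Hedged usage sketch: each OID above returns a struct mach_vm_range for
 * the calling process's own map:
 *
 *	struct mach_vm_range r;
 *	size_t len = sizeof(r);
 *	if (sysctlbyname("vm.vm_map_user_range_heap", &r, &len, NULL, 0) == 0) {
 *		// r.min_address .. r.max_address bounds this process's heap range
 *	}
 */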
3348 
3349 #endif /* DEBUG || DEVELOPMENT */
3350 #endif /* CONFIG_MAP_RANGES */
3351 
3354 
3355 extern uint64_t vm_map_range_overflows_count;
3356 SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
3357 extern boolean_t vm_map_range_overflows_log;
3358 SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3359 
3360 extern uint64_t c_seg_filled_no_contention;
3361 extern uint64_t c_seg_filled_contention;
3362 extern clock_sec_t c_seg_filled_contention_sec_max;
3363 extern clock_nsec_t c_seg_filled_contention_nsec_max;
3364 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
3365 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
3366 SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
3367 SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
3368 #if (XNU_TARGET_OS_OSX && __arm64__)
3369 extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
3370 extern int c_process_major_yield_after; /* yield after moving ? segments */
3371 extern uint64_t c_process_major_reports;
3372 extern clock_sec_t c_process_major_max_sec;
3373 extern clock_nsec_t c_process_major_max_nsec;
3374 extern uint32_t c_process_major_peak_segcount;
3375 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
3376 SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
3377 SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
3378 SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
3379 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
3380 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
3381 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3382 
3383 #if DEVELOPMENT || DEBUG
3384 extern int panic_object_not_alive;
3385 SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
3386 #endif /* DEVELOPMENT || DEBUG */
3387 
3388 #if FBDP_DEBUG_OBJECT_NO_PAGER
3389 extern int fbdp_no_panic;
3390 SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
3391 #endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
3392 
3393 extern uint64_t cluster_direct_write_wired;
3394 SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");
3395 
3396 extern uint64_t vm_object_pageout_not_on_queue;
3397 extern uint64_t vm_object_pageout_not_pageable;
3398 extern uint64_t vm_object_pageout_pageable;
3399 extern uint64_t vm_object_pageout_active_local;
3400 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_on_queue, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_on_queue, "");
3401 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_pageable, "");
3402 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_pageable, "");
3403 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_active_local, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_active_local, "");
3404 
3405 
3406 #if DEVELOPMENT || DEBUG
3407 
3408 static uint32_t
3409 sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
3410 {
3411 #pragma unused(with_data)
3412 	return VM_C_SEGMENT_INFO_MAGIC;
3413 }
3414 
3415 /* The largest possible single segment + its slots is
3416  * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
3417 #define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)
3418 
3419 extern uint32_t c_segments_available;
3420 
3421 struct sysctl_buf_header {
3422 	uint32_t magic;
3423 } __attribute__((packed));
3424 
3425 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3426  * Instead of doing everything here, the work is delegated to a helper in vm_compressor.c. */
3427 static int
3428 sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
3429 {
3430 	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3431 	if (!buf) {
3432 		return ENOMEM;
3433 	}
3434 	size_t offset = 0;
3435 	int error = 0;
3436 	int segno = 0;
3437 	/* 4-byte header identifying the version of the data format.
3438 	 * The magic should be bumped whenever c_segment_info or c_slot_info changes. */
3439 	((struct sysctl_buf_header*)buf)->magic = sysctl_compressor_seg_magic(with_data);
3440 	offset += sizeof(uint32_t);
3441 
3442 	while (segno < c_segments_available) {
3443 		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3444 		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
3445 		if (kr == KERN_NO_SPACE) {
3446 			/* failed to add another segment, push the current buffer out and try again */
3447 			if (offset == 0) {
3448 				error = EINVAL; /* no space even with an empty buffer; should not happen */
3449 				goto out;
3450 			}
3451 			/* write out chunk */
3452 			error = SYSCTL_OUT(req, buf, offset);
3453 			if (error) {
3454 				goto out;
3455 			}
3456 			offset = 0;
3457 			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3458 			/* don't increment segno, need to try again saving the current one */
3459 		} else if (kr != KERN_SUCCESS) {
3460 			error = EINVAL;
3461 			goto out;
3462 		} else {
3463 			offset += left_sz;
3464 			++segno;
3465 			assert(offset <= SYSCTL_SEG_BUF_SIZE);
3466 		}
3467 	}
3468 
3469 	if (offset > 0) { /* write last chunk */
3470 		error = SYSCTL_OUT(req, buf, offset);
3471 	}
3472 
3473 out:
3474 	kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
3475 	return error;
3476 }
3477 
3478 static int
3479 sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3480 {
3481 	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
3482 }
3483 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
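
/*
 * Hedged consumer sketch: the stream begins with the 4-byte magic written
 * above and is produced in buffer-sized chunks, so a reader can size-probe
 * first and validate the header before parsing segment records (segment
 * population may change between the two calls):
 *
 *	size_t len = 0;
 *	sysctlbyname("vm.compressor_segments", NULL, &len, NULL, 0);	// size probe
 *	char *buf = malloc(len);
 *	if (buf != NULL &&
 *	    sysctlbyname("vm.compressor_segments", buf, &len, NULL, 0) == 0) {
 *		uint32_t magic;
 *		memcpy(&magic, buf, sizeof(magic));
 *		if (magic == VM_C_SEGMENT_INFO_MAGIC) {
 *			// parse the c_segment_info/c_slot_info records that follow
 *		}
 *	}
 *	free(buf);
 */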
3484 
3485 
3486 extern uint32_t vm_compressor_fragmentation_level(void);
3487 
3488 static int
3489 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3490 {
3491 	uint32_t value = vm_compressor_fragmentation_level();
3492 	return SYSCTL_OUT(req, &value, sizeof(value));
3493 }
3494 
3495 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3496 
3497 extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);
3498 
3499 static int
3500 sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3501 {
3502 	uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
3503 	return SYSCTL_OUT(req, &value, sizeof(value));
3504 }
3505 
3506 SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");
3507 
3508 
3509 
3510 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3511 
3512 
3513 /* This sysctl iterates over all the entries of the vm_map of a given process and writes some info about the vm_object pointed to by each entry.
3514  * This can be used to map where all the pages of a process are located in the compressor.
3515  */
3516 static int
3517 sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3518 {
3519 	int error = 0;
3520 	char *buf = NULL;
3521 	proc_t p = PROC_NULL;
3522 	task_t task = TASK_NULL;
3523 	vm_map_t map = VM_MAP_NULL;
3524 	__block size_t offset = 0;
3525 
3526 	/* go from pid to proc to task to vm_map. see sysctl_procargsx() for another example of this progression */
3527 	int *name = arg1;
3528 	int namelen = arg2;
3529 	if (namelen < 1) {
3530 		return EINVAL;
3531 	}
3532 	int pid = name[0];
3533 	p = proc_find(pid);  /* this increments a reference to the proc */
3534 	if (p == PROC_NULL) {
3535 		return EINVAL;
3536 	}
3537 	task = proc_task(p);
3538 	proc_rele(p);  /* decrement ref of proc */
3539 	p = PROC_NULL;
3540 	if (task == TASK_NULL) {
3541 		return EINVAL;
3542 	}
3543 	/* convert proc reference to task reference */
3544 	task_reference(task);
3545 	/* task reference to map reference */
3546 	map = get_task_map_reference(task);
3547 	task_deallocate(task);
3548 
3549 	if (map == VM_MAP_NULL) {
3550 		return EINVAL;  /* nothing allocated yet */
3551 	}
3552 
3553 	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
3554 	if (!buf) {
3555 		error = ENOMEM;
3556 		goto out;
3557 	}
3558 
3559 	/* 4-byte header identifying the version of the data format.
3560 	 * The magic should be bumped whenever the vm_map entry info structures change. */
3561 	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
3562 	offset += sizeof(uint32_t);
3563 
3564 	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
3565 		/* write the header, happens only once at the beginning so we should have enough space */
3566 		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3567 		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
3568 		out_hdr->vmi_nentries = nentries;
3569 		offset += sizeof(struct vm_map_info_hdr);
3570 		return KERN_SUCCESS;
3571 	};
3572 
3573 	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
3574 		while (true) { /* try up to 2 times: first into the current buffer, then, after a flush, into an empty one */
3575 			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
3576 			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
3577 			if (kr == KERN_NO_SPACE) {
3578 				/* failed to write anything, flush the current buffer and try again */
3579 				if (offset == 0) {
3580 					return KERN_FAILURE; /* no space even with an empty buffer; should not happen */
3581 				}
3582 				/* write out chunk */
3583 				int out_error = SYSCTL_OUT(req, buf, offset);
3584 				if (out_error) {
3585 					return KERN_FAILURE;
3586 				}
3587 				offset = 0;
3588 				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3589 				continue; /* need to retry the entry dump again with the cleaned buffer */
3590 			} else if (kr != KERN_SUCCESS) {
3591 				return kr;
3592 			}
3593 			offset += left_sz;
3594 			break;
3595 		}
3596 		return KERN_SUCCESS;
3597 	};
3598 
3599 	/* this foreach invokes the first callback once with the number of entries, then the second callback for every entry.
3600 	 * When the buffer is exhausted, it is flushed to the sysctl and reused. */
3601 	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);
3602 
3603 	if (kr != KERN_SUCCESS) {
		error = EINVAL; /* don't report success for a truncated dump */
3604 		goto out;
3605 	}
3606 
3607 	if (offset > 0) { /* last chunk */
3608 		error = SYSCTL_OUT(req, buf, offset);
3609 	}
3610 
3611 out:
3612 	if (buf != NULL) {
3613 		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3614 	}
3615 	if (map != NULL) {
3616 		vm_map_deallocate(map);
3617 	}
3618 	return error;
3619 }
3620 
3621 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
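
/*
 * Hedged usage sketch: as a CTLTYPE_NODE handler, the target pid travels
 * in the MIB rather than through newp (compare sysctl_procargsx()):
 *
 *	int mib[CTL_MAXNAME];
 *	size_t miblen = CTL_MAXNAME;
 *	sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen);
 *	mib[miblen] = pid;
 *	size_t len = 0;
 *	sysctl(mib, (u_int)(miblen + 1), NULL, &len, NULL, 0);	// size probe
 *	// then allocate len bytes, repeat the call with a buffer, and check
 *	// the VM_MAP_ENTRY_INFO_MAGIC header before parsing entries
 */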
3622 
3623 
3624 
3625 #endif /* DEVELOPMENT || DEBUG */
3626