/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1987 Carnegie-Mellon University
 * All rights reserved.  The CMU software License Agreement specifies
 * the terms and conditions for use and redistribution.
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <vm/vm_options.h>

#include <kern/ecc.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/debug.h>
#include <kern/extmod_statistics.h>
#include <mach/mach_traps.h>
#include <mach/port.h>
#include <mach/sdt.h>
#include <mach/task.h>
#include <mach/task_access.h>
#include <mach/task_special_ports.h>
#include <mach/time_value.h>
#include <mach/vm_map.h>
#include <mach/vm_param.h>
#include <mach/vm_prot.h>
#include <machine/machine_routines.h>

#include <sys/file_internal.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dir.h>
#include <sys/namei.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/vnode_internal.h>
#include <sys/mount.h>
#include <sys/xattr.h>
#include <sys/trace.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/user.h>
#include <sys/syslog.h>
#include <sys/stat.h>
#include <sys/sysproto.h>
#include <sys/mman.h>
#include <sys/sysctl.h>
#include <sys/cprotect.h>
#include <sys/kpi_socket.h>
#include <sys/kas_info.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/random.h>
#include <sys/code_signing.h>
#if NECP
#include <net/necp.h>
#endif /* NECP */
#if SKYWALK
#include <skywalk/os_channel.h>
#endif /* SKYWALK */

#include <security/audit/audit.h>
#include <security/mac.h>
#include <bsm/audit_kevents.h>

#include <kern/kalloc.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_kern_xnu.h>
#include <vm/vm_pageout_xnu.h>

#include <mach/shared_region.h>
#include <vm/vm_shared_region_internal.h>

#include <vm/vm_dyld_pager_internal.h>
#include <vm/vm_protos_internal.h>
#if DEVELOPMENT || DEBUG
#include <vm/vm_compressor_info.h>         /* for c_segment_info */
#include <vm/vm_compressor_xnu.h>          /* for vm_compressor_serialize_segment_debug_info() */
#endif
#include <vm/vm_reclaim_xnu.h>

#include <sys/kern_memorystatus.h>
#include <sys/kern_memorystatus_freeze.h>
#include <sys/proc_internal.h>

#include <mach-o/fixup-chains.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <kern/bits.h>

#if CONFIG_CSR
#include <sys/csr.h>
#endif /* CONFIG_CSR */
#include <sys/trust_caches.h>
#include <libkern/amfi/amfi.h>
#include <IOKit/IOBSD.h>

#if VM_MAP_DEBUG_APPLE_PROTECT
SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

#if DEVELOPMENT || DEBUG

static int
sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	vm_offset_t     kaddr;
	kern_return_t   kr;
	int     error = 0;
	int     size = 0;

	error = sysctl_handle_int(oidp, &size, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);

	if (kr == KERN_SUCCESS) {
		kmem_free(kernel_map, kaddr, size);
	}

	return error;
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_alloc_contig, "I", "");
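
/*
 * A minimal usage sketch, assuming a userspace test on a DEVELOPMENT or
 * DEBUG kernel (the OID above does not exist otherwise); the byte count
 * is a hypothetical choice. Kept out of the kernel build.
 */
#if 0   /* userspace sketch */
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int size = 4 * 4096;    /* ask the kernel to try a 4-page contiguous allocation */

	/* write-only OID (CTLFLAG_WR | CTLFLAG_MASKED): pass only a new value */
	if (sysctlbyname("vm.kmem_alloc_contig", NULL, NULL, &size, sizeof(size)) != 0) {
		perror("vm.kmem_alloc_contig");
		return 1;
	}
	return 0;
}
#endif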

extern int vm_region_footprint;
SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");

static int
sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	kmem_gobj_stats stats = kmem_get_gobj_stats();

	return SYSCTL_OUT(req, &stats, sizeof(stats));
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");

#endif /* DEVELOPMENT || DEBUG */

static int
sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;

	value = task_self_region_footprint();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}
	task_self_region_footprint_set(value);
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");

static int
sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;

	value = (1 << thread_self_region_page_shift());
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (value != 0 && value != 4096 && value != 16384) {
		return EINVAL;
	}

#if !__ARM_MIXED_PAGE_SIZE__
	if (value != vm_map_page_size(current_map())) {
		return EINVAL;
	}
#endif /* !__ARM_MIXED_PAGE_SIZE__ */

	thread_self_region_page_shift_set(bit_first(value));
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
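
/*
 * A minimal usage sketch, assuming a userspace caller with sysctlbyname(3):
 * read the page size currently reported for this thread's region info, then
 * override it with one of the values the handler above accepts (0, 4096, or
 * 16384; on non-__ARM_MIXED_PAGE_SIZE__ kernels only the map's own page size
 * is accepted). Kept out of the kernel build.
 */
#if 0   /* userspace sketch */
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int value = 0;
	size_t len = sizeof(value);

	if (sysctlbyname("vm.self_region_page_size", &value, &len, NULL, 0) == 0) {
		printf("current region page size: %d\n", value);
	}

	value = 16384;          /* must be 0, 4096, or 16384, per the handler */
	if (sysctlbyname("vm.self_region_page_size", NULL, NULL, &value, sizeof(value)) != 0) {
		perror("vm.self_region_page_size");
		return 1;
	}
	return 0;
}
#endif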

static int
sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int     error = 0;
	int     value;
	kern_return_t kr;

	value = task_self_region_info_flags();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	kr = task_self_region_info_flags_set(value);
	if (kr != KERN_SUCCESS) {
		return EINVAL;
	}

	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");


#if DEVELOPMENT || DEBUG
extern int panic_on_unsigned_execute;
SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");

extern int vm_log_xnu_user_debug;
SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
#endif /* DEVELOPMENT || DEBUG */

extern int cs_executable_create_upl;
extern int cs_executable_wire;
SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");

extern int apple_protect_pager_count;
extern int apple_protect_pager_count_mapped;
extern unsigned int apple_protect_pager_cache_limit;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");

#if DEVELOPMENT || DEBUG
extern int radar_20146450;
SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");

extern int macho_printf;
SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");

extern int apple_protect_pager_data_request_debug;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");

#if __arm64__
/* These are meant to support the page table accounting unit test. */
extern unsigned int arm_hardware_page_size;
extern unsigned int arm_pt_desc_size;
extern unsigned int arm_pt_root_size;
extern unsigned int inuse_user_tteroot_count;
extern unsigned int inuse_kernel_tteroot_count;
extern unsigned int inuse_user_ttepages_count;
extern unsigned int inuse_kernel_ttepages_count;
extern unsigned int inuse_user_ptepages_count;
extern unsigned int inuse_kernel_ptepages_count;
SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
#if !CONFIG_SPTM
extern unsigned int free_page_size_tt_count;
extern unsigned int free_tt_count;
SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
#endif
#if DEVELOPMENT || DEBUG
extern unsigned long pmap_asid_flushes;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
extern unsigned long pmap_asid_hits;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
extern unsigned long pmap_asid_misses;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
extern unsigned long pmap_speculation_restrictions;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
#endif
#endif /* __arm64__ */
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
#if VM_SCAN_FOR_SHADOW_CHAIN
static int vm_shadow_max_enabled = 0;    /* Disabled by default */
extern int proc_shadow_max(void);
static int
vm_shadow_max SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int value = 0;

	if (vm_shadow_max_enabled) {
		value = proc_shadow_max();
	}

	return SYSCTL_OUT(req, &value, sizeof(value));
}
SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_shadow_max, "I", "");

SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");

#endif /* VM_SCAN_FOR_SHADOW_CHAIN */
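
/*
 * A minimal usage sketch, assuming a userspace caller on a kernel built
 * with VM_SCAN_FOR_SHADOW_CHAIN (the two OIDs above do not exist
 * otherwise): vm.vm_shadow_max only computes a value while
 * vm.vm_shadow_max_enabled is set, so enable it, read the maximum shadow
 * chain depth, then restore the flag. Kept out of the kernel build.
 */
#if 0   /* userspace sketch */
#include <sys/sysctl.h>

int
read_shadow_max(int *depth_out)
{
	int on = 1, off = 0;
	size_t len = sizeof(*depth_out);

	if (sysctlbyname("vm.vm_shadow_max_enabled", NULL, NULL, &on, sizeof(on)) != 0) {
		return -1;
	}
	(void)sysctlbyname("vm.vm_shadow_max", depth_out, &len, NULL, 0);
	return sysctlbyname("vm.vm_shadow_max_enabled", NULL, NULL, &off, sizeof(off));
}
#endif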

SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");

/*
 * Sysctls related to data/stack execution.  See osfmk/vm/vm_map.c
 */

#if DEVELOPMENT || DEBUG
extern int allow_stack_exec, allow_data_exec;

SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");

#endif /* DEVELOPMENT || DEBUG */

static const char *prot_values[] = {
	"none",
	"read-only",
	"write-only",
	"read-write",
	"execute-only",
	"read-execute",
	"write-execute",
	"read-write-execute"
};

void
log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
{
	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
}

/*
 * shared_region_unnest_logging: level of logging of unnesting events
 * 0	- no logging
 * 1	- throttled logging of unexpected unnesting events (default)
 * 2	- unthrottled logging of unexpected unnesting events
 * 3+	- unthrottled logging of all unnesting events
 */
int shared_region_unnest_logging = 1;

SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_unnest_logging, 0, "");
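
/*
 * A minimal usage sketch, assuming a privileged userspace tool: raise the
 * logging level to 2 (unthrottled logging of unexpected unnesting events),
 * per the level table above. Kept out of the kernel build.
 */
#if 0   /* userspace sketch */
#include <sys/sysctl.h>

int
set_unnest_logging(int level)
{
	/* levels per the comment above: 0 off, 1 throttled, 2 unthrottled, 3+ all */
	return sysctlbyname("vm.shared_region_unnest_logging",
	           NULL, NULL, &level, sizeof(level));
}
#endif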

int vm_shared_region_unnest_log_interval = 10;
int shared_region_unnest_log_count_threshold = 5;


#if XNU_TARGET_OS_OSX

#if defined (__x86_64__)
static int scdir_enforce = 1;
#else /* defined (__x86_64__) */
static int scdir_enforce = 0;   /* AOT caches live elsewhere */
#endif /* defined (__x86_64__) */

static char *scdir_path[] = {
	"/System/Library/dyld/",
	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
	"/System/Cryptexes/OS/System/Library/dyld",
	NULL
};

#else /* XNU_TARGET_OS_OSX */

static int scdir_enforce = 0;
static char *scdir_path[] = {
	"/System/Library/Caches/com.apple.dyld/",
	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	NULL
};

#endif /* XNU_TARGET_OS_OSX */

static char *driverkit_scdir_path[] = {
	"/System/DriverKit/System/Library/dyld/",
#if XNU_TARGET_OS_OSX
	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#else
	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#endif /* XNU_TARGET_OS_OSX */
	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
	NULL
};

#ifndef SECURE_KERNEL
static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
{
#if CONFIG_CSR
	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
		return EPERM;
	}
#endif /* CONFIG_CSR */
	return sysctl_handle_int(oidp, arg1, arg2, req);
}

SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
#endif

/* These log rate throttling state variables aren't thread safe, but
 * are sufficient unto the task.
 */
static int64_t last_unnest_log_time = 0;
static int shared_region_unnest_log_count = 0;

void
log_unnest_badness(
	vm_map_t        m,
	vm_map_offset_t s,
	vm_map_offset_t e,
	boolean_t       is_nested_map,
	vm_map_offset_t lowest_unnestable_addr)
{
	struct timeval  tv;

	if (shared_region_unnest_logging == 0) {
		return;
	}

	if (shared_region_unnest_logging <= 2 &&
	    is_nested_map &&
	    s >= lowest_unnestable_addr) {
		/*
		 * Unnesting of writable map entries is fine.
		 */
		return;
	}

	if (shared_region_unnest_logging <= 1) {
		microtime(&tv);
		if ((tv.tv_sec - last_unnest_log_time) <
		    vm_shared_region_unnest_log_interval) {
			if (shared_region_unnest_log_count++ >
			    shared_region_unnest_log_count_threshold) {
				return;
			}
		} else {
			last_unnest_log_time = tv.tv_sec;
			shared_region_unnest_log_count = 0;
		}
	}

	DTRACE_VM4(log_unnest_badness,
	    vm_map_t, m,
	    vm_map_offset_t, s,
	    vm_map_offset_t, e,
	    vm_map_offset_t, lowest_unnestable_addr);
	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
}

uint64_t
vm_purge_filebacked_pagers(void)
{
	uint64_t pages_purged;

	pages_purged = 0;
	pages_purged += apple_protect_pager_purge_all();
	pages_purged += shared_region_pager_purge_all();
	pages_purged += dyld_pager_purge_all();
#if DEVELOPMENT || DEBUG
	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
#endif /* DEVELOPMENT || DEBUG */
	return pages_purged;
}

int
useracc(
	user_addr_t     addr,
	user_size_t     len,
	int     prot)
{
	vm_map_t        map;

	map = current_map();
	return vm_map_check_protection(
		map,
		vm_map_trunc_page(addr,
		vm_map_page_mask(map)),
		vm_map_round_page(addr + len,
		vm_map_page_mask(map)),
		prot == B_READ ? VM_PROT_READ : VM_PROT_WRITE);
}
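
/*
 * A minimal usage sketch of the classic BSD pattern for useracc(): a
 * pre-flight protection check on a user buffer before copying, failing
 * with EFAULT when the check does not pass. The helper name is
 * hypothetical. Kept out of the build.
 */
#if 0   /* kernel-style sketch */
static int
check_user_buffer(user_addr_t uaddr, user_size_t ulen)
{
	/* B_READ asks for VM_PROT_READ; any other value checks VM_PROT_WRITE */
	if (!useracc(uaddr, ulen, B_READ)) {
		return EFAULT;
	}
	return 0;
}
#endif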

#if XNU_PLATFORM_MacOSX
static inline kern_return_t
vslock_sanitize(
	vm_map_t                map,
	user_addr_ut            addr_u,
	user_size_ut            len_u,
	vm_sanitize_caller_t    vm_sanitize_caller,
	vm_map_offset_t        *start,
	vm_map_offset_t        *end,
	vm_map_size_t          *size)
{
	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
	           map,
	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
	           size);
}
#endif /* XNU_PLATFORM_MacOSX */

int
vslock(user_addr_ut addr, user_size_ut len)
{
	kern_return_t kret;

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows, for binary
	 * compatibility, i.e. return success for overflows without doing
	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
	 * for overflow errors, which gets converted to KERN_SUCCESS by
	 * vm_sanitize_get_kr.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(current_map(),
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_wire_kernel(current_map(), addr,
	    vm_sanitize_compute_unsafe_end(addr, len),
	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
	    VM_KERN_MEMORY_BSD,
	    FALSE);

	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}

int
vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
{
#if FIXME  /* [ */
	pmap_t          pmap;
	vm_page_t       pg;
	vm_map_offset_t vaddr;
	ppnum_t         paddr;
#endif  /* FIXME ] */
	kern_return_t   kret;
	vm_map_t        map;

	map = current_map();

#if FIXME  /* [ */
	if (dirtied) {
		pmap = get_task_pmap(current_task());
		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
		    vaddr += PAGE_SIZE) {
			paddr = pmap_find_phys(pmap, vaddr);
			pg = PHYS_TO_VM_PAGE(paddr);
			vm_page_set_modified(pg);
		}
	}
#endif  /* FIXME ] */
#ifdef  lint
	dirtied++;
#endif  /* lint */

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows, for binary
	 * compatibility, i.e. return success for overflows without doing
	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
	 * for overflow errors, which gets converted to KERN_SUCCESS by
	 * vm_sanitize_get_kr.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(map,
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSUNLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_unwire(map, addr,
	    vm_sanitize_compute_unsafe_end(addr, len), false);
	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}

int
subyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
suibyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
fubyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &byte, sizeof(char))) {
		return -1;
	}
	return byte;
}

int
fuibyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &(byte), sizeof(char))) {
		return -1;
	}
	return byte;
}

int
suword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/* suiword and fuiword are the same as suword and fuword, respectively */

int
suiword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuiword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/*
 * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
 * fetching and setting of process-sized size_t and pointer values.
 */
int
sulong(user_addr_t addr, int64_t word)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (long)word);
	}
}

int64_t
fulong(user_addr_t addr)
{
	int64_t longword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
			return -1;
		}
		return longword;
	} else {
		return (int64_t)fuiword(addr);
	}
}
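
/*
 * A minimal usage sketch of sulong()/fulong(), which size their copies to
 * the current process's pointer width. Note that fulong() returns -1 both
 * on a copyin() failure and when the user word legitimately holds -1, so
 * callers that must tell these apart should use copyin() directly. The
 * helper name is hypothetical. Kept out of the build.
 */
#if 0   /* kernel-style sketch */
static int
bump_user_counter(user_addr_t uaddr)
{
	int64_t v = fulong(uaddr);

	if (v == -1) {
		/* could be a fault or a stored -1; treated as failure here */
		return EFAULT;
	}
	return sulong(uaddr, v + 1) == 0 ? 0 : EFAULT;
}
#endif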

int
suulong(user_addr_t addr, uint64_t uword)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (uint32_t)uword);
	}
}

uint64_t
fuulong(user_addr_t addr)
{
	uint64_t ulongword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
			return -1ULL;
		}
		return ulongword;
	} else {
		return (uint64_t)fuiword(addr);
	}
}

int
swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
{
	return ENOTSUP;
}

#if defined(SECURE_KERNEL)
static int kern_secure_kernel = 1;
#else
static int kern_secure_kernel = 0;
#endif

SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_trace_level, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
    &shared_region_version, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_persistence, 0, "");

/*
 * shared_region_check_np:
 *
 * This system call is intended for dyld.
 *
 * dyld calls this when any process starts to see if the process's shared
 * region is already set up and ready to use.
 * This call returns the base address of the first mapping in the
 * process's shared region.
 * dyld will then check what's mapped at that address.
 *
 * If the shared region is empty, dyld will then attempt to map the shared
 * cache file in the shared region via the shared_region_map_np() system call.
 *
 * If something's already mapped in the shared region, dyld will check if it
 * matches the shared cache it would like to use for that process.
 * If it matches, everything's ready and the process can proceed and use the
 * shared region.
 * If it doesn't match, dyld will unmap the shared region and map the shared
 * cache into the process's address space via mmap().
 *
 * A NULL pointer argument can be used by dyld to indicate it has unmapped
 * the shared region. We will remove the shared_region reference from the task.
 *
 * ERROR VALUES
 * EINVAL	no shared region
 * ENOMEM	shared region is empty
 * EFAULT	bad address for "start_address"
 */
int
shared_region_check_np(
	__unused struct proc                    *p,
	struct shared_region_check_np_args      *uap,
	__unused int                            *retvalp)
{
	vm_shared_region_t      shared_region;
	mach_vm_offset_t        start_address = 0;
	int                     error = 0;
	kern_return_t           kr;
	task_t                  task = current_task();

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address));

	/*
	 * Special value of start_address used to indicate that map_with_linking() should
	 * no longer be allowed in this process
	 */
	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
		p->p_disallow_map_with_linking = TRUE;
		return 0;
	}

	/* retrieve the current task's shared region */
	shared_region = vm_shared_region_get(task);
	if (shared_region != NULL) {
		/*
		 * A NULL argument is used by dyld to indicate the task
		 * has unmapped its shared region.
		 */
		if (uap->start_address == 0) {
			/* unmap it first */
			vm_shared_region_remove(task, shared_region);
			vm_shared_region_set(task, NULL);
		} else {
			/* retrieve address of its first mapping... */
			kr = vm_shared_region_start_address(shared_region, &start_address, task);
			if (kr != KERN_SUCCESS) {
				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
				    "check_np(0x%llx) "
				    "vm_shared_region_start_address() failed\n",
				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
				    proc_getpid(p), p->p_comm,
				    (uint64_t)uap->start_address));
				error = ENOMEM;
			} else {
#if __has_feature(ptrauth_calls)
				/*
				 * Remap any section of the shared library that
				 * has authenticated pointers into private memory.
				 */
				if (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS) {
					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
					    "check_np(0x%llx) "
					    "vm_shared_region_auth_remap() failed\n",
					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
					    proc_getpid(p), p->p_comm,
					    (uint64_t)uap->start_address));
					error = ENOMEM;
				}
#endif /* __has_feature(ptrauth_calls) */

				/* ... and give it to the caller */
				if (error == 0) {
					error = copyout(&start_address,
					    (user_addr_t) uap->start_address,
					    sizeof(start_address));
					if (error != 0) {
						SHARED_REGION_TRACE_ERROR(
							("shared_region: %p [%d(%s)] "
							"check_np(0x%llx) "
							"copyout(0x%llx) error %d\n",
							(void *)VM_KERNEL_ADDRPERM(current_thread()),
							proc_getpid(p), p->p_comm,
							(uint64_t)uap->start_address, (uint64_t)start_address,
							error));
					}
				}
			}
		}
		vm_shared_region_deallocate(shared_region);
	} else {
		/* no shared region ! */
		error = EINVAL;
	}

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address, (uint64_t)start_address, error));

	return error;
}
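
/*
 * A minimal sketch of dyld's side of the handshake described above,
 * assuming the private Libsyscall stub __shared_region_check_np() (the
 * stub name and helper are assumptions for illustration; only dyld is
 * expected to call this). Kept out of the kernel build.
 */
#if 0   /* userspace (dyld-style) sketch */
extern int __shared_region_check_np(uint64_t *start_address);

static int
probe_shared_region(uint64_t *base_out)
{
	uint64_t base = 0;

	if (__shared_region_check_np(&base) != 0) {
		/* EINVAL: no shared region; ENOMEM: region is empty */
		return -1;
	}
	*base_out = base;       /* base of the shared region's first mapping */
	return 0;
}
#endif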


static int
shared_region_copyin(
	struct proc  *p,
	user_addr_t  user_addr,
	unsigned int count,
	unsigned int element_size,
	void         *kernel_data)
{
	int             error = 0;
	vm_size_t       size = count * element_size;

	error = copyin(user_addr, kernel_data, size);
	if (error) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"copyin(0x%llx, %ld) failed (error=%d)\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm,
			(uint64_t)user_addr, (long)size, error));
	}
	return error;
}

/*
 * A reasonable upper limit to prevent overflow of allocation/copyin.
 */
#define _SR_FILE_MAPPINGS_MAX_FILES 256

/* forward declaration */
__attribute__((noinline))
static void shared_region_map_and_slide_cleanup(
	struct proc              *p,
	uint32_t                 files_count,
	struct _sr_file_mappings *sr_file_mappings,
	struct vm_shared_region  *shared_region);

/*
 * Setup part of _shared_region_map_and_slide().
 * It had to be broken out of _shared_region_map_and_slide() to
 * prevent compiler inlining from blowing out the stack.
 */
__attribute__((noinline))
static int
shared_region_map_and_slide_setup(
	struct proc                         *p,
	uint32_t                            files_count,
	struct shared_file_np               *files,
	uint32_t                            mappings_count,
	struct shared_file_mapping_slide_np *mappings,
	struct _sr_file_mappings            **sr_file_mappings,
	struct vm_shared_region             **shared_region_ptr,
	struct vnode                        *rdir_vp)
{
	int                             error = 0;
	struct _sr_file_mappings        *srfmp;
	uint32_t                        mappings_next;
	struct vnode_attr               va;
	off_t                           fs;
#if CONFIG_MACF
	vm_prot_t                       maxprot = VM_PROT_ALL;
#endif
	uint32_t                        i;
	struct vm_shared_region         *shared_region = NULL;
	boolean_t                       is_driverkit = task_is_driver(current_task());

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> map\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm));

	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
		error = E2BIG;
		goto done;
	}
	if (files_count == 0) {
		error = EINVAL;
		goto done;
	}
	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
	    Z_WAITOK | Z_ZERO);
	if (*sr_file_mappings == NULL) {
		error = ENOMEM;
		goto done;
	}
	mappings_next = 0;
	for (i = 0; i < files_count; i++) {
		srfmp = &(*sr_file_mappings)[i];
		srfmp->fd = files[i].sf_fd;
		srfmp->mappings_count = files[i].sf_mappings_count;
		srfmp->mappings = &mappings[mappings_next];
		mappings_next += srfmp->mappings_count;
		if (mappings_next > mappings_count) {
			error = EINVAL;
			goto done;
		}
		srfmp->slide = files[i].sf_slide;
	}

	/* get the process's shared region (setup in vm_map_exec()) */
	shared_region = vm_shared_region_trim_and_get(current_task());
	*shared_region_ptr = shared_region;
	if (shared_region == NULL) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"no shared region\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		error = EINVAL;
		goto done;
	}

	/*
	 * Check that the shared region matches the current root
	 * directory of this process.  Deny the mapping to
	 * avoid tainting the shared region with something that
	 * doesn't belong in it.
	 */
	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
	if (sr_vnode != NULL ?  rdir_vp != sr_vnode : rdir_vp != rootvnode) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: map(%p) root_dir mismatch\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread())));
		error = EPERM;
		goto done;
	}


	for (srfmp = &(*sr_file_mappings)[0];
	    srfmp < &(*sr_file_mappings)[files_count];
	    srfmp++) {
		if (srfmp->mappings_count == 0) {
			/* no mappings here... */
			continue;
		}

		/*
		 * A file descriptor of -1 is used to indicate that the data
		 * to be put in the shared region for this mapping comes directly
		 * from the process's address space. Ensure we have proper alignments.
		 */
		if (srfmp->fd == -1) {
			/* only allow one mapping per fd */
			if (srfmp->mappings_count > 1) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data >1 mapping\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm));
				error = EINVAL;
				goto done;
			}

			/*
			 * The destination address and size must be page aligned.
			 */
			struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
			mach_vm_address_t dest_addr = mapping->sms_address;
			mach_vm_size_t    map_size = mapping->sms_size;
			if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm, dest_addr));
				error = EINVAL;
				goto done;
			}
			if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm, map_size));
				error = EINVAL;
				goto done;
			}
			continue;
		}

		/* get file structure from file descriptor */
		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d lookup failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd, error));
			goto done;
		}

		/* we need at least read permission on the file */
		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d not readable\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd));
			error = EPERM;
			goto done;
		}

		/* get vnode from file structure */
		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d getwithref failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd, error));
			goto done;
		}
		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);

		/* make sure the vnode is a regular file */
		if (srfmp->vp->v_type != VREG) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"not a file (type=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name, srfmp->vp->v_type));
			error = EINVAL;
			goto done;
		}

#if CONFIG_MACF
		/* pass in 0 for the offset argument because AMFI does not need the offset
		 *       of the shared cache */
		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
		if (error) {
			goto done;
		}
#endif /* MAC */

#if XNU_TARGET_OS_OSX && defined(__arm64__)
		/*
		 * Check if the shared cache is in the trust cache;
		 * if so, we can skip the root ownership check.
		 */
#if DEVELOPMENT || DEBUG
		/*
		 * Skip both root ownership and trust cache check if
		 * enforcement is disabled.
		 */
		if (!cs_system_enforcement()) {
			goto after_root_check;
		}
#endif /* DEVELOPMENT || DEBUG */
		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
		if (blob == NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"missing CS blob\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}
		const uint8_t *cdhash = csblob_get_cdhash(blob);
		if (cdhash == NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"missing cdhash\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}

		bool in_trust_cache = false;
		TrustCacheQueryToken_t qt;
		if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
			TCType_t tc_type = kTCTypeInvalid;
			TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
			in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
			    (tc_type == kTCTypeCryptex1BootOS ||
			    tc_type == kTCTypeStatic ||
			    tc_type == kTCTypeEngineering));
		}
		if (!in_trust_cache) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"not in trust cache\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}
		goto after_root_check;
root_check:
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */

		/* The shared cache file must be owned by root */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"vnode_getattr(%p) failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				error));
			goto done;
		}
		if (va.va_uid != 0) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"owned by uid=%d instead of 0\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name, va.va_uid));
			error = EPERM;
			goto done;
		}

#if XNU_TARGET_OS_OSX && defined(__arm64__)
after_root_check:
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */

#if CONFIG_CSR
		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_flags);
			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
			if (error) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"vnode_getattr(%p) failed (error=%d)\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					error));
				goto done;
			}

			if (!(va.va_flags & SF_RESTRICTED)) {
				/*
				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
				 * the shared cache file is NOT SIP-protected, so reject the
				 * mapping request
				 */
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'), "
					"vnode is not SIP-protected.\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name));
				error = EPERM;
				goto done;
			}
		}
#else /* CONFIG_CSR */

		/*
		 * Devices without SIP/ROSP need to make sure that the shared cache
		 * is either on the root volume or in the preboot cryptex volume.
		 */
		assert(rdir_vp != NULL);
		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
			vnode_t preboot_vp = NULL;
#if XNU_TARGET_OS_OSX
#define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
#else
#define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
#endif
			error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
			if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"not on process' root volume nor preboot volume\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name));
				error = EPERM;
				if (preboot_vp) {
					(void)vnode_put(preboot_vp);
				}
				goto done;
			} else if (preboot_vp) {
				(void)vnode_put(preboot_vp);
			}
		}
#endif /* CONFIG_CSR */

		if (scdir_enforce) {
			char **expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
			struct vnode *scdir_vp = NULL;
			for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
			    *expected_scdir_path != NULL;
			    expected_scdir_path++) {
				/* get vnode for expected_scdir_path */
				error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
				if (error) {
					SHARED_REGION_TRACE_ERROR(
						("shared_region: %p [%d(%s)]: "
						"vnode_lookup(%s) failed (error=%d)\n",
						(void *)VM_KERNEL_ADDRPERM(current_thread()),
						proc_getpid(p), p->p_comm,
						*expected_scdir_path, error));
					continue;
				}

				/* check if parent is scdir_vp */
				assert(scdir_vp != NULL);
				if (vnode_parent(srfmp->vp) == scdir_vp) {
					(void)vnode_put(scdir_vp);
					scdir_vp = NULL;
					goto scdir_ok;
				}
				(void)vnode_put(scdir_vp);
				scdir_vp = NULL;
			}
			/* nothing matches */
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"shared cache file not in expected directory\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			error = EPERM;
			goto done;
		}
scdir_ok:

		/* get vnode size */
		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"vnode_size(%p) failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
			goto done;
		}
		srfmp->file_size = fs;

		/* get the file's memory object handle */
		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"no memory object\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			error = EINVAL;
			goto done;
		}

		/* check that the mappings are properly covered by code signatures */
		if (!cs_system_enforcement()) {
			/* code signing is not enforced: no need to check */
		} else {
			for (i = 0; i < srfmp->mappings_count; i++) {
				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
					/* zero-filled mapping: not backed by the file */
					continue;
				}
				if (ubc_cs_is_range_codesigned(srfmp->vp,
				    srfmp->mappings[i].sms_file_offset,
				    srfmp->mappings[i].sms_size)) {
					/* this mapping is fully covered by code signatures */
					continue;
				}
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
					"is not code-signed\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name,
					i, srfmp->mappings_count,
					srfmp->mappings[i].sms_address,
					srfmp->mappings[i].sms_size,
					srfmp->mappings[i].sms_file_offset,
					srfmp->mappings[i].sms_max_prot,
					srfmp->mappings[i].sms_init_prot));
				error = EINVAL;
				goto done;
			}
		}
	}
done:
	if (error != 0) {
		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
		*sr_file_mappings = NULL;
		*shared_region_ptr = NULL;
	}
	return error;
}

/*
 * shared_region_map_np()
 *
 * This system call is intended for dyld.
 *
 * dyld uses this to map a shared cache file into a shared region.
 * This is usually done only the first time a shared cache is needed.
 * Subsequent processes will just use the populated shared region without
 * requiring any further setup.
 */
static int
_shared_region_map_and_slide(
	struct proc                         *p,
	uint32_t                            files_count,
	struct shared_file_np               *files,
	uint32_t                            mappings_count,
	struct shared_file_mapping_slide_np *mappings)
{
	int                             error = 0;
	kern_return_t                   kr = KERN_SUCCESS;
	struct _sr_file_mappings        *sr_file_mappings = NULL;
	struct vnode                    *rdir_vp = NULL;
	struct vm_shared_region         *shared_region = NULL;

	/*
	 * Get a reference to the current proc's root dir.
	 * Need this to prevent racing with chroot.
	 */
	proc_fdlock(p);
	rdir_vp = p->p_fd.fd_rdir;
	if (rdir_vp == NULL) {
		rdir_vp = rootvnode;
	}
	assert(rdir_vp != NULL);
	vnode_get(rdir_vp);
	proc_fdunlock(p);

	/*
	 * Turn files, mappings into sr_file_mappings and other setup.
	 */
	error = shared_region_map_and_slide_setup(p, files_count,
	    files, mappings_count, mappings,
	    &sr_file_mappings, &shared_region, rdir_vp);
	if (error != 0) {
		vnode_put(rdir_vp);
		return error;
	}

	/* map the file(s) into that shared region's submap */
	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
	if (kr != KERN_SUCCESS) {
		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
		    "vm_shared_region_map_file() failed kr=0x%x\n",
		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
		    proc_getpid(p), p->p_comm, kr));
	}

	/* convert kern_return_t to errno */
	switch (kr) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_INVALID_ADDRESS:
		error = EFAULT;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EPERM;
		break;
	case KERN_NO_SPACE:
		error = ENOMEM;
		break;
	case KERN_FAILURE:
	case KERN_INVALID_ARGUMENT:
	default:
		error = EINVAL;
		break;
	}

	/*
	 * Mark that this process is now using split libraries.
	 */
	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
	}

	vnode_put(rdir_vp);
	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] <- map\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm));

	return error;
}
1608 
1609 /*
1610  * Clean up part of _shared_region_map_and_slide()
1611  * It had to be broken out of _shared_region_map_and_slide() to
1612  * prevent compiler inlining from blowing out the stack.
1613  */
1614 __attribute__((noinline))
1615 static void
shared_region_map_and_slide_cleanup(struct proc * p,uint32_t files_count,struct _sr_file_mappings * sr_file_mappings,struct vm_shared_region * shared_region)1616 shared_region_map_and_slide_cleanup(
1617 	struct proc              *p,
1618 	uint32_t                 files_count,
1619 	struct _sr_file_mappings *sr_file_mappings,
1620 	struct vm_shared_region  *shared_region)
1621 {
1622 	struct _sr_file_mappings *srfmp;
1623 	struct vnode_attr        va;
1624 
1625 	if (sr_file_mappings != NULL) {
1626 		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1627 			if (srfmp->vp != NULL) {
1628 				vnode_lock_spin(srfmp->vp);
1629 				srfmp->vp->v_flag |= VSHARED_DYLD;
1630 				vnode_unlock(srfmp->vp);
1631 
1632 				/* update the vnode's access time */
1633 				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1634 					VATTR_INIT(&va);
1635 					nanotime(&va.va_access_time);
1636 					VATTR_SET_ACTIVE(&va, va_access_time);
1637 					vnode_setattr(srfmp->vp, &va, vfs_context_current());
1638 				}
1639 
1640 #if NAMEDSTREAMS
1641 				/*
1642 				 * If the shared cache is compressed, it may
1643 				 * have a namedstream vnode instantiated
1644 				 * for it. That namedstream vnode will also
1645 				 * have to be marked with VSHARED_DYLD.
1646 				 */
1647 				if (vnode_hasnamedstreams(srfmp->vp)) {
1648 					vnode_t svp;
1649 					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1650 					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
1651 						vnode_lock_spin(svp);
1652 						svp->v_flag |= VSHARED_DYLD;
1653 						vnode_unlock(svp);
1654 						vnode_put(svp);
1655 					}
1656 				}
1657 #endif /* NAMEDSTREAMS */
1658 				/*
1659 				 * release the vnode...
1660 				 * ubc_map() still holds it for us in the non-error case
1661 				 */
1662 				(void) vnode_put(srfmp->vp);
1663 				srfmp->vp = NULL;
1664 			}
1665 			if (srfmp->fp != NULL) {
1666 				/* release the file descriptor */
1667 				fp_drop(p, srfmp->fd, srfmp->fp, 0);
1668 				srfmp->fp = NULL;
1669 			}
1670 		}
1671 		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1672 	}
1673 
1674 	if (shared_region != NULL) {
1675 		vm_shared_region_deallocate(shared_region);
1676 	}
1677 }
1678 
1679 /*
1680  * For each file mapped, we may have mappings for:
1681  *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1682  * so let's round up to 8 mappings per file.
1683  */
1684 #define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to pass in */
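/*
 * Illustrative note (not part of the original source): the limit scales
 * with the per-call file cap, e.g. a hypothetical _SR_FILE_MAPPINGS_MAX_FILES
 * of 3 would permit up to 24 mapping structs per call.
 */
#if 0 /* example only: the invariant the limit encodes */
_Static_assert(SFM_MAX == _SR_FILE_MAPPINGS_MAX_FILES * 8,
    "SFM_MAX allows 8 mappings per file");
#endif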
1685 
1686 /*
1687  * This is the new interface for setting up shared region mappings.
1688  *
1689  * The slide used for shared regions set up through this interface is chosen differently
1690  * than in the old interface. The slide value passed in the shared_file_np represents
1691  * a maximum. The kernel will choose a random value based on that, then use it
1692  * for all shared regions.
1693  */
1694 #if defined (__x86_64__)
1695 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1696 #else
1697 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1698 #endif
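/*
 * Illustrative sketch (not part of the original source) of how the mask
 * above combines with the caller-supplied maximum in the code below. The
 * values are hypothetical and assume a 16K-page configuration.
 */
#if 0 /* example only */
uint32_t max_slide = 0x4000000;                 /* sf_slide from dyld: 64MB max */
uint32_t random_val;
read_random(&random_val, sizeof random_val);
/* e.g. random_val % max_slide == 0x123D567 -> 0x123C000 after masking */
uint32_t slide_amount = (random_val % max_slide) & SLIDE_AMOUNT_MASK;
#endif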
1699 
1700 int
1701 shared_region_map_and_slide_2_np(
1702 	struct proc                                  *p,
1703 	struct shared_region_map_and_slide_2_np_args *uap,
1704 	__unused int                                 *retvalp)
1705 {
1706 	unsigned int                  files_count;
1707 	struct shared_file_np         *shared_files = NULL;
1708 	unsigned int                  mappings_count;
1709 	struct shared_file_mapping_slide_np *mappings = NULL;
1710 	kern_return_t                 kr = KERN_SUCCESS;
1711 
1712 	files_count = uap->files_count;
1713 	mappings_count = uap->mappings_count;
1714 
1715 	if (files_count == 0) {
1716 		SHARED_REGION_TRACE_INFO(
1717 			("shared_region: %p [%d(%s)] map(): "
1718 			"no files\n",
1719 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1720 			proc_getpid(p), p->p_comm));
1721 		kr = 0; /* no files to map: we're done ! */
1722 		goto done;
1723 	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1724 		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1725 		if (shared_files == NULL) {
1726 			kr = KERN_RESOURCE_SHORTAGE;
1727 			goto done;
1728 		}
1729 	} else {
1730 		SHARED_REGION_TRACE_ERROR(
1731 			("shared_region: %p [%d(%s)] map(): "
1732 			"too many files (%d) max %d\n",
1733 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1734 			proc_getpid(p), p->p_comm,
1735 			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1736 		kr = KERN_FAILURE;
1737 		goto done;
1738 	}
1739 
1740 	if (mappings_count == 0) {
1741 		SHARED_REGION_TRACE_INFO(
1742 			("shared_region: %p [%d(%s)] map(): "
1743 			"no mappings\n",
1744 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1745 			proc_getpid(p), p->p_comm));
1746 		kr = 0; /* no mappings: we're done ! */
1747 		goto done;
1748 	} else if (mappings_count <= SFM_MAX) {
1749 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
1750 		if (mappings == NULL) {
1751 			kr = KERN_RESOURCE_SHORTAGE;
1752 			goto done;
1753 		}
1754 	} else {
1755 		SHARED_REGION_TRACE_ERROR(
1756 			("shared_region: %p [%d(%s)] map(): "
1757 			"too many mappings (%d) max %d\n",
1758 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1759 			proc_getpid(p), p->p_comm,
1760 			mappings_count, SFM_MAX));
1761 		kr = KERN_FAILURE;
1762 		goto done;
1763 	}
1764 
1765 	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
1766 	if (kr != KERN_SUCCESS) {
1767 		goto done;
1768 	}
1769 
1770 	kr = shared_region_copyin(p, uap->mappings, mappings_count, sizeof(mappings[0]), mappings);
1771 	if (kr != KERN_SUCCESS) {
1772 		goto done;
1773 	}
1774 
1775 	uint32_t max_slide = shared_files[0].sf_slide;
1776 	uint32_t random_val;
1777 	uint32_t slide_amount;
1778 
1779 	if (max_slide != 0) {
1780 		read_random(&random_val, sizeof random_val);
1781 		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
1782 	} else {
1783 		slide_amount = 0;
1784 	}
1785 #if DEVELOPMENT || DEBUG
1786 	extern bool bootarg_disable_aslr;
1787 	if (bootarg_disable_aslr) {
1788 		slide_amount = 0;
1789 	}
1790 #endif /* DEVELOPMENT || DEBUG */
1791 
1792 	/*
1793 	 * Fix up the mappings to reflect the desired slide.
1794 	 */
1795 	unsigned int f;
1796 	unsigned int m = 0;
1797 	unsigned int i;
1798 	for (f = 0; f < files_count; ++f) {
1799 		shared_files[f].sf_slide = slide_amount;
1800 		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
1801 			if (m >= mappings_count) {
1802 				SHARED_REGION_TRACE_ERROR(
1803 					("shared_region: %p [%d(%s)] map(): "
1804 					"mapping count argument was too small\n",
1805 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1806 					proc_getpid(p), p->p_comm));
1807 				kr = KERN_FAILURE;
1808 				goto done;
1809 			}
1810 			mappings[m].sms_address += slide_amount;
1811 			if (mappings[m].sms_slide_size != 0) {
1812 				mappings[m].sms_slide_start += slide_amount;
1813 			}
1814 		}
1815 	}
1816 
1817 	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
1818 done:
1819 	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
1820 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
1821 	return kr;
1822 }
1823 
1824 /*
1825  * A syscall for dyld to use to map data pages that need load time relocation fixups.
1826  * The fixups are performed by a custom pager during page-in, so the pages still appear
1827  * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
1828  * on demand later, all w/o using the compressor.
1829  *
1830  * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages while
1831  * running, they are COW'd as normal.
1832  */
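/*
 * Illustrative sketch (not part of the original source) of the caller's
 * side of this interface. The wrapper name and the field values are
 * assumptions for illustration; only fields validated below are shown, and
 * every region must reference the same file descriptor.
 */
#if 0 /* example only, dyld/userspace perspective */
struct mwl_region region = {
	.mwlr_fd          = dylib_fd,                      /* hypothetical fd */
	.mwlr_protections = VM_PROT_READ | VM_PROT_WRITE,  /* data only: no EXECUTE/ZF */
	.mwlr_file_offset = data_file_offset,              /* hypothetical offset */
	.mwlr_size        = data_size,                     /* hypothetical size */
};
/* link_info begins with a struct mwl_info_hdr, followed by binds and chains */
int rc = __map_with_linking_np(&region, 1, link_info, link_info_size);
#endif
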
1833 int
1834 map_with_linking_np(
1835 	struct proc                     *p,
1836 	struct map_with_linking_np_args *uap,
1837 	__unused int                    *retvalp)
1838 {
1839 	uint32_t                        region_count;
1840 	uint32_t                        r;
1841 	struct mwl_region               *regions = NULL;
1842 	struct mwl_region               *rp;
1843 	uint32_t                        link_info_size;
1844 	void                            *link_info = NULL;      /* starts with a struct mwl_info_hdr */
1845 	struct mwl_info_hdr             *info_hdr = NULL;
1846 	uint64_t                        binds_size;
1847 	int                             fd;
1848 	struct fileproc                 *fp = NULL;
1849 	struct vnode                    *vp = NULL;
1850 	size_t                          file_size;
1851 	off_t                           fs;
1852 	struct vnode_attr               va;
1853 	memory_object_control_t         file_control = NULL;
1854 	int                             error;
1855 	kern_return_t                   kr = KERN_SUCCESS;
1856 
1857 	/*
1858 	 * Check if dyld has told us it finished with this call.
1859 	 */
1860 	if (p->p_disallow_map_with_linking) {
1861 		printf("%s: [%d(%s)]: map_with_linking() was disabled\n",
1862 		    __func__, proc_getpid(p), p->p_comm);
1863 		kr = KERN_FAILURE;
1864 		goto done;
1865 	}
1866 
1867 	/*
1868 	 * First we do some sanity checking on what dyld has passed us.
1869 	 */
1870 	region_count = uap->region_count;
1871 	link_info_size = uap->link_info_size;
1872 	if (region_count == 0) {
1873 		printf("%s: [%d(%s)]: region_count == 0\n",
1874 		    __func__, proc_getpid(p), p->p_comm);
1875 		kr = KERN_FAILURE;
1876 		goto done;
1877 	}
1878 	if (region_count > MWL_MAX_REGION_COUNT) {
1879 		printf("%s: [%d(%s)]: region_count too big %d\n",
1880 		    __func__, proc_getpid(p), p->p_comm, region_count);
1881 		kr = KERN_FAILURE;
1882 		goto done;
1883 	}
1884 
1885 	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
1886 		printf("%s: [%d(%s)]: link_info_size too small\n",
1887 		    __func__, proc_getpid(p), p->p_comm);
1888 		kr = KERN_FAILURE;
1889 		goto done;
1890 	}
1891 	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
1892 		printf("%s: [%d(%s)]: link_info_size too big %d\n",
1893 		    __func__, proc_getpid(p), p->p_comm, link_info_size);
1894 		kr = KERN_FAILURE;
1895 		goto done;
1896 	}
1897 
1898 	/*
1899 	 * Allocate and copyin the regions and link info
1900 	 */
1901 	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
1902 	if (regions == NULL) {
1903 		printf("%s: [%d(%s)]: failed to allocate regions\n",
1904 		    __func__, proc_getpid(p), p->p_comm);
1905 		kr = KERN_RESOURCE_SHORTAGE;
1906 		goto done;
1907 	}
1908 	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
1909 	if (kr != KERN_SUCCESS) {
1910 		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
1911 		    __func__, proc_getpid(p), p->p_comm, kr);
1912 		goto done;
1913 	}
1914 
1915 	link_info = kalloc_data(link_info_size, Z_WAITOK);
1916 	if (link_info == NULL) {
1917 		printf("%s: [%d(%s)]: failed to allocate link_info\n",
1918 		    __func__, proc_getpid(p), p->p_comm);
1919 		kr = KERN_RESOURCE_SHORTAGE;
1920 		goto done;
1921 	}
1922 	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
1923 	if (kr != KERN_SUCCESS) {
1924 		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
1925 		    __func__, proc_getpid(p), p->p_comm, kr);
1926 		goto done;
1927 	}
1928 
1929 	/*
1930 	 * Do some verification of the data structures.
1931 	 */
1932 	info_hdr = (struct mwl_info_hdr *)link_info;
1933 	if (info_hdr->mwli_version != MWL_INFO_VERS) {
1934 		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
1935 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
1936 		kr = KERN_FAILURE;
1937 		goto done;
1938 	}
1939 
1940 	if (info_hdr->mwli_binds_offset > link_info_size) {
1941 		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
1942 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
1943 		kr = KERN_FAILURE;
1944 		goto done;
1945 	}
1946 
1947 	/* some older devices have s/w page size > h/w page size; no need to support them */
1948 	if (info_hdr->mwli_page_size != PAGE_SIZE) {
1949 		/* no printf, since this is expected on some devices */
1950 		kr = KERN_INVALID_ARGUMENT;
1951 		goto done;
1952 	}
1953 
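	/*
	 * Each bind entry is a pointer-sized slot: 4 bytes for the 32-bit
	 * chained-pointer format, 8 bytes otherwise. The subtraction in the
	 * bounds check below cannot underflow because mwli_binds_offset was
	 * verified against link_info_size above.
	 */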
1954 	binds_size = (uint64_t)info_hdr->mwli_binds_count *
1955 	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
1956 	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
1957 		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
1958 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
1959 		kr = KERN_FAILURE;
1960 		goto done;
1961 	}
1962 
1963 	if (info_hdr->mwli_chains_offset > link_info_size) {
1964 		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
1965 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
1966 		kr = KERN_FAILURE;
1967 		goto done;
1968 	}
1969 
1970 
1971 	/*
1972 	 * Ensure the chained starts structure lies within the link info and
1973 	 * make sure the segment info offsets are within bounds.
1974 	 */
1975 	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
1976 		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
1977 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
1978 		kr = KERN_FAILURE;
1979 		goto done;
1980 	}
1981 	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
1982 		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
1983 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
1984 		kr = KERN_FAILURE;
1985 		goto done;
1986 	}
1987 
1988 	/* Note that more verification of offsets is done in the pager itself */
1989 
1990 	/*
1991 	 * Ensure we've only been given one FD and verify valid protections.
1992 	 */
1993 	fd = regions[0].mwlr_fd;
1994 	for (r = 0; r < region_count; ++r) {
1995 		if (regions[r].mwlr_fd != fd) {
1996 			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
1997 			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
1998 			kr = KERN_FAILURE;
1999 			goto done;
2000 		}
2001 
2002 		/*
2003 		 * Only allow data mappings, not zero fill. Permit TPRO
2004 		 * mappings only when VM_PROT_WRITE is also requested.
2005 		 */
2006 		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2007 			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2008 			    __func__, proc_getpid(p), p->p_comm);
2009 			kr = KERN_FAILURE;
2010 			goto done;
2011 		}
2012 		if (regions[r].mwlr_protections & VM_PROT_ZF) {
2013 			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2014 			    __func__, proc_getpid(p), p->p_comm, r);
2015 			kr = KERN_FAILURE;
2016 			goto done;
2017 		}
2018 		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2019 		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2020 			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2021 			    __func__, proc_getpid(p), p->p_comm, r);
2022 			kr = KERN_FAILURE;
2023 			goto done;
2024 		}
2025 	}
2026 
2027 
2028 	/* get file structure from file descriptor */
2029 	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2030 	if (error) {
2031 		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2032 		    __func__, proc_getpid(p), p->p_comm, error);
2033 		kr = KERN_FAILURE;
2034 		goto done;
2035 	}
2036 
2037 	/* We need at least read permission on the file */
2038 	if (!(fp->fp_glob->fg_flag & FREAD)) {
2039 		printf("%s: [%d(%s)]: not readable\n",
2040 		    __func__, proc_getpid(p), p->p_comm);
2041 		kr = KERN_FAILURE;
2042 		goto done;
2043 	}
2044 
2045 	/* Get the vnode from file structure */
2046 	vp = (struct vnode *)fp_get_data(fp);
2047 	error = vnode_getwithref(vp);
2048 	if (error) {
2049 		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2050 		    __func__, proc_getpid(p), p->p_comm, error);
2051 		kr = KERN_FAILURE;
2052 		vp = NULL; /* just to be sure */
2053 		goto done;
2054 	}
2055 
2056 	/* Make sure the vnode is a regular file */
2057 	if (vp->v_type != VREG) {
2058 		printf("%s: [%d(%s)]: vnode not VREG\n",
2059 		    __func__, proc_getpid(p), p->p_comm);
2060 		kr = KERN_FAILURE;
2061 		goto done;
2062 	}
2063 
2064 	/* get vnode size */
2065 	error = vnode_size(vp, &fs, vfs_context_current());
2066 	if (error) {
2067 		goto done;
2068 	}
2069 	file_size = fs;
2070 
2071 	/* get the file's memory object handle */
2072 	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2073 	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2074 		printf("%s: [%d(%s)]: no memory object\n",
2075 		    __func__, proc_getpid(p), p->p_comm);
2076 		kr = KERN_FAILURE;
2077 		goto done;
2078 	}
2079 
2080 	for (r = 0; r < region_count; ++r) {
2081 		rp = &regions[r];
2082 
2083 #if CONFIG_MACF
2084 		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2085 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2086 		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2087 		if (error) {
2088 			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2089 			    __func__, proc_getpid(p), p->p_comm, r, error);
2090 			kr = KERN_FAILURE;
2091 			goto done;
2092 		}
2093 #endif /* MAC */
2094 
2095 		/* check that the mappings are properly covered by code signatures */
2096 		if (cs_system_enforcement()) {
2097 			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2098 				printf("%s: [%d(%s)]: region %d, not code signed\n",
2099 				    __func__, proc_getpid(p), p->p_comm, r);
2100 				kr = KERN_FAILURE;
2101 				goto done;
2102 			}
2103 		}
2104 	}
2105 
2106 	/* update the vnode's access time */
2107 	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2108 		VATTR_INIT(&va);
2109 		nanotime(&va.va_access_time);
2110 		VATTR_SET_ACTIVE(&va, va_access_time);
2111 		vnode_setattr(vp, &va, vfs_context_current());
2112 	}
2113 
2114 	/* get the VM to do the work */
2115 	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2116 
2117 done:
2118 	if (fp != NULL) {
2119 		/* release the file descriptor */
2120 		fp_drop(p, fd, fp, 0);
2121 	}
2122 	if (vp != NULL) {
2123 		(void)vnode_put(vp);
2124 	}
2125 	if (regions != NULL) {
2126 		kfree_data(regions, region_count * sizeof(regions[0]));
2127 	}
2128 	/* link_info was consumed (and set to NULL) by the pager if things worked */
2129 	if (link_info != NULL) {
2130 		kfree_data(link_info, link_info_size);
2131 	}
2132 
2133 	switch (kr) {
2134 	case KERN_SUCCESS:
2135 		return 0;
2136 	case KERN_RESOURCE_SHORTAGE:
2137 		return ENOMEM;
2138 	default:
2139 		return EINVAL;
2140 	}
2141 }
2142 
2143 #if DEBUG || DEVELOPMENT
2144 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2145     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2146 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2147     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2148 #endif /* DEBUG || DEVELOPMENT */
2149 
2150 /* sysctl overflow room */
2151 
2152 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2153     (int *) &page_size, 0, "vm page size");
2154 
2155 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2156  * allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2157  * reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2158 extern unsigned int     vm_page_free_target;
2159 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2160     &vm_page_free_target, 0, "Pageout daemon free target");
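/*
 * Illustrative sketch (not part of the original source): how an application
 * might consume this from userspace, pairing it with vm.page_free_count
 * (exported further below) to estimate headroom above the pageout target.
 */
#if 0 /* example only, userspace perspective */
#include <sys/sysctl.h>
unsigned int target = 0, free_count = 0;
size_t len = sizeof(target);
if (sysctlbyname("vm.vm_page_free_target", &target, &len, NULL, 0) == 0 &&
    sysctlbyname("vm.page_free_count", &free_count, &len, NULL, 0) == 0) {
	/* pages available beyond the pageout daemon's target */
	unsigned int headroom = free_count > target ? free_count - target : 0;
}
#endif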
2161 
2162 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2163     &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2164 
2165 static int
2166 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2167 {
2168 #pragma unused(oidp, arg1, arg2)
2169 	unsigned int page_free_wanted;
2170 
2171 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2172 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2173 }
2174 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2175     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2176     0, 0, vm_ctl_page_free_wanted, "I", "");
2177 
2178 extern unsigned int     vm_page_purgeable_count;
2179 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2180     &vm_page_purgeable_count, 0, "Purgeable page count");
2181 
2182 extern unsigned int     vm_page_purgeable_wired_count;
2183 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2184     &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2185 
2186 extern unsigned int vm_page_kern_lpage_count;
2187 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2188     &vm_page_kern_lpage_count, 0, "kernel used large pages");
2189 
2190 #if DEVELOPMENT || DEBUG
2191 #if __ARM_MIXED_PAGE_SIZE__
2192 static int vm_mixed_pagesize_supported = 1;
2193 #else
2194 static int vm_mixed_pagesize_supported = 0;
2195 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2196 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2197     &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2198 
2199 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2200 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2201 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2202     &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2203 
2204 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2205     &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2206 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2207     &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2208 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2209     &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2210 
2211 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2212     &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2213 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2214     &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2215 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2216     &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated) */
2217 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2218     &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2219 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2220     &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2221 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2222     &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");         /* sum of next two */
2223 #endif /* DEVELOPMENT || DEBUG */
2224 
2225 extern int madvise_free_debug;
2226 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2227     &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2228 extern int madvise_free_debug_sometimes;
2229 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2230     &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
2231 
2232 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2233     &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2234 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2235     &vm_page_stats_reusable.reusable_pages_success, "");
2236 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2237     &vm_page_stats_reusable.reusable_pages_failure, "");
2238 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2239     &vm_page_stats_reusable.reusable_pages_shared, "");
2240 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2241     &vm_page_stats_reusable.all_reusable_calls, "");
2242 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2243     &vm_page_stats_reusable.partial_reusable_calls, "");
2244 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2245     &vm_page_stats_reusable.reuse_pages_success, "");
2246 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2247     &vm_page_stats_reusable.reuse_pages_failure, "");
2248 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2249     &vm_page_stats_reusable.all_reuse_calls, "");
2250 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2251     &vm_page_stats_reusable.partial_reuse_calls, "");
2252 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2253     &vm_page_stats_reusable.can_reuse_success, "");
2254 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2255     &vm_page_stats_reusable.can_reuse_failure, "");
2256 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2257     &vm_page_stats_reusable.reusable_reclaimed, "");
2258 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2259     &vm_page_stats_reusable.reusable_nonwritable, "");
2260 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2261     &vm_page_stats_reusable.reusable_shared, "");
2262 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2263     &vm_page_stats_reusable.free_shared, "");
2264 
2265 
2266 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2267 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2268 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2269 
2270 extern unsigned int vm_page_cleaned_count;
2271 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2272 
2273 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2274 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2275 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2276 
2277 /* pageout counts */
2278 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2279 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2280 
2281 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2282 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2283 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2284 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2285 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2286 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2287 
2288 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2289 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2290 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2291 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2292 extern unsigned int vm_page_realtime_count;
2293 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2294 extern int vm_pageout_protect_realtime;
2295 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2296 
2297 /* counts of pages prefaulted when entering a memory object */
2298 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2299 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2300 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2301 
2302 #if defined (__x86_64__)
2303 extern unsigned int vm_clump_promote_threshold;
2304 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2305 #if DEVELOPMENT || DEBUG
2306 extern unsigned long vm_clump_stats[];
2307 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2308 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2309 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2310 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2311 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2312 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2313 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2314 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2315 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2316 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2317 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2318 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2319 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2320 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2321 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2322 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2323 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2324 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2325 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2326 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2327 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2328 #endif  /* if DEVELOPMENT || DEBUG */
2329 #endif  /* #if defined (__x86_64__) */
2330 
2331 #if CONFIG_SECLUDED_MEMORY
2332 
2333 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2334 extern unsigned int vm_page_secluded_target;
2335 extern unsigned int vm_page_secluded_count;
2336 extern unsigned int vm_page_secluded_count_free;
2337 extern unsigned int vm_page_secluded_count_inuse;
2338 extern unsigned int vm_page_secluded_count_over_target;
2339 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2340 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2341 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2342 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2343 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2344 
2345 extern struct vm_page_secluded_data vm_page_secluded;
2346 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2347 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2348 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2349 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2350 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2351 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2352 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2353 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2354 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2355 
2356 #endif /* CONFIG_SECLUDED_MEMORY */
2357 
2358 #pragma mark Deferred Reclaim
2359 
2360 #if CONFIG_DEFERRED_RECLAIM
2361 
2362 #if DEVELOPMENT || DEBUG
2363 /*
2364  * VM reclaim testing
2365  */
2366 extern bool vm_deferred_reclamation_block_until_pid_has_been_reclaimed(pid_t pid);
2367 
2368 static int
2369 sysctl_vm_reclaim_drain_async_queue SYSCTL_HANDLER_ARGS
2370 {
2371 #pragma unused(arg1, arg2)
2372 	int error = EINVAL, pid = 0;
2373 	/*
2374 	 * Only act on write
2375 	 */
2376 	error = sysctl_handle_int(oidp, &pid, 0, req);
2377 	if (error || !req->newptr) {
2378 		return error;
2379 	}
2380 
2381 	bool success = vm_deferred_reclamation_block_until_pid_has_been_reclaimed(pid);
2382 	if (success) {
2383 		error = 0;
2384 	}
2385 
2386 	return error;
2387 }
2388 
2389 SYSCTL_PROC(_vm, OID_AUTO, reclaim_drain_async_queue,
2390     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2391     &sysctl_vm_reclaim_drain_async_queue, "I", "");
2392 
2393 static int
2394 sysctl_vm_reclaim_from_pid SYSCTL_HANDLER_ARGS
2395 {
2396 	int error = EINVAL;
2397 	pid_t pid;
2398 	error = sysctl_handle_int(oidp, &pid, 0, req);
2399 	/* Only reclaim on write */
2400 	if (error || !req->newptr) {
2401 		return error;
2402 	}
2403 	if (pid <= 0) {
2404 		return EINVAL;
2405 	}
2406 	proc_t p = proc_find(pid);
2407 	if (p == PROC_NULL) {
2408 		return ESRCH;
2409 	}
2410 	task_t t = proc_task(p);
2411 	if (t == TASK_NULL) {
2412 		proc_rele(p);
2413 		return ESRCH;
2414 	}
2415 	task_reference(t);
2416 	proc_rele(p);
2417 	vm_deferred_reclamation_reclaim_from_task_sync(t, UINT64_MAX);
2418 	task_deallocate(t);
2419 	return 0;
2420 }
2421 
2422 SYSCTL_PROC(_vm, OID_AUTO, reclaim_from_pid,
2423     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2424     &sysctl_vm_reclaim_from_pid, "I",
2425     "Drain the deferred reclamation buffer for a pid");
2426 
2427 static int
2428 sysctl_vm_reclaim_drain_all_buffers SYSCTL_HANDLER_ARGS
2429 {
2430 	/* Only reclaim on write */
2431 	if (!req->newptr) {
2432 		return EINVAL;
2433 	}
2434 	vm_deferred_reclamation_reclaim_all_memory(RECLAIM_OPTIONS_NONE);
2435 	return 0;
2436 }
2437 
2438 SYSCTL_PROC(_vm, OID_AUTO, reclaim_drain_all_buffers,
2439     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2440     &sysctl_vm_reclaim_drain_all_buffers, "I",
2441     "Drain all system-wide deferred reclamation buffers");
2442 
2443 
2444 extern uint64_t vm_reclaim_max_threshold;
2445 extern uint64_t vm_reclaim_trim_divisor;
2446 
2447 SYSCTL_ULONG(_vm, OID_AUTO, reclaim_max_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_max_threshold, "");
2448 SYSCTL_ULONG(_vm, OID_AUTO, reclaim_trim_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_trim_divisor, "");
2449 #endif /* DEVELOPMENT || DEBUG */
2450 
2451 #endif /* CONFIG_DEFERRED_RECLAIM */
2452 
2453 #include <kern/thread.h>
2454 #include <sys/user.h>
2455 
2456 void vm_pageout_io_throttle(void);
2457 
2458 void
2459 vm_pageout_io_throttle(void)
2460 {
2461 	struct uthread *uthread = current_uthread();
2462 
2463 	/*
2464 	 * If this thread is marked as a low priority I/O type
2465 	 * and the I/O we issued while in this cleaning operation
2466 	 * collided with normal I/O operations, we'll
2467 	 * delay in order to mitigate the impact of this
2468 	 * task on the normal operation of the system.
2469 	 */
2470 
2471 	if (uthread->uu_lowpri_window) {
2472 		throttle_lowpri_io(1);
2473 	}
2474 }
2475 
2476 int
2477 vm_pressure_monitor(
2478 	__unused struct proc *p,
2479 	struct vm_pressure_monitor_args *uap,
2480 	int *retval)
2481 {
2482 	kern_return_t   kr;
2483 	uint32_t        pages_reclaimed;
2484 	uint32_t        pages_wanted;
2485 
2486 	kr = mach_vm_pressure_monitor(
2487 		(boolean_t) uap->wait_for_pressure,
2488 		uap->nsecs_monitored,
2489 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2490 		&pages_wanted);
2491 
2492 	switch (kr) {
2493 	case KERN_SUCCESS:
2494 		break;
2495 	case KERN_ABORTED:
2496 		return EINTR;
2497 	default:
2498 		return EINVAL;
2499 	}
2500 
2501 	if (uap->pages_reclaimed) {
2502 		if (copyout((void *)&pages_reclaimed,
2503 		    uap->pages_reclaimed,
2504 		    sizeof(pages_reclaimed)) != 0) {
2505 			return EFAULT;
2506 		}
2507 	}
2508 
2509 	*retval = (int) pages_wanted;
2510 	return 0;
2511 }
2512 
2513 int
2514 kas_info(struct proc *p,
2515     struct kas_info_args *uap,
2516     int *retval __unused)
2517 {
2518 #ifndef CONFIG_KAS_INFO
2519 	(void)p;
2520 	(void)uap;
2521 	return ENOTSUP;
2522 #else /* CONFIG_KAS_INFO */
2523 	int             selector = uap->selector;
2524 	user_addr_t     valuep = uap->value;
2525 	user_addr_t     sizep = uap->size;
2526 	user_size_t     size, rsize;
2527 	int             error;
2528 
2529 	if (!kauth_cred_issuser(kauth_cred_get())) {
2530 		return EPERM;
2531 	}
2532 
2533 #if CONFIG_MACF
2534 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
2535 	if (error) {
2536 		return error;
2537 	}
2538 #endif
2539 
2540 	if (IS_64BIT_PROCESS(p)) {
2541 		user64_size_t size64;
2542 		error = copyin(sizep, &size64, sizeof(size64));
2543 		size = (user_size_t)size64;
2544 	} else {
2545 		user32_size_t size32;
2546 		error = copyin(sizep, &size32, sizeof(size32));
2547 		size = (user_size_t)size32;
2548 	}
2549 	if (error) {
2550 		return error;
2551 	}
2552 
2553 	switch (selector) {
2554 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2555 	{
2556 		uint64_t slide = vm_kernel_slide;
2557 
2558 		if (sizeof(slide) != size) {
2559 			return EINVAL;
2560 		}
2561 
2562 		error = copyout(&slide, valuep, sizeof(slide));
2563 		if (error) {
2564 			return error;
2565 		}
2566 		rsize = size;
2567 	}
2568 	break;
2569 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2570 	{
2571 		uint32_t i;
2572 		kernel_mach_header_t *mh = &_mh_execute_header;
2573 		struct load_command *cmd;
2574 		cmd = (struct load_command*) &mh[1];
2575 		uint64_t *bases;
2576 		rsize = mh->ncmds * sizeof(uint64_t);
2577 
2578 		/*
2579 		 * Return the size if no data was passed
2580 		 */
2581 		if (valuep == 0) {
2582 			break;
2583 		}
2584 
2585 		if (rsize > size) {
2586 			return EINVAL;
2587 		}
2588 
2589 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
2590 
2591 		for (i = 0; i < mh->ncmds; i++) {
2592 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
2593 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
2594 				bases[i] = (uint64_t)sg->vmaddr;
2595 			}
2596 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
2597 		}
2598 
2599 		error = copyout(bases, valuep, rsize);
2600 
2601 		kfree_data(bases, rsize);
2602 
2603 		if (error) {
2604 			return error;
2605 		}
2606 	}
2607 	break;
2608 	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
2609 	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
2610 	{
2611 #if CONFIG_SPTM
2612 		const uint64_t slide =
2613 		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
2614 #else
2615 		const uint64_t slide = 0;
2616 #endif
2617 
2618 		if (sizeof(slide) != size) {
2619 			return EINVAL;
2620 		}
2621 
2622 		error = copyout(&slide, valuep, sizeof(slide));
2623 		if (error) {
2624 			return error;
2625 		}
2626 		rsize = size;
2627 	}
2628 	break;
2629 	default:
2630 		return EINVAL;
2631 	}
2632 
2633 	if (IS_64BIT_PROCESS(p)) {
2634 		user64_size_t size64 = (user64_size_t)rsize;
2635 		error = copyout(&size64, sizep, sizeof(size64));
2636 	} else {
2637 		user32_size_t size32 = (user32_size_t)rsize;
2638 		error = copyout(&size32, sizep, sizeof(size32));
2639 	}
2640 
2641 	return error;
2642 #endif /* CONFIG_KAS_INFO */
2643 }
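/*
 * Illustrative sketch (not part of the original source): the segment
 * selector follows a two-call pattern -- query the size with a NULL value
 * pointer, then fetch the data. Assumes the userspace kas_info() stub from
 * <sys/kas_info.h>; the caller must be root.
 */
#if 0 /* example only, userspace perspective */
size_t size = 0;
kas_info(KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR, NULL, &size);   /* size query */
uint64_t *bases = malloc(size);
if (bases != NULL &&
    kas_info(KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR, bases, &size) == 0) {
	/* bases[i] is nonzero for each LC_SEGMENT_KERNEL load command */
}
#endif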
2644 
2645 #pragma clang diagnostic push
2646 #pragma clang diagnostic ignored "-Wcast-qual"
2647 #pragma clang diagnostic ignored "-Wunused-function"
2648 
2649 static void
2650 asserts()
2651 {
2652 	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
2653 	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
2654 }
2655 
2656 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
2657 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
2658 #pragma clang diagnostic pop
2659 
2660 extern uint32_t vm_page_pages;
2661 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
2662 
2663 extern uint32_t vm_page_busy_absent_skipped;
2664 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
2665 
2666 extern uint32_t vm_page_upl_tainted;
2667 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
2668 
2669 extern uint32_t vm_page_iopl_tainted;
2670 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
2671 
2672 #if __arm64__ && (DEVELOPMENT || DEBUG)
2673 extern int vm_footprint_suspend_allowed;
2674 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
2675 
2676 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
2677 static int
2678 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
2679 {
2680 #pragma unused(oidp, arg1, arg2)
2681 	int error = 0;
2682 	int new_value;
2683 
2684 	if (req->newptr == USER_ADDR_NULL) {
2685 		return 0;
2686 	}
2687 	error = SYSCTL_IN(req, &new_value, sizeof(int));
2688 	if (error) {
2689 		return error;
2690 	}
2691 	if (!vm_footprint_suspend_allowed) {
2692 		if (new_value != 0) {
2693 			/* suspends are not allowed... */
2694 			return 0;
2695 		}
2696 		/* ... but let resumes proceed */
2697 	}
2698 	DTRACE_VM2(footprint_suspend,
2699 	    vm_map_t, current_map(),
2700 	    int, new_value);
2701 
2702 	pmap_footprint_suspend(current_map(), new_value);
2703 
2704 	return 0;
2705 }
2706 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
2707     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
2708     0, 0, &sysctl_vm_footprint_suspend, "I", "");
2709 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
2710 
2711 extern uint64_t vm_map_corpse_footprint_count;
2712 extern uint64_t vm_map_corpse_footprint_size_avg;
2713 extern uint64_t vm_map_corpse_footprint_size_max;
2714 extern uint64_t vm_map_corpse_footprint_full;
2715 extern uint64_t vm_map_corpse_footprint_no_buf;
2716 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
2717     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
2718 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
2719     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
2720 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
2721     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
2722 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
2723     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
2724 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
2725     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
2726 
2727 #if CODE_SIGNING_MONITOR
2728 extern uint64_t vm_cs_defer_to_csm;
2729 extern uint64_t vm_cs_defer_to_csm_not;
2730 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
2731     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
2732 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
2733     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
2734 #endif /* CODE_SIGNING_MONITOR */
2735 
2736 extern uint64_t shared_region_pager_copied;
2737 extern uint64_t shared_region_pager_slid;
2738 extern uint64_t shared_region_pager_slid_error;
2739 extern uint64_t shared_region_pager_reclaimed;
2740 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
2741     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
2742 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
2743     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
2744 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
2745     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
2746 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
2747     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
2748 extern int shared_region_destroy_delay;
2749 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
2750     CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
2751 
2752 #if MACH_ASSERT
2753 extern int pmap_ledgers_panic_leeway;
2754 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
2755 #endif /* MACH_ASSERT */
2756 
2757 
2758 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
2759 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
2760 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
2761 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
2762 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
2763 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
2764 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
2765 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
2766 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
2767 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
2768 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
2769 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
2770 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
2771 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
2772     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
2773 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
2774     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
2775 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
2776     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
2777 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
2778     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
2779 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
2780     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
2781 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
2782     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
2783 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
2784     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
2785 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
2786     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
2787 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
2788     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
2789 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
2790     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
2791 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
2792     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
2793 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
2794     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
2795 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
2796     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
2797 
2798 extern int vm_protect_privileged_from_untrusted;
2799 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
2800     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
2801 extern uint64_t vm_copied_on_read;
2802 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
2803     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
2804 
2805 extern int vm_shared_region_count;
2806 extern int vm_shared_region_peak;
2807 SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
2808     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
2809 SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
2810     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
2811 #if DEVELOPMENT || DEBUG
2812 extern unsigned int shared_region_pagers_resident_count;
2813 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
2814     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
2815 extern unsigned int shared_region_pagers_resident_peak;
2816 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
2817     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
2818 extern int shared_region_pager_count;
2819 SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
2820     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
2821 #if __has_feature(ptrauth_calls)
2822 extern int shared_region_key_count;
2823 SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
2824     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
2825 extern int vm_shared_region_reslide_count;
2826 SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
2827     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
2828 #endif /* __has_feature(ptrauth_calls) */
2829 #endif /* DEVELOPMENT || DEBUG */
2830 
2831 #if MACH_ASSERT
2832 extern int debug4k_filter;
2833 SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
2834 extern int debug4k_panic_on_terminate;
2835 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
2836 extern int debug4k_panic_on_exception;
2837 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
2838 extern int debug4k_panic_on_misaligned_sharing;
2839 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
2840 #endif /* MACH_ASSERT */
2841 
2842 extern uint64_t vm_map_set_size_limit_count;
2843 extern uint64_t vm_map_set_data_limit_count;
2844 extern uint64_t vm_map_enter_RLIMIT_AS_count;
2845 extern uint64_t vm_map_enter_RLIMIT_DATA_count;
2846 SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
2847 SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
2848 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
2849 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
2850 
2851 extern uint64_t vm_fault_resilient_media_initiate;
2852 extern uint64_t vm_fault_resilient_media_retry;
2853 extern uint64_t vm_fault_resilient_media_proceed;
2854 extern uint64_t vm_fault_resilient_media_release;
2855 extern uint64_t vm_fault_resilient_media_abort1;
2856 extern uint64_t vm_fault_resilient_media_abort2;
2857 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
2858 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
2859 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
2860 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
2861 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
2862 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
2863 #if MACH_ASSERT
2864 extern int vm_fault_resilient_media_inject_error1_rate;
2865 extern int vm_fault_resilient_media_inject_error1;
2866 extern int vm_fault_resilient_media_inject_error2_rate;
2867 extern int vm_fault_resilient_media_inject_error2;
2868 extern int vm_fault_resilient_media_inject_error3_rate;
2869 extern int vm_fault_resilient_media_inject_error3;
2870 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
2871 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
2872 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
2873 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
2874 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
2875 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
2876 #endif /* MACH_ASSERT */
2877 
2878 extern uint64_t pmap_query_page_info_retries;
2879 SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
2880 
2881 /*
2882  * A sysctl which causes all existing shared regions to become stale. They
2883  * will no longer be used by anything new and will be torn down as soon as
2884  * the last existing user exits. A write of non-zero value causes that to happen.
2885  * This should only be used by launchd, so we check that this is initproc.
2886  */
2887 static int
2888 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
2889 {
2890 	unsigned int value = 0;
2891 	int changed = 0;
2892 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
2893 	if (error || !changed) {
2894 		return error;
2895 	}
2896 	if (current_proc() != initproc) {
2897 		return EPERM;
2898 	}
2899 
2900 	vm_shared_region_pivot();
2901 
2902 	return 0;
2903 }
2904 
2905 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
2906     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
2907     0, 0, shared_region_pivot, "I", "");
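/*
 * Illustrative userspace-side sketch (not part of this file's build): how a
 * privileged daemon could trigger the pivot above via sysctlbyname(3).  The
 * handler rejects the write unless the caller is initproc (launchd), so this
 * only shows the calling convention; an arbitrary process gets EPERM.
 */
#if 0 /* userspace usage sketch */
#include <sys/sysctl.h>
#include <stdio.h>

int
pivot_shared_regions(void)
{
	unsigned int one = 1;
	/* any non-zero write makes the existing shared regions stale */
	if (sysctlbyname("vm.shared_region_pivot", NULL, NULL, &one, sizeof(one)) != 0) {
		perror("vm.shared_region_pivot"); /* EPERM unless we are launchd */
		return -1;
	}
	return 0;
}
#endif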
2908 
2909 extern uint64_t vm_object_shadow_forced;
2910 extern uint64_t vm_object_shadow_skipped;
2911 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
2912     &vm_object_shadow_forced, "");
2913 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
2914     &vm_object_shadow_skipped, "");
2915 
2916 SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
2917     &vmtc_total, 0, "total text page corruptions detected");
2918 
2919 
2920 #if DEBUG || DEVELOPMENT
2921 /*
2922  * A sysctl that can be used to corrupt a text page with an illegal instruction.
2923  * Used for testing text page self healing.
2924  */
2925 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
2926 static int
2927 corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
2928 {
2929 	uint64_t value = 0;
2930 	int error = sysctl_handle_quad(oidp, &value, 0, req);
2931 	if (error || !req->newptr) {
2932 		return error;
2933 	}
2934 
2935 	if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
2936 		return 0;
2937 	} else {
2938 		return EINVAL;
2939 	}
2940 }
2941 
2942 SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
2943     CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
2944     0, 0, corrupt_text_addr, "-", "");
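/*
 * Illustrative userspace-side sketch (not compiled here): feeding a text
 * address to the corruption sysctl above on a DEBUG/DEVELOPMENT kernel.
 * The handler reads a quad, so the new value must be a 64-bit address.
 */
#if 0 /* userspace usage sketch */
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
corrupt_text_page(uint64_t text_addr)
{
	/* EINVAL comes back if the kernel could not corrupt the address */
	if (sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &text_addr, sizeof(text_addr)) != 0) {
		perror("vm.corrupt_text_addr");
		return -1;
	}
	return 0;
}
#endif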
2945 #endif /* DEBUG || DEVELOPMENT */
2946 
2947 #if CONFIG_MAP_RANGES
2948 /*
2949  * vm.malloc_ranges
2950  *
2951  * space-separated list of <left:right> hexadecimal addresses.
2952  */
2953 static int
2954 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
2955 {
2956 	vm_map_t map = current_map();
2957 	struct mach_vm_range r1, r2;
2958 	char str[20 * 4];
2959 	int len;
2960 	mach_vm_offset_t right_hole_max;
2961 
2962 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
2963 		return ENOENT;
2964 	}
2965 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
2966 		return ENOENT;
2967 	}
2968 
2969 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
2970 	right_hole_max = MACH_VM_JUMBO_ADDRESS;
2971 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
2972 	right_hole_max = get_map_max(map);
2973 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
2974 
2975 	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
2976 	    r1.max_address, r2.min_address,
2977 	    r2.max_address, right_hole_max);
2978 
2979 	return SYSCTL_OUT(req, str, len);
2980 }
2981 
2982 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
2983     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
2984     0, 0, &vm_map_malloc_ranges, "A", "");
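/*
 * Illustrative userspace-side sketch (not compiled here): reading and parsing
 * the two <left:right> pairs emitted by the handler above.  The buffer size
 * and error handling are deliberately minimal.
 */
#if 0 /* userspace usage sketch */
#include <sys/sysctl.h>
#include <inttypes.h>
#include <stdio.h>

int
print_malloc_ranges(void)
{
	char str[128];
	size_t len = sizeof(str);
	uint64_t l1, r1, l2, r2;

	if (sysctlbyname("vm.malloc_ranges", str, &len, NULL, 0) != 0) {
		return -1;
	}
	/* format written by the kernel: "0x..:0x.. 0x..:0x.." */
	if (sscanf(str, "%" SCNx64 ":%" SCNx64 " %" SCNx64 ":%" SCNx64,
	    &l1, &r1, &l2, &r2) != 4) {
		return -1;
	}
	printf("hole 1: [0x%" PRIx64 ", 0x%" PRIx64 ")\n", l1, r1);
	printf("hole 2: [0x%" PRIx64 ", 0x%" PRIx64 ")\n", l2, r2);
	return 0;
}
#endif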
2985 
2986 #if DEBUG || DEVELOPMENT
2987 static int
2988 vm_map_user_range_default SYSCTL_HANDLER_ARGS
2989 {
2990 #pragma unused(arg1, arg2, oidp)
2991 	struct mach_vm_range range;
2992 
2993 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
2994 	    != KERN_SUCCESS) {
2995 		return EINVAL;
2996 	}
2997 
2998 	return SYSCTL_OUT(req, &range, sizeof(range));
2999 }
3000 
3001 static int
3002 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3003 {
3004 #pragma unused(arg1, arg2, oidp)
3005 	struct mach_vm_range range;
3006 
3007 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3008 	    != KERN_SUCCESS) {
3009 		return EINVAL;
3010 	}
3011 
3012 	return SYSCTL_OUT(req, &range, sizeof(range));
3013 }
3014 
3015 static int
3016 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3017 {
3018 #pragma unused(arg1, arg2, oidp)
3019 	struct mach_vm_range range;
3020 
3021 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3022 	    != KERN_SUCCESS) {
3023 		return EINVAL;
3024 	}
3025 
3026 	return SYSCTL_OUT(req, &range, sizeof(range));
3027 }
3028 
3029 /*
3030  * A sysctl that can be used to return ranges for the current VM map.
3031  * Used for testing VM ranges.
3032  */
3033 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3034     0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
3035 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3036     0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
3037 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3038     0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
3039 
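/*
 * Illustrative userspace-side sketch (not compiled here) for the test sysctls
 * above.  It mirrors struct mach_vm_range as a pair of 64-bit addresses
 * (min, max) -- an assumption about the layout, made so the sketch stays
 * self-contained instead of depending on the SDK headers.
 */
#if 0 /* userspace usage sketch */
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

struct range_mirror {        /* assumed layout of struct mach_vm_range */
	uint64_t min_address;
	uint64_t max_address;
};

int
print_default_range(void)
{
	struct range_mirror r;
	size_t len = sizeof(r);

	if (sysctlbyname("vm.vm_map_user_range_default", &r, &len, NULL, 0) != 0
	    || len != sizeof(r)) {
		return -1;
	}
	printf("default range: [0x%llx, 0x%llx)\n",
	    (unsigned long long)r.min_address,
	    (unsigned long long)r.max_address);
	return 0;
}
#endif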
3040 #endif /* DEBUG || DEVELOPMENT */
3041 #endif /* CONFIG_MAP_RANGES */
3042 
3046 extern uint64_t vm_map_range_overflows_count;
3047 SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
3048 extern boolean_t vm_map_range_overflows_log;
3049 SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3050 
3051 extern uint64_t c_seg_filled_no_contention;
3052 extern uint64_t c_seg_filled_contention;
3053 extern clock_sec_t c_seg_filled_contention_sec_max;
3054 extern clock_nsec_t c_seg_filled_contention_nsec_max;
3055 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
3056 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
3057 SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
3058 SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
3059 #if (XNU_TARGET_OS_OSX && __arm64__)
3060 extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
3061 extern int c_process_major_yield_after; /* yield after moving ? segments */
3062 extern uint64_t c_process_major_reports;
3063 extern clock_sec_t c_process_major_max_sec;
3064 extern clock_nsec_t c_process_major_max_nsec;
3065 extern uint32_t c_process_major_peak_segcount;
3066 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
3067 SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
3068 SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
3069 SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
3070 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
3071 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
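/*
 * Illustrative userspace-side sketch (not compiled here): setting the major
 * compaction reporting threshold above and reading back the report counter.
 */
#if 0 /* userspace usage sketch */
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
tune_major_compaction_reporting(unsigned int over_ms)
{
	uint64_t reports = 0;
	size_t len = sizeof(reports);

	/* report any major compaction that takes longer than over_ms */
	if (sysctlbyname("vm.c_process_major_report_over_ms", NULL, NULL,
	    &over_ms, sizeof(over_ms)) != 0) {
		return -1;
	}
	if (sysctlbyname("vm.c_process_major_reports", &reports, &len, NULL, 0) != 0) {
		return -1;
	}
	printf("major compaction reports so far: %llu\n",
	    (unsigned long long)reports);
	return 0;
}
#endif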
3072 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3073 
3074 #if DEVELOPMENT || DEBUG
3075 extern int panic_object_not_alive;
3076 SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
3077 #endif /* DEVELOPMENT || DEBUG */
3078 
3079 #if FBDP_DEBUG_OBJECT_NO_PAGER
3080 extern int fbdp_no_panic;
3081 SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
3082 #endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
3083 
3084 
3085 #if DEVELOPMENT || DEBUG
3086 
3087 
3088 /* The largest possible single segment + its slots is (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)), so this should be enough. */
3089 #define SYSCTL_SEG_BUF_SIZE (8 * 1024)
3090 
3091 extern uint32_t c_segments_available;
3092 
3093 struct sysctl_buf_header {
3094 	uint32_t magic;
3095 } __attribute__((packed));
3096 
3097 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3098  * Instead of doing everything here, it calls into vm_compressor.c. */
3099 static int
3100 sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3101 {
3102 	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3103 	if (!buf) {
3104 		return ENOMEM;
3105 	}
3106 	size_t offset = 0;
3107 	int error = 0;
3108 	int segno = 0;
3109 	/* 4-byte header identifying the version of the data format.
3110 	 * This should be incremented if c_segment_info or c_slot_info change. */
3111 	((struct sysctl_buf_header*)buf)->magic = VM_C_SEGMENT_INFO_MAGIC;
3112 	offset += sizeof(uint32_t);
3113 
3114 	while (segno < c_segments_available) {
3115 		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3116 		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz);
3117 		if (kr == KERN_NO_SPACE) {
3118 			/* failed to add another segment, push the current buffer out and try again */
3119 			if (offset == 0) {
3120 				error = EINVAL; /* no space to write but I didn't write anything, shouldn't really happen */
3121 				goto out;
3122 			}
3123 			/* write out chunk */
3124 			error = SYSCTL_OUT(req, buf, offset);
3125 			if (error) {
3126 				goto out;
3127 			}
3128 			offset = 0;
3129 			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3130 			/* don't increment segno, need to try again saving the current one */
3131 		} else if (kr != KERN_SUCCESS) {
3132 			error = EINVAL;
3133 			goto out;
3134 		} else {
3135 			offset += left_sz;
3136 			++segno;
3137 		}
3138 	}
3139 
3140 	if (offset > 0) { /* write last chunk */
3141 		error = SYSCTL_OUT(req, buf, offset);
3142 	}
3143 
3144 out:
3145 	kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
3146 	return error;
3147 }
3148 
3149 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
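/*
 * Illustrative userspace-side sketch (not compiled here): reading the
 * serialized segment stream.  The first 4 bytes are the format-version magic
 * (VM_C_SEGMENT_INFO_MAGIC, whose value lives in the vm_compressor headers);
 * the rest is a packed sequence of c_segment_info records that a decoder
 * built against the same headers would walk.  The buffer size is an
 * arbitrary assumption.
 */
#if 0 /* userspace usage sketch */
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

int
read_compressor_segments(void)
{
	size_t len = 16 * 1024 * 1024; /* generous upper bound, assumption */
	uint8_t *buf = malloc(len);
	uint32_t magic;

	if (buf == NULL) {
		return -1;
	}
	if (sysctlbyname("vm.compressor_segments", buf, &len, NULL, 0) != 0 ||
	    len < sizeof(magic)) {
		free(buf);
		return -1;
	}
	memcpy(&magic, buf, sizeof(magic));
	printf("format magic 0x%x, %zu payload bytes\n",
	    magic, len - sizeof(magic));
	/* ...hand buf + 4 to a decoder built against the same headers... */
	free(buf);
	return 0;
}
#endif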
3150 
3151 
3152 extern uint32_t vm_compressor_fragmentation_level(void);
3153 
3154 static int
3155 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3156 {
3157 	uint32_t value = vm_compressor_fragmentation_level();
3158 	return SYSCTL_OUT(req, &value, sizeof(value));
3159 }
3160 
3161 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3162 
3163 
3164 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3165 
3166 
3167 /* This sysctl iterates over all the entries of the vm_map of a given process and writes some info about the vm_object pointed to by each entry.
3168  * This can be used to map where all of a process's pages are located in the compressor.
3169  */
3170 static int
3171 sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3172 {
3173 	int error = 0;
3174 	char *buf = NULL;
3175 	proc_t p = PROC_NULL;
3176 	task_t task = TASK_NULL;
3177 	vm_map_t map = VM_MAP_NULL;
3178 	__block size_t offset = 0;
3179 
3180 	/* go from pid to proc to task to vm_map. see sysctl_procargsx() for another example of this progression */
3181 	int *name = arg1;
3182 	int namelen = arg2;
3183 	if (namelen < 1) {
3184 		return EINVAL;
3185 	}
3186 	int pid = name[0];
3187 	p = proc_find(pid);  /* this increments a reference to the proc */
3188 	if (p == PROC_NULL) {
3189 		return EINVAL;
3190 	}
3191 	task = proc_task(p);
3192 	proc_rele(p);  /* decrement ref of proc */
3193 	p = PROC_NULL;
3194 	if (task == TASK_NULL) {
3195 		return EINVAL;
3196 	}
3197 	/* convert proc reference to task reference */
3198 	task_reference(task);
3199 	/* task reference to map reference */
3200 	map = get_task_map_reference(task);
3201 	task_deallocate(task);
3202 
3203 	if (map == VM_MAP_NULL) {
3204 		return EINVAL;  /* nothing allocated yet */
3205 	}
3206 
3207 	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
3208 	if (!buf) {
3209 		error = ENOMEM;
3210 		goto out;
3211 	}
3212 
3213 	/* 4-byte header identifying the version of the data format.
3214 	 * This should be incremented if the vm_map entry or compressor pager info structures change. */
3215 	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
3216 	offset += sizeof(uint32_t);
3217 
3218 	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
3219 		/* write the header, happens only once at the beginning so we should have enough space */
3220 		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3221 		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
3222 		out_hdr->vmi_nentries = nentries;
3223 		offset += sizeof(struct vm_map_info_hdr);
3224 		return KERN_SUCCESS;
3225 	};
3226 
3227 	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
3228 		while (true) { /* try up to 2 times: first write into the current buffer, then retry after flushing it */
3229 			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
3230 			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
3231 			if (kr == KERN_NO_SPACE) {
3232 				/* failed to write anything, flush the current buffer and try again */
3233 				if (offset == 0) {
3234 					return KERN_FAILURE; /* no space to write but I didn't write anything yet, shouldn't really happen */
3235 				}
3236 				/* write out chunk */
3237 				int out_error = SYSCTL_OUT(req, buf, offset);
3238 				if (out_error) {
3239 					return KERN_FAILURE;
3240 				}
3241 				offset = 0;
3242 				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3243 				continue; /* need to retry the entry dump again with the cleaned buffer */
3244 			} else if (kr != KERN_SUCCESS) {
3245 				return kr;
3246 			}
3247 			offset += left_sz;
3248 			break;
3249 		}
3250 		return KERN_SUCCESS;
3251 	};
3252 
3253 	/* this foreach first calls the first callback with the number of entries, then calls the second for every entry.
3254 	 * when the buffer is exhausted, it is flushed to the sysctl and restarted */
3255 	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);
3256 
3257 	if (kr != KERN_SUCCESS) {
3258 		goto out;
3259 	}
3260 
3261 	if (offset > 0) { /* last chunk */
3262 		error = SYSCTL_OUT(req, buf, offset);
3263 	}
3264 
3265 out:
3266 	if (buf != NULL) {
3267 		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3268 	}
3269 	if (map != NULL) {
3270 		vm_map_deallocate(map);
3271 	}
3272 	return error;
3273 }
3274 
3275 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
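/*
 * Illustrative userspace-side sketch (not compiled here): because the OID is
 * a CTLTYPE_NODE, the target pid travels as an extra MIB component rather
 * than as a string.  sysctlnametomib(3) resolves the base name, then the pid
 * is appended before the query.
 */
#if 0 /* userspace usage sketch */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
read_task_slotmap(pid_t pid, void *buf, size_t *lenp)
{
	int mib[CTL_MAXNAME];
	size_t miblen = CTL_MAXNAME - 1;

	if (sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen) != 0) {
		return -1;
	}
	mib[miblen] = pid; /* becomes name[0] in the handler above */
	if (sysctl(mib, (u_int)(miblen + 1), buf, lenp, NULL, 0) != 0) {
		return -1;
	}
	/* *lenp bytes now hold VM_MAP_ENTRY_INFO_MAGIC + header + entries */
	return 0;
}
#endif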
3276 
3277 
3278 
3279 #endif /* DEVELOPMENT || DEBUG */
3280