xref: /xnu-12377.1.9/bsd/vm/vm_unix.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Mach Operating System
30  * Copyright (c) 1987 Carnegie-Mellon University
31  * All rights reserved.  The CMU software License Agreement specifies
32  * the terms and conditions for use and redistribution.
33  */
34 /*
35  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
36  * support for mandatory and extensible security protections.  This notice
37  * is included in support of clause 2.2 (b) of the Apple Public License,
38  * Version 2.0.
39  */
40 #include <vm/vm_options.h>
41 
42 #include <kern/ecc.h>
43 #include <kern/task.h>
44 #include <kern/thread.h>
45 #include <kern/debug.h>
46 #include <kern/extmod_statistics.h>
47 #include <mach/mach_traps.h>
48 #include <mach/port.h>
49 #include <mach/sdt.h>
50 #include <mach/task.h>
51 #include <mach/task_access.h>
52 #include <mach/task_special_ports.h>
53 #include <mach/time_value.h>
54 #include <mach/vm_map.h>
55 #include <mach/vm_param.h>
56 #include <mach/vm_prot.h>
57 #include <machine/machine_routines.h>
58 
59 #include <sys/file_internal.h>
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/dir.h>
63 #include <sys/namei.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/vm.h>
67 #include <sys/file.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/mount.h>
70 #include <sys/xattr.h>
71 #include <sys/trace.h>
72 #include <sys/kernel.h>
73 #include <sys/ubc_internal.h>
74 #include <sys/user.h>
75 #include <sys/syslog.h>
76 #include <sys/stat.h>
77 #include <sys/sysproto.h>
78 #include <sys/mman.h>
79 #include <sys/sysctl.h>
80 #include <sys/cprotect.h>
81 #include <sys/kpi_socket.h>
82 #include <sys/kas_info.h>
83 #include <sys/socket.h>
84 #include <sys/socketvar.h>
85 #include <sys/random.h>
86 #include <sys/code_signing.h>
87 #if NECP
88 #include <net/necp.h>
89 #endif /* NECP */
90 #if SKYWALK
91 #include <skywalk/os_channel.h>
92 #endif /* SKYWALK */
93 
94 #include <security/audit/audit.h>
95 #include <security/mac.h>
96 #include <bsm/audit_kevents.h>
97 
98 #include <kern/kalloc.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_kern_xnu.h>
101 #include <vm/vm_pageout_xnu.h>
102 
103 #include <mach/shared_region.h>
104 #include <vm/vm_shared_region_internal.h>
105 
106 #include <vm/vm_dyld_pager_internal.h>
107 #include <vm/vm_protos_internal.h>
108 #include <vm/vm_compressor_info.h>         /* for c_segment_info */
109 #include <vm/vm_compressor_xnu.h>          /* for vm_compressor_serialize_segment_debug_info() */
110 #include <vm/vm_object_xnu.h>              /* for vm_chead_select_t */
111 #include <vm/vm_memory_entry_xnu.h>
112 #include <vm/vm_iokit.h>
113 #include <vm/vm_reclaim_xnu.h>
114 
115 #include <sys/kern_memorystatus.h>
116 #include <sys/kern_memorystatus_freeze.h>
117 #include <sys/proc_internal.h>
118 
119 #include <mach-o/fixup-chains.h>
120 
121 #if CONFIG_MACF
122 #include <security/mac_framework.h>
123 #endif
124 
125 #include <kern/bits.h>
126 
127 #if CONFIG_CSR
128 #include <sys/csr.h>
129 #endif /* CONFIG_CSR */
130 #include <sys/trust_caches.h>
131 #include <libkern/amfi/amfi.h>
132 #include <IOKit/IOBSD.h>
133 
134 #if VM_MAP_DEBUG_APPLE_PROTECT
135 SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
136 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
137 
138 #if DEVELOPMENT || DEBUG
139 
140 extern int vm_object_cache_evict_all(void);
141 static int
142 sysctl_vm_object_cache_evict SYSCTL_HANDLER_ARGS
143 {
144 #pragma unused(arg1, arg2, req)
145 	(void) vm_object_cache_evict_all();
146 	return 0;
147 }
148 
149 SYSCTL_PROC(_vm, OID_AUTO, object_cache_evict, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
150     0, 0, &sysctl_vm_object_cache_evict, "I", "");
151 
152 static int
153 sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
154 {
155 #pragma unused(arg1, arg2)
156 	vm_offset_t     kaddr;
157 	kern_return_t   kr;
158 	int     error = 0;
159 	int     size = 0;
160 
161 	error = sysctl_handle_int(oidp, &size, 0, req);
162 	if (error || !req->newptr) {
163 		return error;
164 	}
165 
166 	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
167 	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);
168 
169 	if (kr == KERN_SUCCESS) {
170 		kmem_free(kernel_map, kaddr, size);
171 	}
172 
173 	return error;
174 }
175 
176 SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
177     0, 0, &sysctl_kmem_alloc_contig, "I", "");
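/*
 * Hedged user-space sketch for the write-only test control above
 * (DEVELOPMENT || DEBUG kernels only): writing a byte count makes the
 * handler attempt a contiguous kernel allocation of that size and free
 * it again on success.  sysctlbyname(3) is the standard user interface.
 */
#if 0   /* illustrative only, not part of this file's build */
#include <sys/sysctl.h>

static int
try_contig_alloc(int size_in_bytes)
{
	/* write-only sysctl: no old-value buffer */
	return sysctlbyname("vm.kmem_alloc_contig", NULL, NULL,
	           &size_in_bytes, sizeof(size_in_bytes));
}
#endif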
178 
179 extern int vm_region_footprint;
180 SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");
181 
182 static int
183 sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
184 {
185 #pragma unused(arg1, arg2, oidp)
186 	kmem_gobj_stats stats = kmem_get_gobj_stats();
187 
188 	return SYSCTL_OUT(req, &stats, sizeof(stats));
189 }
190 
191 SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
192     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
193     0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");
194 
195 #endif /* DEVELOPMENT || DEBUG */
196 
197 static int
198 sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
199 {
200 #pragma unused(arg1, arg2, oidp)
201 	int     error = 0;
202 	int     value;
203 
204 	value = task_self_region_footprint();
205 	error = SYSCTL_OUT(req, &value, sizeof(int));
206 	if (error) {
207 		return error;
208 	}
209 
210 	if (!req->newptr) {
211 		return 0;
212 	}
213 
214 	error = SYSCTL_IN(req, &value, sizeof(int));
215 	if (error) {
216 		return error;
217 	}
218 	task_self_region_footprint_set(value);
219 	return 0;
220 }
221 SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");
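/*
 * The handler above implements the usual read/write sysctl protocol:
 * SYSCTL_OUT the current value, then SYSCTL_IN the new one if the caller
 * supplied it.  A hedged user-space sketch of the matching round trip:
 */
#if 0   /* illustrative only, not part of this file's build */
#include <sys/sysctl.h>

static int
toggle_self_region_footprint(int enable, int *old_out)
{
	size_t oldlen = sizeof(*old_out);

	/* returns the previous setting while installing the new one */
	return sysctlbyname("vm.self_region_footprint", old_out, &oldlen,
	           &enable, sizeof(enable));
}
#endif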
222 
223 static int
224 sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
225 {
226 #pragma unused(arg1, arg2, oidp)
227 	int     error = 0;
228 	int     value;
229 
230 	value = (1 << thread_self_region_page_shift());
231 	error = SYSCTL_OUT(req, &value, sizeof(int));
232 	if (error) {
233 		return error;
234 	}
235 
236 	if (!req->newptr) {
237 		return 0;
238 	}
239 
240 	error = SYSCTL_IN(req, &value, sizeof(int));
241 	if (error) {
242 		return error;
243 	}
244 
245 	if (value != 0 && value != 4096 && value != 16384) {
246 		return EINVAL;
247 	}
248 
249 #if !__ARM_MIXED_PAGE_SIZE__
250 	if (value != vm_map_page_size(current_map())) {
251 		return EINVAL;
252 	}
253 #endif /* !__ARM_MIXED_PAGE_SIZE__ */
254 
255 	thread_self_region_page_shift_set(bit_first(value));
256 	return 0;
257 }
258 SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
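/*
 * Worked example of the validation above: writes of 0, 4096 (4K) or
 * 16384 (16K) pass the first check; without __ARM_MIXED_PAGE_SIZE__ the
 * value must additionally match the current map's page size, so writing
 * 4096 in a 16K address space fails with EINVAL.  The value is stored as
 * a page shift: bit_first(16384) == 14, bit_first(4096) == 12.
 */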
259 
260 static int
261 sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
262 {
263 #pragma unused(arg1, arg2, oidp)
264 	int     error = 0;
265 	int     value;
266 	kern_return_t kr;
267 
268 	value = task_self_region_info_flags();
269 	error = SYSCTL_OUT(req, &value, sizeof(int));
270 	if (error) {
271 		return error;
272 	}
273 
274 	if (!req->newptr) {
275 		return 0;
276 	}
277 
278 	error = SYSCTL_IN(req, &value, sizeof(int));
279 	if (error) {
280 		return error;
281 	}
282 
283 	kr = task_self_region_info_flags_set(value);
284 	if (kr != KERN_SUCCESS) {
285 		return EINVAL;
286 	}
287 
288 	return 0;
289 }
290 SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");
291 
292 
293 #if DEVELOPMENT || DEBUG
294 extern int panic_on_unsigned_execute;
295 SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");
296 
297 extern int vm_log_xnu_user_debug;
298 SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
299 #endif /* DEVELOPMENT || DEBUG */
300 
301 extern int vm_log_map_delete_permanent_prot_none;
302 SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");
303 
304 extern int cs_executable_create_upl;
305 extern int cs_executable_wire;
306 SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
307 SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");
308 
309 extern int apple_protect_pager_count;
310 extern int apple_protect_pager_count_mapped;
311 extern unsigned int apple_protect_pager_cache_limit;
312 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
313 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
314 SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");
315 
316 #if DEVELOPMENT || DEBUG
317 extern int radar_20146450;
318 SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");
319 
320 extern int macho_printf;
321 SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");
322 
323 extern int apple_protect_pager_data_request_debug;
324 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");
325 
326 extern unsigned int vm_object_copy_delayed_paging_wait_disable;
327 EXPERIMENT_FACTOR_LEGACY_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");
328 
329 __enum_closed_decl(vm_submap_test_op, uint32_t, {
330 	vsto_make_submap = 1,  /* make submap from entries in current_map()
331 	                        * at start..end, offset ignored */
332 	vsto_remap_submap = 2, /* map in current_map() at start..end,
333 	                        * from parent address submap_base_address
334 	                        * and submap address offset */
335 	vsto_end
336 });
337 
338 static int
339 sysctl_vm_submap_test_ctl SYSCTL_HANDLER_ARGS
340 {
341 	int error;
342 	struct {
343 		vm_submap_test_op op;
344 		mach_vm_address_t submap_base_address;
345 		mach_vm_address_t start;
346 		mach_vm_address_t end;
347 		mach_vm_address_t offset;
348 	} args;
349 	if (req->newlen != sizeof(args)) {
350 		return EINVAL;
351 	}
352 	error = SYSCTL_IN(req, &args, sizeof(args));
353 	if (error) {
354 		return error;
355 	}
356 
357 	switch (args.op) {
358 	case vsto_make_submap:
359 		vm_map_testing_make_sealed_submap(current_map(), args.start, args.end);
360 		break;
361 	case vsto_remap_submap:
362 		vm_map_testing_remap_submap(current_map(),
363 		    args.submap_base_address, args.start, args.end, args.offset);
364 		break;
365 	default:
366 		return EINVAL;
367 	}
368 
369 	return 0;
370 }
371 SYSCTL_PROC(_vm, OID_AUTO, submap_test_ctl, CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, &sysctl_vm_submap_test_ctl, "-", "");
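/*
 * Unlike the integer sysctls, the test control above consumes a binary
 * struct and insists on an exact size match.  A hedged user-space sketch
 * (DEVELOPMENT || DEBUG only); the struct layout mirrors the handler's
 * local "args" and is an assumption of this example:
 */
#if 0   /* illustrative only, not part of this file's build */
#include <stdint.h>
#include <sys/sysctl.h>
#include <mach/mach.h>

struct vm_submap_test_args {
	uint32_t          op;                   /* vm_submap_test_op */
	mach_vm_address_t submap_base_address;
	mach_vm_address_t start;
	mach_vm_address_t end;
	mach_vm_address_t offset;
};

static int
make_test_submap(mach_vm_address_t start, mach_vm_address_t end)
{
	struct vm_submap_test_args args = {
		.op = 1,        /* vsto_make_submap; offset is ignored */
		.start = start,
		.end = end,
	};
	return sysctlbyname("vm.submap_test_ctl", NULL, NULL,
	           &args, sizeof(args));
}
#endif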
372 
373 #if __arm64__
374 /* These are meant to support the page table accounting unit test. */
375 extern unsigned int arm_hardware_page_size;
376 extern unsigned int arm_pt_desc_size;
377 extern unsigned int arm_pt_root_size;
378 extern unsigned int inuse_user_tteroot_count;
379 extern unsigned int inuse_kernel_tteroot_count;
380 extern unsigned int inuse_user_ttepages_count;
381 extern unsigned int inuse_kernel_ttepages_count;
382 extern unsigned int inuse_user_ptepages_count;
383 extern unsigned int inuse_kernel_ptepages_count;
384 SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
385 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
386 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
387 SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
388 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
389 SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
390 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
391 SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
392 SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
393 #if !CONFIG_SPTM
394 extern unsigned int free_page_size_tt_count;
395 extern unsigned int free_tt_count;
396 SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
397 SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
398 #endif
399 #if DEVELOPMENT || DEBUG
400 extern unsigned long pmap_asid_flushes;
401 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
402 extern unsigned long pmap_asid_hits;
403 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
404 extern unsigned long pmap_asid_misses;
405 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
406 extern unsigned long pmap_speculation_restrictions;
407 SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
408 #endif
409 #endif /* __arm64__ */
410 #endif /* DEVELOPMENT || DEBUG */
411 
412 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
413 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
414 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
415 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
416 SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
417 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
418 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
419 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
420 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
421 #if VM_SCAN_FOR_SHADOW_CHAIN
422 static int vm_shadow_max_enabled = 0;    /* Disabled by default */
423 extern int proc_shadow_max(void);
424 static int
425 vm_shadow_max SYSCTL_HANDLER_ARGS
426 {
427 #pragma unused(arg1, arg2, oidp)
428 	int value = 0;
429 
430 	if (vm_shadow_max_enabled) {
431 		value = proc_shadow_max();
432 	}
433 
434 	return SYSCTL_OUT(req, &value, sizeof(value));
435 }
436 SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
437     0, 0, &vm_shadow_max, "I", "");
438 
439 SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");
440 
441 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
442 
443 SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");
444 
445 #if PAGE_SLEEP_WITH_INHERITOR
446 #if DEVELOPMENT || DEBUG
447 extern uint32_t page_worker_table_size;
448 SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
449 SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
450 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
451 SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
452 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
453 #endif /* DEVELOPMENT || DEBUG */
454 #endif /* PAGE_SLEEP_WITH_INHERITOR */
455 
456 #if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
457 extern uint32_t vm_cheads;
458 extern vm_chead_select_t vm_chead_select;
459 extern boolean_t vm_chead_rehint;
460 #if DEVELOPMENT || DEBUG
461 SYSCTL_UINT(_vm, OID_AUTO, compressor_heads, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cheads, 0, "");
462 SYSCTL_UINT(_vm, OID_AUTO, compressor_head_select, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_select, 0, "");
463 SYSCTL_INT(_vm, OID_AUTO, compressor_head_rehint, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_rehint, 0, "");
464 #endif /* DEVELOPMENT || DEBUG */
465 EXPERIMENT_FACTOR_UINT(compressor_heads, &vm_cheads, 1, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT, "");
466 EXPERIMENT_FACTOR_UINT(compressor_head_select, &vm_chead_select, CSEL_MIN, CSEL_MAX, "");
467 EXPERIMENT_FACTOR_INT(compressor_head_rehint, &vm_chead_rehint, 0, 1, "");
468 #endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */
469 
470 /*
471  * Sysctls related to data/stack execution.  See osfmk/vm/vm_map.c
472  */
473 
474 #if DEVELOPMENT || DEBUG
475 extern int allow_stack_exec, allow_data_exec;
476 
477 SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
478 SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");
479 
480 #endif /* DEVELOPMENT || DEBUG */
481 
482 static const char *prot_values[] = {
483 	"none",
484 	"read-only",
485 	"write-only",
486 	"read-write",
487 	"execute-only",
488 	"read-execute",
489 	"write-execute",
490 	"read-write-execute"
491 };
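/*
 * The table above is indexed by the low three vm_prot_t bits, so
 * prot_values[prot & VM_PROT_ALL] maps directly to a name, e.g.
 * VM_PROT_READ|VM_PROT_WRITE (0x3) -> "read-write" and
 * VM_PROT_READ|VM_PROT_EXECUTE (0x5) -> "read-execute".
 */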
492 
493 void
494 log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
495 {
496 	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
497 	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
498 }
499 
500 /*
501  * shared_region_unnest_logging: level of logging of unnesting events
502  * 0	- no logging
503  * 1	- throttled logging of unexpected unnesting events (default)
504  * 2	- unthrottled logging of unexpected unnesting events
505  * 3+	- unthrottled logging of all unnesting events
506  */
507 int shared_region_unnest_logging = 1;
508 
509 SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
510     &shared_region_unnest_logging, 0, "");
511 
512 int vm_shared_region_unnest_log_interval = 10;
513 int shared_region_unnest_log_count_threshold = 5;
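/*
 * Worked example of the throttle in log_unnest_badness() below: at level 1
 * with the defaults above, after an initial report roughly
 * shared_region_unnest_log_count_threshold further reports are printed
 * within a vm_shared_region_unnest_log_interval-second window before
 * suppression kicks in; the counter resets once a report arrives after
 * the interval has elapsed.  Levels 2 and above bypass the throttle
 * entirely.
 */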
514 
515 
516 #if XNU_TARGET_OS_OSX
517 
518 #if defined (__x86_64__)
519 static int scdir_enforce = 1;
520 #else /* defined (__x86_64__) */
521 static int scdir_enforce = 0;   /* AOT caches live elsewhere */
522 #endif /* defined (__x86_64__) */
523 
524 static char *scdir_path[] = {
525 	"/System/Library/dyld/",
526 	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
527 	"/System/Cryptexes/OS/System/Library/dyld",
528 	NULL
529 };
530 
531 #else /* XNU_TARGET_OS_OSX */
532 
533 static int scdir_enforce = 0;
534 static char *scdir_path[] = {
535 	"/System/Library/Caches/com.apple.dyld/",
536 	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
537 	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
538 	NULL
539 };
540 
541 #endif /* XNU_TARGET_OS_OSX */
542 
543 static char *driverkit_scdir_path[] = {
544 	"/System/DriverKit/System/Library/dyld/",
545 #if XNU_TARGET_OS_OSX
546 	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
547 #else
548 	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
549 #endif /* XNU_TARGET_OS_OSX */
550 	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
551 	NULL
552 };
553 
554 #ifndef SECURE_KERNEL
555 static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
556 {
557 #if CONFIG_CSR
558 	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
559 		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
560 		return EPERM;
561 	}
562 #endif /* CONFIG_CSR */
563 	return sysctl_handle_int(oidp, arg1, arg2, req);
564 }
565 
566 SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
567 #endif
568 
569 /* These log-rate-throttling state variables aren't thread-safe, but
570  * they are good enough for this purpose.
571  */
572 static int64_t last_unnest_log_time = 0;
573 static int shared_region_unnest_log_count = 0;
574 
575 void
576 log_unnest_badness(
577 	vm_map_t        m,
578 	vm_map_offset_t s,
579 	vm_map_offset_t e,
580 	boolean_t       is_nested_map,
581 	vm_map_offset_t lowest_unnestable_addr)
582 {
583 	struct timeval  tv;
584 
585 	if (shared_region_unnest_logging == 0) {
586 		return;
587 	}
588 
589 	if (shared_region_unnest_logging <= 2 &&
590 	    is_nested_map &&
591 	    s >= lowest_unnestable_addr) {
592 		/*
593 		 * Unnesting of writable map entries is fine.
594 		 */
595 		return;
596 	}
597 
598 	if (shared_region_unnest_logging <= 1) {
599 		microtime(&tv);
600 		if ((tv.tv_sec - last_unnest_log_time) <
601 		    vm_shared_region_unnest_log_interval) {
602 			if (shared_region_unnest_log_count++ >
603 			    shared_region_unnest_log_count_threshold) {
604 				return;
605 			}
606 		} else {
607 			last_unnest_log_time = tv.tv_sec;
608 			shared_region_unnest_log_count = 0;
609 		}
610 	}
611 
612 	DTRACE_VM4(log_unnest_badness,
613 	    vm_map_t, m,
614 	    vm_map_offset_t, s,
615 	    vm_map_offset_t, e,
616 	    vm_map_offset_t, lowest_unnestable_addr);
617 	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
618 }
619 
620 uint64_t
621 vm_purge_filebacked_pagers(void)
622 {
623 	uint64_t pages_purged;
624 
625 	pages_purged = 0;
626 	pages_purged += apple_protect_pager_purge_all();
627 	pages_purged += shared_region_pager_purge_all();
628 	pages_purged += dyld_pager_purge_all();
629 #if DEVELOPMENT || DEBUG
630 	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
631 #endif /* DEVELOPMENT || DEBUG */
632 	return pages_purged;
633 }
634 
635 int
636 useracc(
637 	user_addr_ut    addr_u,
638 	user_size_ut    len_u,
639 	int             prot)
640 {
641 	vm_map_t        map;
642 	vm_prot_t       vm_prot = VM_PROT_WRITE;
643 
644 	map = current_map();
645 
646 	if (prot == B_READ) {
647 		vm_prot = VM_PROT_READ;
648 	}
649 
650 	return vm_map_check_protection(map, addr_u,
651 	           vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
652 	           VM_SANITIZE_CALLER_USERACC);
653 }
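/*
 * Hedged in-kernel usage sketch for useracc(): validate a user range
 * before touching it.  B_READ checks for read access; any other value
 * checks for write access.  (uap->buf and uap->len stand in for
 * hypothetical syscall arguments here.)
 *
 *	if (!useracc(uap->buf, uap->len, B_READ)) {
 *		return EFAULT;
 *	}
 */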
654 
655 #if XNU_PLATFORM_MacOSX
656 static __attribute__((always_inline, warn_unused_result))
657 kern_return_t
658 vslock_sanitize(
659 	vm_map_t                map,
660 	user_addr_ut            addr_u,
661 	user_size_ut            len_u,
662 	vm_sanitize_caller_t    vm_sanitize_caller,
663 	vm_map_offset_t        *start,
664 	vm_map_offset_t        *end,
665 	vm_map_size_t          *size)
666 {
667 	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
668 	           map,
669 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
670 	           size);
671 }
672 #endif /* XNU_PLATFORM_MacOSX */
673 
674 int
675 vslock(user_addr_ut addr, user_size_ut len)
676 {
677 	kern_return_t kret;
678 
679 #if XNU_PLATFORM_MacOSX
680 	/*
681 	 * Preserve the previous macOS behavior for overflows, for binary
682 	 * compatibility: i.e., return success on overflow without doing
683 	 * anything. For error compatibility, overflow errors return
684 	 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr converts
685 	 * to KERN_SUCCESS.
686 	 */
687 	vm_map_offset_t start, end;
688 	vm_map_size_t   size;
689 
690 	kret = vslock_sanitize(current_map(),
691 	    addr,
692 	    len,
693 	    VM_SANITIZE_CALLER_VSLOCK,
694 	    &start,
695 	    &end,
696 	    &size);
697 	if (__improbable(kret != KERN_SUCCESS)) {
698 		switch (vm_sanitize_get_kr(kret)) {
699 		case KERN_SUCCESS:
700 			return 0;
701 		case KERN_INVALID_ADDRESS:
702 		case KERN_NO_SPACE:
703 			return ENOMEM;
704 		case KERN_PROTECTION_FAILURE:
705 			return EACCES;
706 		default:
707 			return EINVAL;
708 		}
709 	}
710 #endif /* XNU_PLATFORM_MacOSX */
711 
712 	kret = vm_map_wire_kernel(current_map(), addr,
713 	    vm_sanitize_compute_ut_end(addr, len),
714 	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
715 	    VM_KERN_MEMORY_BSD,
716 	    FALSE);
717 
718 	switch (kret) {
719 	case KERN_SUCCESS:
720 		return 0;
721 	case KERN_INVALID_ADDRESS:
722 	case KERN_NO_SPACE:
723 		return ENOMEM;
724 	case KERN_PROTECTION_FAILURE:
725 		return EACCES;
726 	default:
727 		return EINVAL;
728 	}
729 }
730 
731 int
732 vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
733 {
734 #if FIXME  /* [ */
735 	pmap_t          pmap;
736 	vm_page_t       pg;
737 	vm_map_offset_t vaddr;
738 	ppnum_t         paddr;
739 #endif  /* FIXME ] */
740 	kern_return_t   kret;
741 	vm_map_t        map;
742 
743 	map = current_map();
744 
745 #if FIXME  /* [ */
746 	if (dirtied) {
747 		pmap = get_task_pmap(current_task());
748 		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
749 		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
750 		    vaddr += PAGE_SIZE) {
751 			paddr = pmap_find_phys(pmap, vaddr);
752 			pg = PHYS_TO_VM_PAGE(paddr);
753 			vm_page_set_modified(pg);
754 		}
755 	}
756 #endif  /* FIXME ] */
757 #ifdef  lint
758 	dirtied++;
759 #endif  /* lint */
760 
761 #if XNU_PLATFORM_MacOSX
762 	/*
763 	 * Preserve the previous macOS behavior for overflows, for binary
764 	 * compatibility: i.e., return success on overflow without doing
765 	 * anything. For error compatibility, overflow errors return
766 	 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr converts
767 	 * to KERN_SUCCESS.
768 	 */
769 	vm_map_offset_t start, end;
770 	vm_map_size_t   size;
771 
772 	kret = vslock_sanitize(map,
773 	    addr,
774 	    len,
775 	    VM_SANITIZE_CALLER_VSUNLOCK,
776 	    &start,
777 	    &end,
778 	    &size);
779 	if (__improbable(kret != KERN_SUCCESS)) {
780 		switch (vm_sanitize_get_kr(kret)) {
781 		case KERN_SUCCESS:
782 			return 0;
783 		case KERN_INVALID_ADDRESS:
784 		case KERN_NO_SPACE:
785 			return ENOMEM;
786 		case KERN_PROTECTION_FAILURE:
787 			return EACCES;
788 		default:
789 			return EINVAL;
790 		}
791 	}
792 #endif /* XNU_PLATFORM_MacOSX */
793 
794 	kret = vm_map_unwire(map, addr,
795 	    vm_sanitize_compute_ut_end(addr, len), false);
796 	switch (kret) {
797 	case KERN_SUCCESS:
798 		return 0;
799 	case KERN_INVALID_ADDRESS:
800 	case KERN_NO_SPACE:
801 		return ENOMEM;
802 	case KERN_PROTECTION_FAILURE:
803 		return EACCES;
804 	default:
805 		return EINVAL;
806 	}
807 }
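/*
 * vslock()/vsunlock() are used as a bracketing pair: wire the user pages
 * for the duration of an operation that must not fault (historically
 * physio-style transfers), then unwire them.  A hedged sketch:
 *
 *	if ((error = vslock(addr, len)) != 0) {
 *		return error;
 *	}
 *	// ... transfer to/from the wired range ...
 *	error = vsunlock(addr, len, 1);	// 1: pages were dirtied
 *
 * Note that the "dirtied" argument is currently unused (see the FIXME
 * block above).
 */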
808 
809 int
810 subyte(
811 	user_addr_t addr,
812 	int byte)
813 {
814 	char character;
815 
816 	character = (char)byte;
817 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
818 }
819 
820 int
821 suibyte(
822 	user_addr_t addr,
823 	int byte)
824 {
825 	char character;
826 
827 	character = (char)byte;
828 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
829 }
830 
831 int
832 fubyte(user_addr_t addr)
833 {
834 	unsigned char byte;
835 
836 	if (copyin(addr, (void *) &byte, sizeof(char))) {
837 		return -1;
838 	}
839 	return byte;
840 }
841 
842 int
843 fuibyte(user_addr_t addr)
844 {
845 	unsigned char byte;
846 
847 	if (copyin(addr, (void *) &(byte), sizeof(char))) {
848 		return -1;
849 	}
850 	return byte;
851 }
852 
853 int
854 suword(
855 	user_addr_t addr,
856 	long word)
857 {
858 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
859 }
860 
861 long
862 fuword(user_addr_t addr)
863 {
864 	long word = 0;
865 
866 	if (copyin(addr, (void *) &word, sizeof(int))) {
867 		return -1;
868 	}
869 	return word;
870 }
871 
872 /* suiword and fuiword are the same as suword and fuword, respectively */
873 
874 int
875 suiword(
876 	user_addr_t addr,
877 	long word)
878 {
879 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
880 }
881 
882 long
883 fuiword(user_addr_t addr)
884 {
885 	long word = 0;
886 
887 	if (copyin(addr, (void *) &word, sizeof(int))) {
888 		return -1;
889 	}
890 	return word;
891 }
892 
893 /*
894  * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
895  * fetching and setting of process-sized size_t and pointer values.
896  */
897 int
898 sulong(user_addr_t addr, int64_t word)
899 {
900 	if (IS_64BIT_PROCESS(current_proc())) {
901 		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
902 	} else {
903 		return suiword(addr, (long)word);
904 	}
905 }
906 
907 int64_t
908 fulong(user_addr_t addr)
909 {
910 	int64_t longword;
911 
912 	if (IS_64BIT_PROCESS(current_proc())) {
913 		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
914 			return -1;
915 		}
916 		return longword;
917 	} else {
918 		return (int64_t)fuiword(addr);
919 	}
920 }
921 
922 int
923 suulong(user_addr_t addr, uint64_t uword)
924 {
925 	if (IS_64BIT_PROCESS(current_proc())) {
926 		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
927 	} else {
928 		return suiword(addr, (uint32_t)uword);
929 	}
930 }
931 
932 uint64_t
933 fuulong(user_addr_t addr)
934 {
935 	uint64_t ulongword;
936 
937 	if (IS_64BIT_PROCESS(current_proc())) {
938 		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
939 			return -1ULL;
940 		}
941 		return ulongword;
942 	} else {
943 		return (uint64_t)fuiword(addr);
944 	}
945 }
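/*
 * Hedged sketch of the process-sized accessors above: store and fetch a
 * pointer-sized value in a way that works for both 32-bit and 64-bit
 * callers.  "uaddr" is a hypothetical user destination.
 *
 *	if (sulong(uaddr, (int64_t)value) != 0) {
 *		return EFAULT;			// copyout failed
 *	}
 *	int64_t back = fulong(uaddr);		// returns -1 on fault, which
 *						// is ambiguous with a stored -1
 */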
946 
947 int
948 swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
949 {
950 	return ENOTSUP;
951 }
952 
953 #if defined(SECURE_KERNEL)
954 static int kern_secure_kernel = 1;
955 #else
956 static int kern_secure_kernel = 0;
957 #endif
958 
959 SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
960 SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
961     &shared_region_trace_level, 0, "");
962 SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
963     &shared_region_version, 0, "");
964 SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
965     &shared_region_persistence, 0, "");
966 
967 /*
968  * shared_region_check_np:
969  *
970  * This system call is intended for dyld.
971  *
972  * dyld calls this when any process starts to see if the process's shared
973  * region is already set up and ready to use.
974  * This call returns the base address of the first mapping in the
975  * process's shared region.
976  * dyld will then check what's mapped at that address.
977  *
978  * If the shared region is empty, dyld will then attempt to map the shared
979  * cache file in the shared region via the shared_region_map_and_slide_2_np()
980  * system call.
981  *
982  * If something's already mapped in the shared region, dyld will check if it
983  * matches the shared cache it would like to use for that process.
984  * If it matches, everything's ready and the process can proceed and use the
985  * shared region.
986  * If it doesn't match, dyld will unmap the shared region and map the shared
987  * cache into the process's address space via mmap().
988  *
989  * A NULL pointer argument can be used by dyld to indicate it has unmapped
990  * the shared region. We will remove the shared_region reference from the task.
991  *
992  * ERROR VALUES
993  * EINVAL	no shared region
994  * ENOMEM	shared region is empty
995  * EFAULT	bad address for "start_address"
996  */
997 int
998 shared_region_check_np(
999 	__unused struct proc                    *p,
1000 	struct shared_region_check_np_args      *uap,
1001 	__unused int                            *retvalp)
1002 {
1003 	vm_shared_region_t      shared_region;
1004 	mach_vm_offset_t        start_address = 0;
1005 	int                     error = 0;
1006 	kern_return_t           kr = KERN_FAILURE;
1007 	task_t                  task = current_task();
1008 
1009 	SHARED_REGION_TRACE_DEBUG(
1010 		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
1011 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1012 		proc_getpid(p), p->p_comm,
1013 		(uint64_t)uap->start_address));
1014 
1015 	/*
1016 	 * Special value of start_address used to indicate that map_with_linking() should
1017 	 * no longer be allowed in this process
1018 	 */
1019 	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
1020 		p->p_disallow_map_with_linking = TRUE;
1021 		return 0;
1022 	}
1023 
1024 	/* retrieve the current task's shared region */
1025 	shared_region = vm_shared_region_get(task);
1026 	if (shared_region != NULL) {
1027 		/*
1028 		 * A NULL argument is used by dyld to indicate the task
1029 		 * has unmapped its shared region.
1030 		 */
1031 		if (uap->start_address == 0) {
1032 			/* unmap it first */
1033 			vm_shared_region_remove(task, shared_region);
1034 			vm_shared_region_set(task, NULL);
1035 		} else {
1036 			/* retrieve address of its first mapping... */
1037 			kr = vm_shared_region_start_address(shared_region, &start_address);
1038 			if (kr != KERN_SUCCESS) {
1039 				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1040 				    "check_np(0x%llx) "
1041 				    "vm_shared_region_start_address() returned 0x%x\n",
1042 				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1043 				    proc_getpid(p), p->p_comm,
1044 				    (uint64_t)uap->start_address, kr));
1045 				error = ENOMEM;
1046 			}
1047 			if (error == 0) {
1048 				/* Insert the shared region submap and various bits of debug info into the task. */
1049 				kr = vm_shared_region_update_task(task, shared_region, start_address);
1050 				if (kr != KERN_SUCCESS) {
1051 					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1052 					    "check_np(0x%llx) "
1053 				    "vm_shared_region_update_task() returned 0x%x\n",
1054 					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1055 					    proc_getpid(p), p->p_comm,
1056 					    (uint64_t)uap->start_address, kr));
1057 
1058 					error = ENOMEM;
1059 				}
1060 			}
1061 #if __has_feature(ptrauth_calls)
1062 			/*
1063 			 * Remap any section of the shared library that
1064 			 * has authenticated pointers into private memory.
1065 			 */
1066 			if ((error == 0) && (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS)) {
1067 				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1068 				    "check_np(0x%llx) "
1069 				    "vm_shared_region_auth_remap() failed\n",
1070 				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1071 				    proc_getpid(p), p->p_comm,
1072 				    (uint64_t)uap->start_address));
1073 				error = ENOMEM;
1074 			}
1075 #endif /* __has_feature(ptrauth_calls) */
1076 			/* Give the start address to the caller */
1077 			if (error == 0) {
1078 				error = copyout(&start_address,
1079 				    (user_addr_t) uap->start_address,
1080 				    sizeof(start_address));
1081 				if (error != 0) {
1082 					SHARED_REGION_TRACE_ERROR(
1083 						("shared_region: %p [%d(%s)] "
1084 						"check_np(0x%llx) "
1085 						"copyout(0x%llx) error %d\n",
1086 						(void *)VM_KERNEL_ADDRPERM(current_thread()),
1087 						proc_getpid(p), p->p_comm,
1088 						(uint64_t)uap->start_address, (uint64_t)start_address,
1089 						error));
1090 				}
1091 			}
1092 		}
1093 		vm_shared_region_deallocate(shared_region);
1094 	} else {
1095 		/* no shared region! */
1096 		error = EINVAL;
1097 	}
1098 
1099 	SHARED_REGION_TRACE_DEBUG(
1100 		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
1101 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1102 		proc_getpid(p), p->p_comm,
1103 		(uint64_t)uap->start_address, (uint64_t)start_address, error));
1104 
1105 	return error;
1106 }
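/*
 * Hedged sketch of the dyld-side protocol documented above: pass the
 * address of a variable to receive the shared region base, or NULL to
 * report that the region was unmapped.  SYS_shared_region_check_np is
 * assumed to be the number exported via <sys/syscall.h>; dyld itself
 * uses a private wrapper.
 */
#if 0   /* illustrative user-space sketch, not part of this file's build */
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

static int
check_shared_region(uint64_t *base_out)
{
	*base_out = 0;
	/* 0 on success; -1 with errno EINVAL/ENOMEM/EFAULT on failure */
	return syscall(SYS_shared_region_check_np, base_out);
}
#endif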
1107 
1108 
1109 static int
1110 shared_region_copyin(
1111 	struct proc  *p,
1112 	user_addr_t  user_addr,
1113 	unsigned int count,
1114 	unsigned int element_size,
1115 	void         *kernel_data)
1116 {
1117 	int             error = 0;
1118 	vm_size_t       size = count * element_size;
1119 
1120 	error = copyin(user_addr, kernel_data, size);
1121 	if (error) {
1122 		SHARED_REGION_TRACE_ERROR(
1123 			("shared_region: %p [%d(%s)] map(): "
1124 			"copyin(0x%llx, %ld) failed (error=%d)\n",
1125 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1126 			proc_getpid(p), p->p_comm,
1127 			(uint64_t)user_addr, (long)size, error));
1128 	}
1129 	return error;
1130 }
1131 
1132 /*
1133  * A reasonable upper limit to prevent overflow of allocation/copyin.
1134  */
1135 #define _SR_FILE_MAPPINGS_MAX_FILES 256
1136 
1137 /* forward declaration */
1138 __attribute__((noinline))
1139 static void shared_region_map_and_slide_cleanup(
1140 	struct proc              *p,
1141 	uint32_t                 files_count,
1142 	struct _sr_file_mappings *sr_file_mappings,
1143 	struct vm_shared_region  *shared_region);
1144 
1145 /*
1146  * Setup part of _shared_region_map_and_slide().
1147  * It had to be broken out of _shared_region_map_and_slide() to
1148  * prevent compiler inlining from blowing out the stack.
1149  */
1150 __attribute__((noinline))
1151 static int
1152 shared_region_map_and_slide_setup(
1153 	struct proc                         *p,
1154 	uint32_t                            files_count,
1155 	struct shared_file_np               *files,
1156 	uint32_t                            mappings_count,
1157 	struct shared_file_mapping_slide_np *mappings,
1158 	struct _sr_file_mappings            **sr_file_mappings,
1159 	struct vm_shared_region             **shared_region_ptr,
1160 	struct vnode                        *rdir_vp)
1161 {
1162 	int                             error = 0;
1163 	struct _sr_file_mappings        *srfmp;
1164 	uint32_t                        mappings_next;
1165 	struct vnode_attr               va;
1166 	off_t                           fs;
1167 #if CONFIG_MACF
1168 	vm_prot_t                       maxprot = VM_PROT_ALL;
1169 #endif
1170 	uint32_t                        i;
1171 	struct vm_shared_region         *shared_region = NULL;
1172 	boolean_t                       is_driverkit = task_is_driver(current_task());
1173 
1174 	SHARED_REGION_TRACE_DEBUG(
1175 		("shared_region: %p [%d(%s)] -> map_and_slide_setup\n",
1176 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1177 		proc_getpid(p), p->p_comm));
1178 
1179 	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
1180 		error = E2BIG;
1181 		goto done;
1182 	}
1183 	if (files_count == 0) {
1184 		error = EINVAL;
1185 		goto done;
1186 	}
1187 	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
1188 	    Z_WAITOK | Z_ZERO);
1189 	if (*sr_file_mappings == NULL) {
1190 		error = ENOMEM;
1191 		goto done;
1192 	}
1193 	mappings_next = 0;
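	/*
	 * The incoming "mappings" array is a flattened concatenation with
	 * one run per file: e.g. files_count == 2 with sf_mappings_count
	 * of {3, 2} means file 0 owns mappings[0..2] and file 1 owns
	 * mappings[3..4].  The bounds check below rejects counts that
	 * would overrun the caller-supplied total.
	 */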
1194 	for (i = 0; i < files_count; i++) {
1195 		srfmp = &(*sr_file_mappings)[i];
1196 		srfmp->fd = files[i].sf_fd;
1197 		srfmp->mappings_count = files[i].sf_mappings_count;
1198 		srfmp->mappings = &mappings[mappings_next];
1199 		mappings_next += srfmp->mappings_count;
1200 		if (mappings_next > mappings_count) {
1201 			error = EINVAL;
1202 			goto done;
1203 		}
1204 		srfmp->slide = files[i].sf_slide;
1205 	}
1206 
1207 	/* get the process's shared region (setup in vm_map_exec()) */
1208 	shared_region = vm_shared_region_get(current_task());
1209 	*shared_region_ptr = shared_region;
1210 	if (shared_region == NULL) {
1211 		SHARED_REGION_TRACE_ERROR(
1212 			("shared_region: %p [%d(%s)] map(): "
1213 			"no shared region\n",
1214 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1215 			proc_getpid(p), p->p_comm));
1216 		error = EINVAL;
1217 		goto done;
1218 	}
1219 
1220 	/*
1221 	 * Check that the shared region matches the current root
1222 	 * directory of this process.  If not, deny the mapping to
1223 	 * avoid tainting the shared region with something that
1224 	 * doesn't belong in it.
1225 	 */
1226 	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
1227 	if (sr_vnode != NULL ?  rdir_vp != sr_vnode : rdir_vp != rootvnode) {
1228 		SHARED_REGION_TRACE_ERROR(
1229 			("shared_region: map(%p) root_dir mismatch\n",
1230 			(void *)VM_KERNEL_ADDRPERM(current_thread())));
1231 		error = EPERM;
1232 		goto done;
1233 	}
1234 
1235 
1236 	for (srfmp = &(*sr_file_mappings)[0];
1237 	    srfmp < &(*sr_file_mappings)[files_count];
1238 	    srfmp++) {
1239 		if (srfmp->mappings_count == 0) {
1240 			/* no mappings here... */
1241 			continue;
1242 		}
1243 
1244 		/*
1245 		 * A file descriptor of -1 is used to indicate that the data
1246 		 * to be put in the shared region for this mapping comes directly
1247 		 * from the process's address space. Ensure proper alignment.
1248 		 */
1249 		if (srfmp->fd == -1) {
1250 			/* only allow one mapping per fd */
1251 			if (srfmp->mappings_count > 1) {
1252 				SHARED_REGION_TRACE_ERROR(
1253 					("shared_region: %p [%d(%s)] map data >1 mapping\n",
1254 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1255 					proc_getpid(p), p->p_comm));
1256 				error = EINVAL;
1257 				goto done;
1258 			}
1259 
1260 			/*
1261 			 * The destination address and size must be page aligned.
1262 			 */
1263 			struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
1264 			mach_vm_address_t dest_addr = mapping->sms_address;
1265 			mach_vm_size_t    map_size = mapping->sms_size;
1266 			if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
1267 				SHARED_REGION_TRACE_ERROR(
1268 					("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
1269 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1270 					proc_getpid(p), p->p_comm, dest_addr));
1271 				error = EINVAL;
1272 				goto done;
1273 			}
1274 			if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
1275 				SHARED_REGION_TRACE_ERROR(
1276 					("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
1277 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1278 					proc_getpid(p), p->p_comm, map_size));
1279 				error = EINVAL;
1280 				goto done;
1281 			}
1282 			continue;
1283 		}
1284 
1285 		/* get file structure from file descriptor */
1286 		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
1287 		if (error) {
1288 			SHARED_REGION_TRACE_ERROR(
1289 				("shared_region: %p [%d(%s)] map: "
1290 				"fd=%d lookup failed (error=%d)\n",
1291 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1292 				proc_getpid(p), p->p_comm, srfmp->fd, error));
1293 			goto done;
1294 		}
1295 
1296 		/* we need at least read permission on the file */
1297 		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
1298 			SHARED_REGION_TRACE_ERROR(
1299 				("shared_region: %p [%d(%s)] map: "
1300 				"fd=%d not readable\n",
1301 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1302 				proc_getpid(p), p->p_comm, srfmp->fd));
1303 			error = EPERM;
1304 			goto done;
1305 		}
1306 
1307 		/* get vnode from file structure */
1308 		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
1309 		if (error) {
1310 			SHARED_REGION_TRACE_ERROR(
1311 				("shared_region: %p [%d(%s)] map: "
1312 				"fd=%d getwithref failed (error=%d)\n",
1313 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1314 				proc_getpid(p), p->p_comm, srfmp->fd, error));
1315 			goto done;
1316 		}
1317 		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);
1318 
1319 		/* make sure the vnode is a regular file */
1320 		if (srfmp->vp->v_type != VREG) {
1321 			SHARED_REGION_TRACE_ERROR(
1322 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1323 				"not a file (type=%d)\n",
1324 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1325 				proc_getpid(p), p->p_comm,
1326 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1327 				srfmp->vp->v_name, srfmp->vp->v_type));
1328 			error = EINVAL;
1329 			goto done;
1330 		}
1331 
1332 #if CONFIG_MACF
1333 		/* pass in 0 for the offset argument because AMFI does not need the offset
1334 		 *       of the shared cache */
1335 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
1336 		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
1337 		if (error) {
1338 			goto done;
1339 		}
1340 #endif /* CONFIG_MACF */
1341 
1342 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1343 		/*
1344 		 * Check if the shared cache is in the trust cache;
1345 		 * if so, we can skip the root ownership check.
1346 		 */
1347 #if DEVELOPMENT || DEBUG
1348 		/*
1349 		 * Skip both root ownership and trust cache check if
1350 		 * enforcement is disabled.
1351 		 */
1352 		if (!cs_system_enforcement()) {
1353 			goto after_root_check;
1354 		}
1355 #endif /* DEVELOPMENT || DEBUG */
1356 		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
1357 		if (blob == NULL) {
1358 			SHARED_REGION_TRACE_ERROR(
1359 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1360 				"missing CS blob\n",
1361 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1362 				proc_getpid(p), p->p_comm,
1363 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1364 				srfmp->vp->v_name));
1365 			goto root_check;
1366 		}
1367 		const uint8_t *cdhash = csblob_get_cdhash(blob);
1368 		if (cdhash == NULL) {
1369 			SHARED_REGION_TRACE_ERROR(
1370 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1371 				"missing cdhash\n",
1372 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1373 				proc_getpid(p), p->p_comm,
1374 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1375 				srfmp->vp->v_name));
1376 			goto root_check;
1377 		}
1378 
1379 		bool in_trust_cache = false;
1380 		TrustCacheQueryToken_t qt;
1381 		if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
1382 			TCType_t tc_type = kTCTypeInvalid;
1383 			TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
1384 			in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
1385 			    (tc_type == kTCTypeCryptex1BootOS ||
1386 			    tc_type == kTCTypeStatic ||
1387 			    tc_type == kTCTypeEngineering));
1388 		}
1389 		if (!in_trust_cache) {
1390 			SHARED_REGION_TRACE_ERROR(
1391 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1392 				"not in trust cache\n",
1393 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1394 				proc_getpid(p), p->p_comm,
1395 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1396 				srfmp->vp->v_name));
1397 			goto root_check;
1398 		}
1399 		goto after_root_check;
1400 root_check:
1401 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1402 
1403 		/* The shared cache file must be owned by root */
1404 		VATTR_INIT(&va);
1405 		VATTR_WANTED(&va, va_uid);
1406 		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1407 		if (error) {
1408 			SHARED_REGION_TRACE_ERROR(
1409 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1410 				"vnode_getattr(%p) failed (error=%d)\n",
1411 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1412 				proc_getpid(p), p->p_comm,
1413 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1414 				srfmp->vp->v_name,
1415 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1416 				error));
1417 			goto done;
1418 		}
1419 		if (va.va_uid != 0) {
1420 			SHARED_REGION_TRACE_ERROR(
1421 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1422 				"owned by uid=%d instead of 0\n",
1423 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1424 				proc_getpid(p), p->p_comm,
1425 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1426 				srfmp->vp->v_name, va.va_uid));
1427 			error = EPERM;
1428 			goto done;
1429 		}
1430 
1431 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1432 after_root_check:
1433 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1434 
1435 #if CONFIG_CSR
1436 		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
1437 			VATTR_INIT(&va);
1438 			VATTR_WANTED(&va, va_flags);
1439 			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1440 			if (error) {
1441 				SHARED_REGION_TRACE_ERROR(
1442 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1443 					"vnode_getattr(%p) failed (error=%d)\n",
1444 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1445 					proc_getpid(p), p->p_comm,
1446 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1447 					srfmp->vp->v_name,
1448 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1449 					error));
1450 				goto done;
1451 			}
1452 
1453 			if (!(va.va_flags & SF_RESTRICTED)) {
1454 				/*
1455 				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
1456 				 * the shared cache file is NOT SIP-protected, so reject the
1457 				 * mapping request
1458 				 */
1459 				SHARED_REGION_TRACE_ERROR(
1460 					("shared_region: %p [%d(%s)] map(%p:'%s'), "
1461 					"vnode is not SIP-protected.\n",
1462 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1463 					proc_getpid(p), p->p_comm,
1464 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1465 					srfmp->vp->v_name));
1466 				error = EPERM;
1467 				goto done;
1468 			}
1469 		}
1470 #else /* CONFIG_CSR */
1471 
1472 		/*
1473 		 * Devices without SIP/ROSP need to make sure that the shared cache
1474 		 * is either on the root volume or in the preboot cryptex volume.
1475 		 */
1476 		assert(rdir_vp != NULL);
1477 		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
1478 			vnode_t preboot_vp = NULL;
1479 #if XNU_TARGET_OS_OSX
1480 #define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
1481 #else
1482 #define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
1483 #endif
1484 			error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
1485 			if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
1486 				SHARED_REGION_TRACE_ERROR(
1487 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1488 					"not on process's root volume nor preboot volume\n",
1489 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1490 					proc_getpid(p), p->p_comm,
1491 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1492 					srfmp->vp->v_name));
1493 				error = EPERM;
1494 				if (preboot_vp) {
1495 					(void)vnode_put(preboot_vp);
1496 				}
1497 				goto done;
1498 			} else if (preboot_vp) {
1499 				(void)vnode_put(preboot_vp);
1500 			}
1501 		}
1502 #endif /* CONFIG_CSR */
1503 
1504 		if (scdir_enforce) {
1505 			char **expected_scdir_path;
1506 			struct vnode *scdir_vp = NULL;
1507 			for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1508 			    *expected_scdir_path != NULL;
1509 			    expected_scdir_path++) {
1510 				/* get vnode for expected_scdir_path */
1511 				error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
1512 				if (error) {
1513 					SHARED_REGION_TRACE_ERROR(
1514 						("shared_region: %p [%d(%s)]: "
1515 						"vnode_lookup(%s) failed (error=%d)\n",
1516 						(void *)VM_KERNEL_ADDRPERM(current_thread()),
1517 						proc_getpid(p), p->p_comm,
1518 						*expected_scdir_path, error));
1519 					continue;
1520 				}
1521 
1522 				/* check if parent is scdir_vp */
1523 				assert(scdir_vp != NULL);
1524 				if (vnode_parent(srfmp->vp) == scdir_vp) {
1525 					(void)vnode_put(scdir_vp);
1526 					scdir_vp = NULL;
1527 					goto scdir_ok;
1528 				}
1529 				(void)vnode_put(scdir_vp);
1530 				scdir_vp = NULL;
1531 			}
1532 			/* nothing matches */
1533 			SHARED_REGION_TRACE_ERROR(
1534 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1535 				"shared cache file not in expected directory\n",
1536 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1537 				proc_getpid(p), p->p_comm,
1538 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1539 				srfmp->vp->v_name));
1540 			error = EPERM;
1541 			goto done;
1542 		}
1543 scdir_ok:
1544 
1545 		/* get vnode size */
1546 		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
1547 		if (error) {
1548 			SHARED_REGION_TRACE_ERROR(
1549 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1550 				"vnode_size(%p) failed (error=%d)\n",
1551 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1552 				proc_getpid(p), p->p_comm,
1553 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1554 				srfmp->vp->v_name,
1555 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
1556 			goto done;
1557 		}
1558 		srfmp->file_size = fs;
1559 
1560 		/* get the file's memory object handle */
1561 		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
1562 		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
1563 			SHARED_REGION_TRACE_ERROR(
1564 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1565 				"no memory object\n",
1566 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1567 				proc_getpid(p), p->p_comm,
1568 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1569 				srfmp->vp->v_name));
1570 			error = EINVAL;
1571 			goto done;
1572 		}
1573 
1574 		/* check that the mappings are properly covered by code signatures */
1575 		if (!cs_system_enforcement()) {
1576 			/* code signing is not enforced: no need to check */
1577 		} else {
1578 			for (i = 0; i < srfmp->mappings_count; i++) {
1579 				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
1580 					/* zero-filled mapping: not backed by the file */
1581 					continue;
1582 				}
1583 				if (ubc_cs_is_range_codesigned(srfmp->vp,
1584 				    srfmp->mappings[i].sms_file_offset,
1585 				    srfmp->mappings[i].sms_size)) {
1586 					/* this mapping is fully covered by code signatures */
1587 					continue;
1588 				}
1589 				SHARED_REGION_TRACE_ERROR(
1590 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1591 					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
1592 					"is not code-signed\n",
1593 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1594 					proc_getpid(p), p->p_comm,
1595 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1596 					srfmp->vp->v_name,
1597 					i, srfmp->mappings_count,
1598 					srfmp->mappings[i].sms_address,
1599 					srfmp->mappings[i].sms_size,
1600 					srfmp->mappings[i].sms_file_offset,
1601 					srfmp->mappings[i].sms_max_prot,
1602 					srfmp->mappings[i].sms_init_prot));
1603 				error = EINVAL;
1604 				goto done;
1605 			}
1606 		}
1607 	}
1608 done:
1609 	if (error != 0) {
1610 		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
1611 		*sr_file_mappings = NULL;
1612 		*shared_region_ptr = NULL;
1613 	}
1614 	SHARED_REGION_TRACE_DEBUG(
1615 		("shared_region: %p [%d(%s)] map_and_slide_setup <- %d\n",
1616 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1617 		proc_getpid(p), p->p_comm, error));
1618 	return error;
1619 }
1620 
1621 /*
1622  * shared_region_map_np()
1623  *
1624  * This system call is intended for dyld.
1625  *
1626  * dyld uses this to map a shared cache file into a shared region.
1627  * This is usually done only the first time a shared cache is needed.
1628  * Subsequent processes will just use the populated shared region without
1629  * requiring any further setup.
1630  */
1631 static int
1632 _shared_region_map_and_slide(
1633 	struct proc                         *p,
1634 	uint32_t                            files_count,
1635 	struct shared_file_np               *files,
1636 	uint32_t                            mappings_count,
1637 	struct shared_file_mapping_slide_np *mappings)
1638 {
1639 	int                             error = 0;
1640 	kern_return_t                   kr = KERN_SUCCESS;
1641 	struct _sr_file_mappings        *sr_file_mappings = NULL;
1642 	struct vnode                    *rdir_vp = NULL;
1643 	struct vm_shared_region         *shared_region = NULL;
1644 
1645 	/*
1646 	 * Get a reference to the current proc's root dir.
1647 	 * Need this to prevent racing with chroot.
1648 	 */
1649 	proc_fdlock(p);
1650 	rdir_vp = p->p_fd.fd_rdir;
1651 	if (rdir_vp == NULL) {
1652 		rdir_vp = rootvnode;
1653 	}
1654 	assert(rdir_vp != NULL);
1655 	vnode_get(rdir_vp);
1656 	proc_fdunlock(p);
1657 
1658 	/*
1659 	 * Turn files, mappings into sr_file_mappings and other setup.
1660 	 */
1661 	error = shared_region_map_and_slide_setup(p, files_count,
1662 	    files, mappings_count, mappings,
1663 	    &sr_file_mappings, &shared_region, rdir_vp);
1664 	if (error != 0) {
1665 		vnode_put(rdir_vp);
1666 		return error;
1667 	}
1668 
1669 	/* map the file(s) into that shared region's submap */
1670 	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
1671 	if (kr != KERN_SUCCESS) {
1672 		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
1673 		    "vm_shared_region_map_file() failed kr=0x%x\n",
1674 		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1675 		    proc_getpid(p), p->p_comm, kr));
1676 	}
1677 
1678 	/* convert kern_return_t to errno */
1679 	switch (kr) {
1680 	case KERN_SUCCESS:
1681 		error = 0;
1682 		break;
1683 	case KERN_INVALID_ADDRESS:
1684 		error = EFAULT;
1685 		break;
1686 	case KERN_PROTECTION_FAILURE:
1687 		error = EPERM;
1688 		break;
1689 	case KERN_NO_SPACE:
1690 		error = ENOMEM;
1691 		break;
1692 	case KERN_FAILURE:
1693 	case KERN_INVALID_ARGUMENT:
1694 	default:
1695 		error = EINVAL;
1696 		break;
1697 	}
1698 
1699 	/*
1700 	 * Mark that this process is now using split libraries.
1701 	 */
1702 	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
1703 		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
1704 	}
1705 
1706 	vnode_put(rdir_vp);
1707 	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);
1708 
1709 	SHARED_REGION_TRACE_DEBUG(
1710 		("shared_region: %p [%d(%s)] <- map\n",
1711 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1712 		proc_getpid(p), p->p_comm));
1713 
1714 	return error;
1715 }
1716 
1717 /*
1718  * Clean up part of _shared_region_map_and_slide()
1719  * It had to be broken out of _shared_region_map_and_slide() to
1720  * prevent compiler inlining from blowing out the stack.
1721  */
1722 __attribute__((noinline))
1723 static void
1724 shared_region_map_and_slide_cleanup(
1725 	struct proc              *p,
1726 	uint32_t                 files_count,
1727 	struct _sr_file_mappings *sr_file_mappings,
1728 	struct vm_shared_region  *shared_region)
1729 {
1730 	struct _sr_file_mappings *srfmp;
1731 	struct vnode_attr        va;
1732 
1733 	if (sr_file_mappings != NULL) {
1734 		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1735 			if (srfmp->vp != NULL) {
1736 				vnode_lock_spin(srfmp->vp);
1737 				srfmp->vp->v_flag |= VSHARED_DYLD;
1738 				vnode_unlock(srfmp->vp);
1739 
1740 				/* update the vnode's access time */
1741 				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1742 					VATTR_INIT(&va);
1743 					nanotime(&va.va_access_time);
1744 					VATTR_SET_ACTIVE(&va, va_access_time);
1745 					vnode_setattr(srfmp->vp, &va, vfs_context_current());
1746 				}
1747 
1748 #if NAMEDSTREAMS
1749 				/*
1750 				 * If the shared cache is compressed, it may
1751 				 * have a namedstream vnode instantiated
1752 				 * for it. That namedstream vnode will also
1753 				 * have to be marked with VSHARED_DYLD.
1754 				 */
1755 				if (vnode_hasnamedstreams(srfmp->vp)) {
1756 					vnode_t svp;
1757 					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1758 					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
1759 						vnode_lock_spin(svp);
1760 						svp->v_flag |= VSHARED_DYLD;
1761 						vnode_unlock(svp);
1762 						vnode_put(svp);
1763 					}
1764 				}
1765 #endif /* NAMEDSTREAMS */
1766 				/*
1767 				 * release the vnode...
1768 				 * ubc_map() still holds it for us in the non-error case
1769 				 */
1770 				(void) vnode_put(srfmp->vp);
1771 				srfmp->vp = NULL;
1772 			}
1773 			if (srfmp->fp != NULL) {
1774 				/* release the file descriptor */
1775 				fp_drop(p, srfmp->fd, srfmp->fp, 0);
1776 				srfmp->fp = NULL;
1777 			}
1778 		}
1779 		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1780 	}
1781 
1782 	if (shared_region != NULL) {
1783 		vm_shared_region_deallocate(shared_region);
1784 	}
1785 }
1786 
1787 /*
1788  * For each file mapped, we may have mappings for:
1789  *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1790  * so let's round up to 8 mappings per file.
1791  */
1792 #define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to pass in */
1793 
1794 /*
1795  * This is the new interface for setting up shared region mappings.
1796  *
1797  * The slide used for shared regions setup using this interface is done differently
1798  * from the old interface. The slide value passed in the shared_files_np represents
1799  * a max value. The kernel will choose a random value based on that, then use it
1800  * for all shared regions.
1801  */
1802 #if defined (__x86_64__)
1803 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1804 #else
1805 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1806 #endif
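/*
 * Illustrative sketch (editorial, not part of this file): how the actual
 * slide is derived from the caller-supplied maximum, mirroring the logic
 * in shared_region_map_and_slide_2_np() below. On a 16K-page system
 * SLIDE_AMOUNT_MASK is ~0x3FFF, so the result is a page-aligned value in
 * [0, max_slide). "example_pick_slide" is a hypothetical name.
 */
#if 0 /* example only */
static uint32_t
example_pick_slide(uint32_t max_slide)
{
	uint32_t random_val;

	if (max_slide == 0) {
		return 0;       /* no slide requested */
	}
	/* pick a random value, reduce to the range, then force page alignment */
	read_random(&random_val, sizeof(random_val));
	return (random_val % max_slide) & SLIDE_AMOUNT_MASK;
}
#endif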
1807 
1808 static inline __result_use_check kern_return_t
1809 shared_region_map_and_slide_2_np_sanitize(
1810 	struct proc                         *p,
1811 	user_addr_t                         mappings_userspace_addr,
1812 	unsigned int                        count,
1813 	shared_file_mapping_slide_np_t      *mappings)
1814 {
1815 	kern_return_t kr;
1816 	vm_map_t map = current_map();
1817 	mach_vm_address_t addr, end;
1818 	mach_vm_offset_t offset, offset_end;
1819 	mach_vm_size_t size, offset_size;
1820 	user_addr_t slide_start, slide_end, slide_size;
1821 	vm_prot_t cur;
1822 	vm_prot_t max;
1823 
1824 	user_addr_t user_addr = mappings_userspace_addr;
1825 
1826 	for (size_t i = 0; i < count; i++) {
1827 		shared_file_mapping_slide_np_ut mapping_u;
1828 		/*
1829 		 * First we bring each mapping struct into our kernel stack to
1830 		 * avoid TOCTOU.
1831 		 */
1832 		kr = shared_region_copyin(
1833 			p,
1834 			user_addr,
1835 			1, // copy 1 element at a time
1836 			sizeof(shared_file_mapping_slide_np_ut),
1837 			&mapping_u);
1838 		if (__improbable(kr != KERN_SUCCESS)) {
1839 			return kr;
1840 		}
1841 
1842 		/*
1843 		 * Then, we sanitize the data on the kernel stack.
1844 		 */
1845 		kr = vm_sanitize_addr_size(
1846 			mapping_u.sms_address_u,
1847 			mapping_u.sms_size_u,
1848 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1849 			map,
1850 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1851 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
1852 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
1853 			&addr,
1854 			&end,
1855 			&size);
1856 		if (__improbable(kr != KERN_SUCCESS)) {
1857 			return kr;
1858 		}
1859 
1860 		kr = vm_sanitize_addr_size(
1861 			mapping_u.sms_file_offset_u,
1862 			mapping_u.sms_size_u,
1863 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1864 			PAGE_MASK,
1865 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1866 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1867 			&offset,
1868 			&offset_end,
1869 			&offset_size);
1870 		if (__improbable(kr != KERN_SUCCESS)) {
1871 			return kr;
1872 		}
1873 		if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
1874 			return KERN_INVALID_ARGUMENT;
1875 		}
1876 
1877 		/*
1878 		 * The unsafe access is immediately followed by a wrap to
1879 		 * convert the value from an address type to a size type.
1880 		 */
1881 		mach_vm_size_ut sms_slide_size_u =
1882 		    vm_sanitize_wrap_size(
1883 			VM_SANITIZE_UNSAFE_UNWRAP(
1884 				mapping_u.sms_slide_size_u));
1885 
1886 		kr = vm_sanitize_addr_size(
1887 			mapping_u.sms_slide_start_u,
1888 			sms_slide_size_u,
1889 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1890 			map,
1891 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1892 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1893 			&slide_start,
1894 			&slide_end,
1895 			&slide_size);
1896 		if (__improbable(kr != KERN_SUCCESS)) {
1897 			return kr;
1898 		}
1899 
1900 		kr = vm_sanitize_cur_and_max_prots(
1901 			mapping_u.sms_init_prot_u,
1902 			mapping_u.sms_max_prot_u,
1903 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1904 			map,
1905 			VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
1906 			&cur,
1907 			&max);
1908 		if (__improbable(kr != KERN_SUCCESS)) {
1909 			return kr;
1910 		}
1911 
1912 		/*
1913 		 * Finally, we move the data from the kernel stack to our
1914 		 * caller-allocated kernel heap buffer.
1915 		 */
1916 		mappings[i].sms_address = addr;
1917 		mappings[i].sms_size = size;
1918 		mappings[i].sms_file_offset = offset;
1919 		mappings[i].sms_slide_size = slide_size;
1920 		mappings[i].sms_slide_start = slide_start;
1921 		mappings[i].sms_max_prot = max;
1922 		mappings[i].sms_init_prot = cur;
1923 
1924 		if (__improbable(os_add_overflow(
1925 			    user_addr,
1926 			    sizeof(shared_file_mapping_slide_np_ut),
1927 			    &user_addr))) {
1928 			return KERN_INVALID_ARGUMENT;
1929 		}
1930 	}
1931 
1932 	return KERN_SUCCESS;
1933 }
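/*
 * Editorial sketch (not part of this file) of the copy-then-validate
 * pattern used above: each element is first copied into a kernel stack
 * local, so userspace cannot modify it between validation and use (no
 * TOCTOU window); only validated data is published to the kernel heap
 * buffer, and the user pointer advances with an explicit overflow check.
 * "struct elem" and "validate_elem" are hypothetical stand-ins for the
 * mapping struct and the vm_sanitize_* calls.
 */
#if 0 /* example only */
struct elem {
	uint64_t e_addr;
	uint64_t e_size;
};
static bool validate_elem(const struct elem *e);        /* hypothetical */

static kern_return_t
copyin_validated(user_addr_t uaddr, unsigned int count, struct elem *out)
{
	for (unsigned int i = 0; i < count; i++) {
		struct elem tmp;

		if (copyin(uaddr, &tmp, sizeof(tmp)) != 0) {
			return KERN_INVALID_ADDRESS;
		}
		if (!validate_elem(&tmp)) {
			return KERN_INVALID_ARGUMENT;
		}
		out[i] = tmp;   /* publish only validated data */
		if (os_add_overflow(uaddr, sizeof(tmp), &uaddr)) {
			return KERN_INVALID_ARGUMENT;
		}
	}
	return KERN_SUCCESS;
}
#endif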
1934 
1935 int
1936 shared_region_map_and_slide_2_np(
1937 	struct proc                                  *p,
1938 	struct shared_region_map_and_slide_2_np_args *uap,
1939 	__unused int                                 *retvalp)
1940 {
1941 	unsigned int                  files_count;
1942 	struct shared_file_np         *shared_files = NULL;
1943 	unsigned int                  mappings_count;
1944 	struct shared_file_mapping_slide_np *mappings = NULL;
1945 	kern_return_t                 kr = KERN_SUCCESS;
1946 
1947 	files_count = uap->files_count;
1948 	mappings_count = uap->mappings_count;
1949 
1950 	SHARED_REGION_TRACE_DEBUG(
1951 		("shared_region: %p [%d(%s)] -> map_and_slide(0x%llx)\n",
1952 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1953 		proc_getpid(p), p->p_comm,
1954 		(uint64_t)uap->mappings_u));
1955 
1956 	if (files_count == 0) {
1957 		SHARED_REGION_TRACE_INFO(
1958 			("shared_region: %p [%d(%s)] map(): "
1959 			"no files\n",
1960 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1961 			proc_getpid(p), p->p_comm));
1962 		kr = 0; /* no files to map: we're done! */
1963 		goto done;
1964 	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1965 		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1966 		if (shared_files == NULL) {
1967 			kr = KERN_RESOURCE_SHORTAGE;
1968 			goto done;
1969 		}
1970 	} else {
1971 		SHARED_REGION_TRACE_ERROR(
1972 			("shared_region: %p [%d(%s)] map(): "
1973 			"too many files (%d) max %d\n",
1974 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1975 			proc_getpid(p), p->p_comm,
1976 			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1977 		kr = KERN_FAILURE;
1978 		goto done;
1979 	}
1980 
1981 	if (mappings_count == 0) {
1982 		SHARED_REGION_TRACE_INFO(
1983 			("shared_region: %p [%d(%s)] map(): "
1984 			"no mappings\n",
1985 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1986 			proc_getpid(p), p->p_comm));
1987 		kr = 0; /* no mappings: we're done! */
1988 		goto done;
1989 	} else if (mappings_count <= SFM_MAX) {
1990 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
1991 		if (mappings == NULL) {
1992 			kr = KERN_RESOURCE_SHORTAGE;
1993 			goto done;
1994 		}
1995 	} else {
1996 		SHARED_REGION_TRACE_ERROR(
1997 			("shared_region: %p [%d(%s)] map(): "
1998 			"too many mappings (%d) max %d\n",
1999 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2000 			proc_getpid(p), p->p_comm,
2001 			mappings_count, SFM_MAX));
2002 		kr = KERN_FAILURE;
2003 		goto done;
2004 	}
2005 
2006 	/*
2007 	 * struct shared_file_np does not have fields that are subject to
2008 	 * sanitization, so it is copied in from userspace as-is.
2009 	 */
2010 	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
2011 	if (kr != KERN_SUCCESS) {
2012 		SHARED_REGION_TRACE_ERROR(
2013 			("shared_region: %p [%d(%s)] copyin() returned 0x%x\n",
2014 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2015 			proc_getpid(p), p->p_comm, kr));
2016 		goto done;
2017 	}
2018 
2019 	kr = shared_region_map_and_slide_2_np_sanitize(
2020 		p,
2021 		uap->mappings_u,
2022 		mappings_count,
2023 		mappings);
2024 	if (__improbable(kr != KERN_SUCCESS)) {
2025 		SHARED_REGION_TRACE_ERROR(
2026 			("shared_region: %p [%d(%s)] sanitize() returned 0x%x\n",
2027 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2028 			proc_getpid(p), p->p_comm, kr));
2029 		kr = vm_sanitize_get_kr(kr);
2030 		goto done;
2031 	}
2032 
2033 	uint32_t max_slide = shared_files[0].sf_slide;
2034 	uint32_t random_val;
2035 	uint32_t slide_amount;
2036 
2037 	if (max_slide != 0) {
2038 		read_random(&random_val, sizeof random_val);
2039 		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
2040 	} else {
2041 		slide_amount = 0;
2042 	}
2043 #if DEVELOPMENT || DEBUG
2044 	extern bool bootarg_disable_aslr;
2045 	if (bootarg_disable_aslr) {
2046 		slide_amount = 0;
2047 	}
2048 #endif /* DEVELOPMENT || DEBUG */
2049 
2050 	/*
2051 	 * Fix up the mappings to reflect the desired slide.
2052 	 */
2053 	unsigned int f;
2054 	unsigned int m = 0;
2055 	unsigned int i;
2056 	for (f = 0; f < files_count; ++f) {
2057 		shared_files[f].sf_slide = slide_amount;
2058 		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
2059 			if (m >= mappings_count) {
2060 				SHARED_REGION_TRACE_ERROR(
2061 					("shared_region: %p [%d(%s)] map(): "
2062 					"mapping count argument was too small\n",
2063 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
2064 					proc_getpid(p), p->p_comm));
2065 				kr = KERN_FAILURE;
2066 				goto done;
2067 			}
2068 			if (__improbable(
2069 				    os_add_overflow(
2070 					    mappings[m].sms_address,
2071 					    slide_amount,
2072 					    &mappings[m].sms_address))) {
2073 				kr = KERN_INVALID_ARGUMENT;
2074 				goto done;
2075 			}
2076 			if (mappings[m].sms_slide_size != 0) {
2077 				mach_vm_address_t discard;
2078 				/* Slide and check that new start/size pairs do not overflow. */
2079 				if (__improbable(
2080 					    os_add_overflow(
2081 						    mappings[m].sms_slide_start,
2082 						    slide_amount,
2083 						    &mappings[m].sms_slide_start) ||
2084 					    os_add_overflow(
2085 						    mappings[m].sms_slide_start,
2086 						    mappings[m].sms_slide_size,
2087 						    &discard))) {
2088 					kr = KERN_INVALID_ARGUMENT;
2089 					goto done;
2090 				}
2091 			}
2092 		}
2093 	}
2094 
2095 	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
2096 done:
2097 	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
2098 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
2099 
2100 	SHARED_REGION_TRACE_DEBUG(
2101 		("shared_region: %p [%d(%s)] map_and_slide(0x%llx) <- 0x%x\n",
2102 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
2103 		proc_getpid(p), p->p_comm,
2104 		(uint64_t)uap->mappings_u, kr));
2105 
2106 	return kr;
2107 }
2108 
2109 /*
2110  * A syscall for dyld to use to map data pages that need load time relocation fixups.
2111  * The fixups are performed by a custom pager during page-in, so the pages still appear
2112  * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2113  * on demand later, all w/o using the compressor.
2114  *
2115  * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages while
2116  * running, they are COW'd as normal.
2117  */
2118 int
2119 map_with_linking_np(
2120 	struct proc                     *p,
2121 	struct map_with_linking_np_args *uap,
2122 	__unused int                    *retvalp)
2123 {
2124 	uint32_t                        region_count;
2125 	uint32_t                        r;
2126 	struct mwl_region               *regions = NULL;
2127 	struct mwl_region               *rp;
2128 	uint32_t                        link_info_size;
2129 	void                            *link_info = NULL;      /* starts with a struct mwl_info_hdr */
2130 	struct mwl_info_hdr             *info_hdr = NULL;
2131 	uint64_t                        binds_size;
2132 	int                             fd;
2133 	struct fileproc                 *fp = NULL;
2134 	struct vnode                    *vp = NULL;
2135 	size_t                          file_size;
2136 	off_t                           fs;
2137 	struct vnode_attr               va;
2138 	memory_object_control_t         file_control = NULL;
2139 	int                             error;
2140 	kern_return_t                   kr = KERN_SUCCESS;
2141 
2142 	/*
2143 	 * Check if dyld has told us it finished with this call.
2144 	 */
2145 	if (p->p_disallow_map_with_linking) {
2146 		printf("%s: [%d(%s)]: map_with_linking_np() was disabled\n",
2147 		    __func__, proc_getpid(p), p->p_comm);
2148 		kr = KERN_FAILURE;
2149 		goto done;
2150 	}
2151 
2152 	/*
2153 	 * First we do some sanity checking on what dyld has passed us.
2154 	 */
2155 	region_count = uap->region_count;
2156 	link_info_size = uap->link_info_size;
2157 	if (region_count == 0) {
2158 		printf("%s: [%d(%s)]: region_count == 0\n",
2159 		    __func__, proc_getpid(p), p->p_comm);
2160 		kr = KERN_FAILURE;
2161 		goto done;
2162 	}
2163 	if (region_count > MWL_MAX_REGION_COUNT) {
2164 		printf("%s: [%d(%s)]: region_count too big %d\n",
2165 		    __func__, proc_getpid(p), p->p_comm, region_count);
2166 		kr = KERN_FAILURE;
2167 		goto done;
2168 	}
2169 
2170 	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2171 		printf("%s: [%d(%s)]: link_info_size too small\n",
2172 		    __func__, proc_getpid(p), p->p_comm);
2173 		kr = KERN_FAILURE;
2174 		goto done;
2175 	}
2176 	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2177 		printf("%s: [%d(%s)]: link_info_size too big %d\n",
2178 		    __func__, proc_getpid(p), p->p_comm, link_info_size);
2179 		kr = KERN_FAILURE;
2180 		goto done;
2181 	}
2182 
2183 	/*
2184 	 * Allocate and copyin the regions and link info
2185 	 */
2186 	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2187 	if (regions == NULL) {
2188 		printf("%s: [%d(%s)]: failed to allocate regions\n",
2189 		    __func__, proc_getpid(p), p->p_comm);
2190 		kr = KERN_RESOURCE_SHORTAGE;
2191 		goto done;
2192 	}
2193 	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2194 	if (kr != KERN_SUCCESS) {
2195 		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2196 		    __func__, proc_getpid(p), p->p_comm, kr);
2197 		goto done;
2198 	}
2199 
2200 	link_info = kalloc_data(link_info_size, Z_WAITOK);
2201 	if (link_info == NULL) {
2202 		printf("%s: [%d(%s)]: failed to allocate link_info\n",
2203 		    __func__, proc_getpid(p), p->p_comm);
2204 		kr = KERN_RESOURCE_SHORTAGE;
2205 		goto done;
2206 	}
2207 	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2208 	if (kr != KERN_SUCCESS) {
2209 		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2210 		    __func__, proc_getpid(p), p->p_comm, kr);
2211 		goto done;
2212 	}
2213 
2214 	/*
2215 	 * Do some verification of the data structures.
2216 	 */
2217 	info_hdr = (struct mwl_info_hdr *)link_info;
2218 	if (info_hdr->mwli_version != MWL_INFO_VERS) {
2219 		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2220 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2221 		kr = KERN_FAILURE;
2222 		goto done;
2223 	}
2224 
2225 	if (info_hdr->mwli_binds_offset > link_info_size) {
2226 		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2227 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2228 		kr = KERN_FAILURE;
2229 		goto done;
2230 	}
2231 
2232 	/* some older devices have s/w page size > h/w page size; no need to support them */
2233 	if (info_hdr->mwli_page_size != PAGE_SIZE) {
2234 		/* no printf, since this is expected on some devices */
2235 		kr = KERN_INVALID_ARGUMENT;
2236 		goto done;
2237 	}
2238 
2239 	binds_size = (uint64_t)info_hdr->mwli_binds_count *
2240 	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2241 	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2242 		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2243 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2244 		kr = KERN_FAILURE;
2245 		goto done;
2246 	}
2247 
2248 	if (info_hdr->mwli_chains_offset > link_info_size) {
2249 		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2250 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2251 		kr = KERN_FAILURE;
2252 		goto done;
2253 	}
2254 
2255 
2256 	/*
2257 	 * Ensure the chained starts struct fits in the link info and make sure the
2258 	 * segment info offsets are within bounds.
2259 	 */
2260 	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2261 		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2262 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2263 		kr = KERN_FAILURE;
2264 		goto done;
2265 	}
2266 	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2267 		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2268 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2269 		kr = KERN_FAILURE;
2270 		goto done;
2271 	}
2272 
2273 	/* Note that more verification of offsets is done in the pager itself */
2274 
2275 	/*
2276 	 * Ensure we've only been given one FD and verify valid protections.
2277 	 */
2278 	fd = regions[0].mwlr_fd;
2279 	for (r = 0; r < region_count; ++r) {
2280 		if (regions[r].mwlr_fd != fd) {
2281 			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2282 			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2283 			kr = KERN_FAILURE;
2284 			goto done;
2285 		}
2286 
2287 		/*
2288 		 * Only allow data mappings and not zero fill. Permit TPRO
2289 		 * mappings only when VM_PROT_READ | VM_PROT_WRITE.
2290 		 */
2291 		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2292 			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2293 			    __func__, proc_getpid(p), p->p_comm);
2294 			kr = KERN_FAILURE;
2295 			goto done;
2296 		}
2297 		if (regions[r].mwlr_protections & VM_PROT_ZF) {
2298 			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2299 			    __func__, proc_getpid(p), p->p_comm, r);
2300 			kr = KERN_FAILURE;
2301 			goto done;
2302 		}
2303 		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2304 		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2305 			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2306 			    __func__, proc_getpid(p), p->p_comm, r);
2307 			kr = KERN_FAILURE;
2308 			goto done;
2309 		}
2310 	}
2311 
2312 
2313 	/* get file structure from file descriptor */
2314 	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2315 	if (error) {
2316 		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2317 		    __func__, proc_getpid(p), p->p_comm, error);
2318 		kr = KERN_FAILURE;
2319 		goto done;
2320 	}
2321 
2322 	/* We need at least read permission on the file */
2323 	if (!(fp->fp_glob->fg_flag & FREAD)) {
2324 		printf("%s: [%d(%s)]: not readable\n",
2325 		    __func__, proc_getpid(p), p->p_comm);
2326 		kr = KERN_FAILURE;
2327 		goto done;
2328 	}
2329 
2330 	/* Get the vnode from file structure */
2331 	vp = (struct vnode *)fp_get_data(fp);
2332 	error = vnode_getwithref(vp);
2333 	if (error) {
2334 		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2335 		    __func__, proc_getpid(p), p->p_comm, error);
2336 		kr = KERN_FAILURE;
2337 		vp = NULL; /* just to be sure */
2338 		goto done;
2339 	}
2340 
2341 	/* Make sure the vnode is a regular file */
2342 	if (vp->v_type != VREG) {
2343 		printf("%s: [%d(%s)]: vnode not VREG\n",
2344 		    __func__, proc_getpid(p), p->p_comm);
2345 		kr = KERN_FAILURE;
2346 		goto done;
2347 	}
2348 
2349 	/* get vnode size */
2350 	error = vnode_size(vp, &fs, vfs_context_current());
2351 	if (error) {
2352 		goto done;
2353 	}
2354 	file_size = fs;
2355 
2356 	/* get the file's memory object handle */
2357 	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2358 	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2359 		printf("%s: [%d(%s)]: no memory object\n",
2360 		    __func__, proc_getpid(p), p->p_comm);
2361 		kr = KERN_FAILURE;
2362 		goto done;
2363 	}
2364 
2365 	for (r = 0; r < region_count; ++r) {
2366 		rp = &regions[r];
2367 
2368 #if CONFIG_MACF
2369 		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2370 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2371 		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2372 		if (error) {
2373 			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2374 			    __func__, proc_getpid(p), p->p_comm, r, error);
2375 			kr = KERN_FAILURE;
2376 			goto done;
2377 		}
2378 #endif /* MAC */
2379 
2380 		/* check that the mappings are properly covered by code signatures */
2381 		if (cs_system_enforcement()) {
2382 			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2383 				printf("%s: [%d(%s)]: region %d, not code signed\n",
2384 				    __func__, proc_getpid(p), p->p_comm, r);
2385 				kr = KERN_FAILURE;
2386 				goto done;
2387 			}
2388 		}
2389 	}
2390 
2391 	/* update the vnode's access time */
2392 	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2393 		VATTR_INIT(&va);
2394 		nanotime(&va.va_access_time);
2395 		VATTR_SET_ACTIVE(&va, va_access_time);
2396 		vnode_setattr(vp, &va, vfs_context_current());
2397 	}
2398 
2399 	/* get the VM to do the work */
2400 	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2401 
2402 done:
2403 	if (fp != NULL) {
2404 		/* release the file descriptor */
2405 		fp_drop(p, fd, fp, 0);
2406 	}
2407 	if (vp != NULL) {
2408 		(void)vnode_put(vp);
2409 	}
2410 	if (regions != NULL) {
2411 		kfree_data(regions, region_count * sizeof(regions[0]));
2412 	}
2413 	/* link_info is set to NULL if the pager took ownership of it, i.e. if things worked */
2414 	if (link_info != NULL) {
2415 		kfree_data(link_info, link_info_size);
2416 	}
2417 
2418 	switch (kr) {
2419 	case KERN_SUCCESS:
2420 		return 0;
2421 	case KERN_RESOURCE_SHORTAGE:
2422 		return ENOMEM;
2423 	default:
2424 		return EINVAL;
2425 	}
2426 }
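/*
 * Editorial note on the bounds checks in map_with_linking_np() above:
 * each offset is first validated against link_info_size, which makes the
 * later size checks safe to write in the overflow-free form
 * "size > total - offset" rather than "offset + size > total" (which
 * could wrap). A minimal sketch of the idiom, with a hypothetical
 * "range_fits" helper:
 */
#if 0 /* example only */
static bool
range_fits(uint32_t offset, uint64_t size, uint32_t total)
{
	if (offset > total) {
		return false;   /* required before the subtraction below */
	}
	return size <= (uint64_t)total - offset;
}
#endif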
2427 
2428 #if DEBUG || DEVELOPMENT
2429 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2430     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2431 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2432     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2433 #endif /* DEBUG || DEVELOPMENT */
2434 
2435 /* sysctl overflow room */
2436 
2437 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2438     (int *) &page_size, 0, "vm page size");
2439 
2440 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2441  *       allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2442  *       reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2443 extern unsigned int     vm_page_free_target;
2444 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2445     &vm_page_free_target, 0, "Pageout daemon free target");
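/*
 * Editorial sketch (userspace, not part of this file): an application can
 * combine vm.vm_page_free_target with vm.page_free_count (exported later
 * in this file) to estimate how many pages are free beyond the pageout
 * daemon's target. "pages_free_above_target" is a hypothetical name.
 */
#if 0 /* example only */
#include <stdint.h>
#include <sys/sysctl.h>

static int64_t
pages_free_above_target(void)
{
	unsigned int free_count = 0, free_target = 0;
	size_t len = sizeof(free_count);

	if (sysctlbyname("vm.page_free_count", &free_count, &len, NULL, 0) != 0) {
		return -1;
	}
	len = sizeof(free_target);
	if (sysctlbyname("vm.vm_page_free_target", &free_target, &len, NULL, 0) != 0) {
		return -1;
	}
	/* may be negative when the system is below its free target */
	return (int64_t)free_count - (int64_t)free_target;
}
#endif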
2446 
2447 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2448     &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2449 
2450 static int
2451 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2452 {
2453 #pragma unused(oidp, arg1, arg2)
2454 	unsigned int page_free_wanted;
2455 
2456 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2457 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2458 }
2459 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2460     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2461     0, 0, vm_ctl_page_free_wanted, "I", "");
2462 
2463 extern unsigned int     vm_page_purgeable_count;
2464 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2465     &vm_page_purgeable_count, 0, "Purgeable page count");
2466 
2467 extern unsigned int     vm_page_purgeable_wired_count;
2468 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2469     &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2470 
2471 extern unsigned int vm_page_kern_lpage_count;
2472 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2473     &vm_page_kern_lpage_count, 0, "kernel used large pages");
2474 
2475 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2476 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2477 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_kern);
2478 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_kern, vm_page_grab_count_kern, "Total pages grabbed (kernel)");
2479 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_iopl);
2480 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_iopl, vm_page_grab_count_iopl, "Total pages grabbed (iopl)");
2481 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_upl);
2482 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_upl, vm_page_grab_count_upl, "Total pages grabbed (upl)");
2483 
2484 
2485 #if DEVELOPMENT || DEBUG
2486 SCALABLE_COUNTER_DECLARE(vm_page_deactivate_behind_count);
2487 SYSCTL_SCALABLE_COUNTER(_vm, pages_deactivated_behind, vm_page_deactivate_behind_count,
2488     "Number of pages deactivated behind");
2489 #endif
2490 
2491 #if DEVELOPMENT || DEBUG
2492 #if __ARM_MIXED_PAGE_SIZE__
2493 static int vm_mixed_pagesize_supported = 1;
2494 #else
2495 static int vm_mixed_pagesize_supported = 0;
2496 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2497 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2498     &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2499 
2500 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2501     &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2502 
2503 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2504     &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2505 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2506     &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2507 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2508     &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2509 
2510 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2511     &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2512 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2513     &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2514 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2515     &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated) */
2516 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2517     &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2518 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2519     &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2520 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2521     &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");         /* sum of next two */
2522 #endif /* DEVELOPMENT || DEBUG */
2523 
2524 extern int madvise_free_debug;
2525 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2526     &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2527 extern int madvise_free_debug_sometimes;
2528 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2529     &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
2530 
2531 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2532     &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2533 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2534     &vm_page_stats_reusable.reusable_pages_success, "");
2535 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2536     &vm_page_stats_reusable.reusable_pages_failure, "");
2537 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2538     &vm_page_stats_reusable.reusable_pages_shared, "");
2539 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2540     &vm_page_stats_reusable.all_reusable_calls, "");
2541 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2542     &vm_page_stats_reusable.partial_reusable_calls, "");
2543 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2544     &vm_page_stats_reusable.reuse_pages_success, "");
2545 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2546     &vm_page_stats_reusable.reuse_pages_failure, "");
2547 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2548     &vm_page_stats_reusable.all_reuse_calls, "");
2549 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2550     &vm_page_stats_reusable.partial_reuse_calls, "");
2551 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2552     &vm_page_stats_reusable.can_reuse_success, "");
2553 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2554     &vm_page_stats_reusable.can_reuse_failure, "");
2555 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2556     &vm_page_stats_reusable.reusable_reclaimed, "");
2557 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2558     &vm_page_stats_reusable.reusable_nonwritable, "");
2559 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2560     &vm_page_stats_reusable.reusable_shared, "");
2561 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2562     &vm_page_stats_reusable.free_shared, "");
2563 
2564 
2565 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2566 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2567 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2568 
2569 extern unsigned int vm_page_cleaned_count;
2570 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2571 
2572 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2573 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2574 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2575 
2576 /* pageout counts */
2577 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2578 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2579 
2580 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2581 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2582 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2583 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2584 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2585 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2586 
2587 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2588 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2589 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2590 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2591 extern unsigned int vm_page_realtime_count;
2592 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2593 extern int vm_pageout_protect_realtime;
2594 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2595 
2596 /* counts of pages prefaulted when entering a memory object */
2597 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2598 extern int64_t vm_prefault_nb_no_page, vm_prefault_nb_wrong_page;
2599 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2600 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2601 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_no_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_no_page, "");
2602 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_wrong_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_wrong_page, "");
2603 
2604 #if defined (__x86_64__)
2605 extern unsigned int vm_clump_promote_threshold;
2606 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2607 #if DEVELOPMENT || DEBUG
2608 extern unsigned long vm_clump_stats[];
2609 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2610 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2611 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2612 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2613 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2614 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2615 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2616 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2617 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2618 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2619 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2620 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2621 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2622 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2623 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2624 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2625 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2626 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2627 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2628 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2629 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2630 #endif  /* if DEVELOPMENT || DEBUG */
2631 #endif  /* #if defined (__x86_64__) */
2632 
2633 #if CONFIG_SECLUDED_MEMORY
2634 
2635 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2636 extern unsigned int vm_page_secluded_target;
2637 extern unsigned int vm_page_secluded_count;
2638 extern unsigned int vm_page_secluded_count_free;
2639 extern unsigned int vm_page_secluded_count_inuse;
2640 extern unsigned int vm_page_secluded_count_over_target;
2641 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2642 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2643 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2644 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2645 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2646 
2647 extern struct vm_page_secluded_data vm_page_secluded;
2648 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2649 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2650 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2651 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2652 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2653 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2654 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2655 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2656 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2657 
2658 #endif /* CONFIG_SECLUDED_MEMORY */
2659 
2660 #if CONFIG_DEFERRED_RECLAIM
2661 #pragma mark Deferred Reclaim
2662 SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
2663 #if DEVELOPMENT || DEBUG
2664 /*
2665  * VM reclaim testing
2666  */
2667 extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);
2668 
2669 static int
2670 sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
2671 {
2672 	int error = EINVAL, pid = 0;
2673 	/*
2674 	 * Only act on a write
2675 	 */
2676 	error = sysctl_handle_int(oidp, &pid, 0, req);
2677 	if (error || !req->newptr) {
2678 		return error;
2679 	}
2680 	if (pid <= 0) {
2681 		return EINVAL;
2682 	}
2683 	proc_t p = proc_find(pid);
2684 	if (p == PROC_NULL) {
2685 		return ESRCH;
2686 	}
2687 	task_t t = proc_task(p);
2688 	if (t == TASK_NULL) {
2689 		proc_rele(p);
2690 		return ESRCH;
2691 	}
2692 	task_reference(t);
2693 	proc_rele(p);
2694 
2695 	bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
2696 	if (success) {
2697 		error = 0;
2698 	}
2699 	task_deallocate(t);
2700 
2701 	return error;
2702 }
2703 
2704 SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
2705     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2706     &sysctl_vm_reclaim_wait_for_pid, "I",
2707     "Block until the given pid has been drained by kernel GC");
2708 
2709 static int
2710 sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
2711 {
2712 	int error = EINVAL;
2713 	kern_return_t kr;
2714 	pid_t pid;
2715 	error = sysctl_handle_int(oidp, &pid, 0, req);
2716 	/* Only reclaim on write */
2717 	if (error || !req->newptr) {
2718 		return error;
2719 	}
2720 	if (pid <= 0) {
2721 		return EINVAL;
2722 	}
2723 	proc_t p = proc_find(pid);
2724 	if (p == PROC_NULL) {
2725 		return ESRCH;
2726 	}
2727 	task_t t = proc_task(p);
2728 	if (t == TASK_NULL) {
2729 		proc_rele(p);
2730 		return ESRCH;
2731 	}
2732 	task_reference(t);
2733 	proc_rele(p);
2734 	kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
2735 	task_deallocate(t);
2736 	return mach_to_bsd_errno(kr);
2737 }
2738 
2739 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
2740     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2741     &sysctl_vm_reclaim_drain_pid, "I",
2742     "Drain the deferred reclamation buffer for a pid");
2743 
2744 static int
2745 proc_filter_reclaimable(proc_t p, __unused void *arg)
2746 {
2747 	task_t task = proc_task(p);
2748 	return vm_deferred_reclamation_task_has_ring(task);
2749 }
2750 
2751 static int
2752 proc_reclaim_drain(proc_t p, __unused void *arg)
2753 {
2754 	kern_return_t kr;
2755 	task_t task = proc_task(p);
2756 	kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
2757 	return mach_to_bsd_errno(kr);
2758 }
2759 
2760 static int
2761 sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
2762 {
2763 	int error;
2764 	int val;
2765 	if (!req->newptr) {
2766 		return EINVAL;
2767 	}
2768 	error = sysctl_handle_int(oidp, &val, 0, req);
2769 	if (error || val == FALSE) {
2770 		return error;
2771 	}
2772 	proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
2773 	    proc_filter_reclaimable, NULL);
2774 	return 0;
2775 }
2776 
2777 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
2778     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2779     &sysctl_vm_reclaim_drain_all, "I",
2780     "Fully reclaim from every deferred reclamation buffer on the system");
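/*
 * Editorial usage note (DEVELOPMENT || DEBUG kernels only): the three
 * handlers above act on the value written, e.g. from a root shell:
 *
 *	sysctl -w vm.reclaim.drain_pid=123      # drain one task's buffer
 *	sysctl -w vm.reclaim.drain_all=1        # drain every buffer
 *	sysctl -w vm.reclaim.wait_for_pid=123   # block until GC reclaims it
 */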
2781 
2782 extern uint32_t vm_reclaim_buffer_count;
2783 extern uint64_t vm_reclaim_gc_epoch;
2784 extern uint64_t vm_reclaim_gc_reclaim_count;
2785 extern uint64_t vm_reclaim_sampling_period_abs;
2786 extern uint64_t vm_reclaim_sampling_period_ns;
2787 extern bool vm_reclaim_debug;
2788 #if XNU_TARGET_OS_IOS
2789 extern uint64_t vm_reclaim_max_threshold;
2790 #else /* !XNU_TARGET_OS_IOS */
2791 extern bool vm_reclaim_enabled;
2792 extern uint32_t vm_reclaim_autotrim_pct_normal;
2793 extern uint32_t vm_reclaim_autotrim_pct_pressure;
2794 extern uint32_t vm_reclaim_autotrim_pct_critical;
2795 extern uint32_t vm_reclaim_wma_weight_base;
2796 extern uint32_t vm_reclaim_wma_weight_cur;
2797 extern uint32_t vm_reclaim_wma_denom;
2798 extern uint64_t vm_reclaim_abandonment_threshold;
2799 #endif /* XNU_TARGET_OS_IOS */
2800 
2801 SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
2802     CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
2803     "The number of deferred memory buffers currently alive");
2804 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
2805     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
2806     "Number of times the global GC thread has run");
2807 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
2808     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
2809     "Number of times the global GC thread has reclaimed from a buffer");
2810 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
2811     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
2812     "Debug logs for vm.reclaim");
2813 #if XNU_TARGET_OS_IOS
2814 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, max_threshold,
2815     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_max_threshold,
2816     "Maximum amount of virtual memory (in B) that may be deferred without "
2817     "synchronous reclamation");
2818 #else /* !XNU_TARGET_OS_IOS */
2819 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
2820     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
2821     "Whether deferred memory reclamation is enabled on this system");
2822 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
2823     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
2824     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2825     "to engage auto-trim when the system is operating normally");
2826 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
2827     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
2828     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2829     "to engage auto-trim when the system is under memory pressure");
2830 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
2831     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
2832     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2833     "to engage auto-trim when the system is under critical memory pressure");
2834 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
2835     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
2836     "Weight applied to historical minimum buffer size samples");
2837 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
2838     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
2839     "Weight applied to current sampled minimum buffer size");
2840 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
2841     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
2842     "Denominator for weighted moving average calculation");
2843 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
2844     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
2845     "The number of sampling periods between accounting updates that may elapse "
2846     "before the buffer is considered \"abandoned\"");
2847 #endif /* XNU_TARGET_OS_IOS */
2848 
2849 static int
2850 sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
2851 {
2852 	uint64_t new_val_ns;
2853 	uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
2854 	int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
2855 	    sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
2856 	if (err || !req->newptr) {
2857 		return err;
2858 	}
2859 	if (new_val_ns != old_val_ns) {
2860 		vm_reclaim_sampling_period_ns = new_val_ns;
2861 		nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
2862 	}
2863 	return 0;
2864 }
2865 
2866 SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
2867     CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "QU",
2868     "Interval (nanoseconds) at which to sample the minimum buffer size and "
2869     "consider trimming excess");
2870 #endif /* DEVELOPMENT || DEBUG */
2871 #endif /* CONFIG_DEFERRED_RECLAIM */
2872 
2873 #include <kern/thread.h>
2874 #include <sys/user.h>
2875 
2876 void vm_pageout_io_throttle(void);
2877 
2878 void
2879 vm_pageout_io_throttle(void)
2880 {
2881 	struct uthread *uthread = current_uthread();
2882 
2883 	/*
2884 	 * The thread is marked as a low-priority I/O type
2885 	 * and the I/O we issued while in this cleaning operation
2886 	 * collided with normal I/O operations... we'll
2887 	 * delay in order to mitigate the impact of this
2888 	 * task on the normal operation of the system.
2889 	 */
2890 
2891 	if (uthread->uu_lowpri_window) {
2892 		throttle_lowpri_io(1);
2893 	}
2894 }
2895 
2896 int
2897 vm_pressure_monitor(
2898 	__unused struct proc *p,
2899 	struct vm_pressure_monitor_args *uap,
2900 	int *retval)
2901 {
2902 	kern_return_t   kr;
2903 	uint32_t        pages_reclaimed;
2904 	uint32_t        pages_wanted;
2905 
2906 	kr = mach_vm_pressure_monitor(
2907 		(boolean_t) uap->wait_for_pressure,
2908 		uap->nsecs_monitored,
2909 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2910 		&pages_wanted);
2911 
2912 	switch (kr) {
2913 	case KERN_SUCCESS:
2914 		break;
2915 	case KERN_ABORTED:
2916 		return EINTR;
2917 	default:
2918 		return EINVAL;
2919 	}
2920 
2921 	if (uap->pages_reclaimed) {
2922 		if (copyout((void *)&pages_reclaimed,
2923 		    uap->pages_reclaimed,
2924 		    sizeof(pages_reclaimed)) != 0) {
2925 			return EFAULT;
2926 		}
2927 	}
2928 
2929 	*retval = (int) pages_wanted;
2930 	return 0;
2931 }
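
/*
 * Usage sketch (userspace, illustrative only): the matching libsyscall stub
 * is assumed to have the shape
 *	int vm_pressure_monitor(int wait_for_pressure, int nsecs_monitored,
 *	    uint32_t *pages_reclaimed);
 * returning the kernel's pages_wanted value on success, e.g.
 *
 *	uint32_t reclaimed = 0;
 *	int wanted = vm_pressure_monitor(1, 0, &reclaimed);
 *	if (wanted < 0) {
 *		perror("vm_pressure_monitor");	// EINTR if the wait was aborted
 *	}
 */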
2932 
2933 int
2934 kas_info(struct proc *p,
2935     struct kas_info_args *uap,
2936     int *retval __unused)
2937 {
2938 #ifndef CONFIG_KAS_INFO
2939 	(void)p;
2940 	(void)uap;
2941 	return ENOTSUP;
2942 #else /* CONFIG_KAS_INFO */
2943 	int                     selector = uap->selector;
2944 	user_addr_t     valuep = uap->value;
2945 	user_addr_t     sizep = uap->size;
2946 	user_size_t size, rsize;
2947 	int                     error;
2948 
2949 	if (!kauth_cred_issuser(kauth_cred_get())) {
2950 		return EPERM;
2951 	}
2952 
2953 #if CONFIG_MACF
2954 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
2955 	if (error) {
2956 		return error;
2957 	}
2958 #endif
2959 
2960 	if (IS_64BIT_PROCESS(p)) {
2961 		user64_size_t size64;
2962 		error = copyin(sizep, &size64, sizeof(size64));
2963 		size = (user_size_t)size64;
2964 	} else {
2965 		user32_size_t size32;
2966 		error = copyin(sizep, &size32, sizeof(size32));
2967 		size = (user_size_t)size32;
2968 	}
2969 	if (error) {
2970 		return error;
2971 	}
2972 
2973 	switch (selector) {
2974 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2975 	{
2976 		uint64_t slide = vm_kernel_slide;
2977 
2978 		if (sizeof(slide) != size) {
2979 			return EINVAL;
2980 		}
2981 
2982 		error = copyout(&slide, valuep, sizeof(slide));
2983 		if (error) {
2984 			return error;
2985 		}
2986 		rsize = size;
2987 	}
2988 	break;
2989 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2990 	{
2991 		uint32_t i;
2992 		kernel_mach_header_t *mh = &_mh_execute_header;
2993 		struct load_command *cmd;
2994 		cmd = (struct load_command*) &mh[1];
2995 		uint64_t *bases;
2996 		rsize = mh->ncmds * sizeof(uint64_t);
2997 
2998 		/*
2999 		 * Return the size if no data was passed
3000 		 */
3001 		if (valuep == 0) {
3002 			break;
3003 		}
3004 
3005 		if (rsize > size) {
3006 			return EINVAL;
3007 		}
3008 
3009 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
3010 
3011 		for (i = 0; i < mh->ncmds; i++) {
3012 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
3013 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
3014 				bases[i] = (uint64_t)sg->vmaddr;
3015 			}
3016 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
3017 		}
3018 
3019 		error = copyout(bases, valuep, rsize);
3020 
3021 		kfree_data(bases, rsize);
3022 
3023 		if (error) {
3024 			return error;
3025 		}
3026 	}
3027 	break;
3028 	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
3029 	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
3030 	{
3031 #if CONFIG_SPTM
3032 		const uint64_t slide =
3033 		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
3034 #else
3035 		const uint64_t slide = 0;
3036 #endif
3037 
3038 		if (sizeof(slide) != size) {
3039 			return EINVAL;
3040 		}
3041 
3042 		error = copyout(&slide, valuep, sizeof(slide));
3043 		if (error) {
3044 			return error;
3045 		}
3046 		rsize = size;
3047 	}
3048 	break;
3049 	default:
3050 		return EINVAL;
3051 	}
3052 
3053 	if (IS_64BIT_PROCESS(p)) {
3054 		user64_size_t size64 = (user64_size_t)rsize;
3055 		error = copyout(&size64, sizep, sizeof(size64));
3056 	} else {
3057 		user32_size_t size32 = (user32_size_t)rsize;
3058 		error = copyout(&size32, sizep, sizeof(size32));
3059 	}
3060 
3061 	return error;
3062 #endif /* CONFIG_KAS_INFO */
3063 }
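
/*
 * Usage sketch (userspace, root only; illustrative): fetching the kernel
 * text slide through the kas_info() wrapper declared in <sys/kas_info.h>:
 *
 *	uint64_t slide = 0;
 *	size_t size = sizeof(slide);
 *	if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) != 0) {
 *		perror("kas_info");	// EPERM unless the caller is superuser
 *	}
 */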
3064 
3065 #pragma clang diagnostic push
3066 #pragma clang diagnostic ignored "-Wcast-qual"
3067 #pragma clang diagnostic ignored "-Wunused-function"
3068 
3069 static void
3070 asserts()
3071 {
3072 	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
3073 	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
3074 }
3075 
3076 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
3077 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
3078 #pragma clang diagnostic pop
3079 
3080 extern uint32_t vm_page_pages;
3081 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
3082 
3083 extern uint32_t vm_page_busy_absent_skipped;
3084 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
3085 
3086 extern uint32_t vm_page_upl_tainted;
3087 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
3088 
3089 extern uint32_t vm_page_iopl_tainted;
3090 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
3091 
3092 #if __arm64__ && (DEVELOPMENT || DEBUG)
3093 extern int vm_footprint_suspend_allowed;
3094 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
3095 
3096 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
3097 static int
3098 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
3099 {
3100 #pragma unused(oidp, arg1, arg2)
3101 	int error = 0;
3102 	int new_value;
3103 
3104 	if (req->newptr == USER_ADDR_NULL) {
3105 		return 0;
3106 	}
3107 	error = SYSCTL_IN(req, &new_value, sizeof(int));
3108 	if (error) {
3109 		return error;
3110 	}
3111 	if (!vm_footprint_suspend_allowed) {
3112 		if (new_value != 0) {
3113 			/* suspends are not allowed... */
3114 			return 0;
3115 		}
3116 		/* ... but let resumes proceed */
3117 	}
3118 	DTRACE_VM2(footprint_suspend,
3119 	    vm_map_t, current_map(),
3120 	    int, new_value);
3121 
3122 	pmap_footprint_suspend(current_map(), new_value);
3123 
3124 	return 0;
3125 }
3126 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
3127     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3128     0, 0, &sysctl_vm_footprint_suspend, "I", "");
3129 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
3130 
3131 extern uint64_t vm_map_corpse_footprint_count;
3132 extern uint64_t vm_map_corpse_footprint_size_avg;
3133 extern uint64_t vm_map_corpse_footprint_size_max;
3134 extern uint64_t vm_map_corpse_footprint_full;
3135 extern uint64_t vm_map_corpse_footprint_no_buf;
3136 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
3137     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
3138 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
3139     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
3140 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
3141     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
3142 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
3143     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
3144 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
3145     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
3146 
3147 #if CODE_SIGNING_MONITOR
3148 extern uint64_t vm_cs_defer_to_csm;
3149 extern uint64_t vm_cs_defer_to_csm_not;
3150 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
3151     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
3152 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
3153     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
3154 #endif /* CODE_SIGNING_MONITOR */
3155 
3156 extern uint64_t shared_region_pager_copied;
3157 extern uint64_t shared_region_pager_slid;
3158 extern uint64_t shared_region_pager_slid_error;
3159 extern uint64_t shared_region_pager_reclaimed;
3160 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
3161     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
3162 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
3163     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
3164 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
3165     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
3166 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
3167     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
3168 extern int shared_region_destroy_delay;
3169 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
3170     CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
3171 
3172 #if MACH_ASSERT
3173 extern int pmap_ledgers_panic_leeway;
3174 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
3175 #endif /* MACH_ASSERT */
3176 
3177 
3178 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
3179 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
3180 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
3181 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
3182 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
3183 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
3184 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
3185 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
3186 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
3187 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
3188 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
3189 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
3190 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
3191 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
3192     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
3193 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
3194     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
3195 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
3196     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
3197 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
3198     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
3199 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
3200     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
3201 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
3202     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
3203 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
3204     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
3205 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
3206     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
3207 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
3208     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
3209 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
3210     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
3211 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
3212     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
3213 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
3214     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
3215 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
3216     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
3217 
3218 extern int vm_protect_privileged_from_untrusted;
3219 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
3220     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
3221 extern uint64_t vm_copied_on_read;
3222 extern uint64_t vm_copied_on_read_kernel_map;
3223 extern uint64_t vm_copied_on_read_platform_map;
3224 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
3225     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
3226 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_kernel_map,
3227     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_kernel_map, "");
3228 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_platform_map,
3229     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_platform_map, "");
3230 
3231 extern int vm_shared_region_count;
3232 extern int vm_shared_region_peak;
3233 SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
3234     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
3235 SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
3236     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
3237 #if DEVELOPMENT || DEBUG
3238 extern unsigned int shared_region_pagers_resident_count;
3239 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
3240     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
3241 extern unsigned int shared_region_pagers_resident_peak;
3242 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
3243     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
3244 extern int shared_region_pager_count;
3245 SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
3246     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
3247 #if __has_feature(ptrauth_calls)
3248 extern int shared_region_key_count;
3249 SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
3250     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
3251 extern int vm_shared_region_reslide_count;
3252 SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
3253     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
3254 #endif /* __has_feature(ptrauth_calls) */
3255 #endif /* DEVELOPMENT || DEBUG */
3256 
3257 #if MACH_ASSERT
3258 extern int debug4k_filter;
3259 SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
3260 extern int debug4k_panic_on_terminate;
3261 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
3262 extern int debug4k_panic_on_exception;
3263 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
3264 extern int debug4k_panic_on_misaligned_sharing;
3265 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
3266 #endif /* MACH_ASSERT */
3267 
3268 extern uint64_t vm_map_set_size_limit_count;
3269 extern uint64_t vm_map_set_data_limit_count;
3270 extern uint64_t vm_map_enter_RLIMIT_AS_count;
3271 extern uint64_t vm_map_enter_RLIMIT_DATA_count;
3272 SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
3273 SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
3274 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
3275 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
3276 
3277 extern uint64_t vm_fault_resilient_media_initiate;
3278 extern uint64_t vm_fault_resilient_media_retry;
3279 extern uint64_t vm_fault_resilient_media_proceed;
3280 extern uint64_t vm_fault_resilient_media_release;
3281 extern uint64_t vm_fault_resilient_media_abort1;
3282 extern uint64_t vm_fault_resilient_media_abort2;
3283 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
3284 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
3285 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
3286 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
3287 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
3288 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
3289 #if MACH_ASSERT
3290 extern int vm_fault_resilient_media_inject_error1_rate;
3291 extern int vm_fault_resilient_media_inject_error1;
3292 extern int vm_fault_resilient_media_inject_error2_rate;
3293 extern int vm_fault_resilient_media_inject_error2;
3294 extern int vm_fault_resilient_media_inject_error3_rate;
3295 extern int vm_fault_resilient_media_inject_error3;
3296 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
3297 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
3298 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
3299 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
3300 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
3301 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
3302 #endif /* MACH_ASSERT */
3303 
3304 extern uint64_t pmap_query_page_info_retries;
3305 SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
3306 
3307 /*
3308  * A sysctl which causes all existing shared regions to become stale. They
3309  * will no longer be used by anything new and will be torn down as soon as
3310  * the last existing user exits. A write of non-zero value causes that to happen.
3311  * This should only be used by launchd, so we check that this is initproc.
3312  */
3313 static int
3314 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3315 {
3316 	unsigned int value = 0;
3317 	int changed = 0;
3318 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3319 	if (error || !changed) {
3320 		return error;
3321 	}
3322 	if (current_proc() != initproc) {
3323 		return EPERM;
3324 	}
3325 
3326 	vm_shared_region_pivot();
3327 
3328 	return 0;
3329 }
3330 
3331 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
3332     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
3333     0, 0, shared_region_pivot, "I", "");
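
/*
 * Usage sketch (illustrative): only initproc may trigger the pivot, and any
 * non-zero write does it, e.g.
 *
 *	int one = 1;
 *	sysctlbyname("vm.shared_region_pivot", NULL, NULL, &one, sizeof(one));
 */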
3334 
3335 extern uint64_t vm_object_shadow_forced;
3336 extern uint64_t vm_object_shadow_skipped;
3337 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
3338     &vm_object_shadow_forced, "");
3339 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
3340     &vm_object_shadow_skipped, "");
3341 
3342 extern uint64_t vm_object_upl_throttle_cnt;
3343 SYSCTL_QUAD(_vm, OID_AUTO, object_upl_throttle_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
3344     &vm_object_upl_throttle_cnt,
3345     "The number of times in which a UPL write was throttled due to pageout starvation");
3346 
3347 
3348 SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
3349     &vmtc_total, 0, "total text page corruptions detected");
3350 
3351 
3352 #if DEBUG || DEVELOPMENT
3353 /*
3354  * A sysctl that can be used to corrupt a text page with an illegal instruction.
3355  * Used for testing text page self healing.
3356  */
3357 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
3358 static int
3359 corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3360 {
3361 	uint64_t value = 0;
3362 	int error = sysctl_handle_quad(oidp, &value, 0, req);
3363 	if (error || !req->newptr) {
3364 		return error;
3365 	}
3366 
3367 	if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
3368 		return 0;
3369 	} else {
3370 		return EINVAL;
3371 	}
3372 }
3373 
3374 SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
3375     CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3376     0, 0, corrupt_text_addr, "-", "");
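
/*
 * Usage sketch (DEVELOPMENT/DEBUG kernels only; illustrative): from a root
 * shell, something like
 *	sysctl -w vm.corrupt_text_addr=0x<some text address>
 * plants an illegal instruction at that address so that the text-page
 * self-healing path counted by vm.vmtc_total can be exercised.
 */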
3377 #endif /* DEBUG || DEVELOPMENT */
3378 
3379 #if CONFIG_MAP_RANGES
3380 /*
3381  * vm.malloc_ranges
3382  *
3383  * space-separated list of <left:right> hexadecimal addresses.
3384  */
3385 static int
3386 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
3387 {
3388 	vm_map_t map = current_map();
3389 	struct mach_vm_range r1, r2;
3390 	char str[20 * 4];
3391 	int len;
3392 	mach_vm_offset_t right_hole_max;
3393 
3394 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
3395 		return ENOENT;
3396 	}
3397 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
3398 		return ENOENT;
3399 	}
3400 
3401 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
3402 	right_hole_max = MACH_VM_JUMBO_ADDRESS;
3403 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
3404 	right_hole_max = get_map_max(map);
3405 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
3406 
3407 	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
3408 	    r1.max_address, r2.min_address,
3409 	    r2.max_address, right_hole_max);
3410 
3411 	return SYSCTL_OUT(req, str, len);
3412 }
3413 
3414 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
3415     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3416     0, 0, &vm_map_malloc_ranges, "A", "");
3417 
3418 #if DEBUG || DEVELOPMENT
3419 static int
3420 vm_map_user_range_default SYSCTL_HANDLER_ARGS
3421 {
3422 #pragma unused(arg1, arg2, oidp)
3423 	struct mach_vm_range range;
3424 
3425 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
3426 	    != KERN_SUCCESS) {
3427 		return EINVAL;
3428 	}
3429 
3430 	return SYSCTL_OUT(req, &range, sizeof(range));
3431 }
3432 
3433 static int
3434 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3435 {
3436 #pragma unused(arg1, arg2, oidp)
3437 	struct mach_vm_range range;
3438 
3439 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3440 	    != KERN_SUCCESS) {
3441 		return EINVAL;
3442 	}
3443 
3444 	return SYSCTL_OUT(req, &range, sizeof(range));
3445 }
3446 
3447 static int
3448 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3449 {
3450 #pragma unused(arg1, arg2, oidp)
3451 	struct mach_vm_range range;
3452 
3453 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3454 	    != KERN_SUCCESS) {
3455 		return EINVAL;
3456 	}
3457 
3458 	return SYSCTL_OUT(req, &range, sizeof(range));
3459 }
3460 
3461 /*
3462  * A sysctl that can be used to return ranges for the current VM map.
3463  * Used for testing VM ranges.
3464  */
3465 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3466     0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
3467 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3468     0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
3469 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3470     0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
3471 
3472 #endif /* DEBUG || DEVELOPMENT */
3473 #endif /* CONFIG_MAP_RANGES */
3474 
3475 #if DEBUG || DEVELOPMENT
3476 #endif /* DEBUG || DEVELOPMENT */
3477 
3478 extern uint64_t vm_map_range_overflows_count;
3479 SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
3480 extern boolean_t vm_map_range_overflows_log;
3481 SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3482 
3483 extern uint64_t c_seg_filled_no_contention;
3484 extern uint64_t c_seg_filled_contention;
3485 extern clock_sec_t c_seg_filled_contention_sec_max;
3486 extern clock_nsec_t c_seg_filled_contention_nsec_max;
3487 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
3488 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
3489 SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
3490 SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
3491 #if (XNU_TARGET_OS_OSX && __arm64__)
3492 extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
3493 extern int c_process_major_yield_after; /* yield after moving ? segments */
3494 extern uint64_t c_process_major_reports;
3495 extern clock_sec_t c_process_major_max_sec;
3496 extern clock_nsec_t c_process_major_max_nsec;
3497 extern uint32_t c_process_major_peak_segcount;
3498 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
3499 SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
3500 SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
3501 SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
3502 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
3503 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
3504 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3505 
3506 #if DEVELOPMENT || DEBUG
3507 extern int panic_object_not_alive;
3508 SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
3509 #endif /* DEVELOPMENT || DEBUG */
3510 
3511 #if FBDP_DEBUG_OBJECT_NO_PAGER
3512 extern int fbdp_no_panic;
3513 SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
3514 #endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
3515 
3516 extern uint64_t cluster_direct_write_wired;
3517 SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");
3518 
3519 extern uint64_t vm_object_pageout_not_on_queue;
3520 extern uint64_t vm_object_pageout_not_pageable;
3521 extern uint64_t vm_object_pageout_pageable;
3522 extern uint64_t vm_object_pageout_active_local;
3523 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_on_queue, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_on_queue, "");
3524 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_pageable, "");
3525 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_pageable, "");
3526 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_active_local, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_active_local, "");
3527 
3528 
3529 #if DEVELOPMENT || DEBUG
3530 
3531 static uint32_t
3532 sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
3533 {
3534 #pragma unused(with_data)
3535 	return VM_C_SEGMENT_INFO_MAGIC;
3536 }
3537 
3538 /* The largest possible single segment + its slots is
3539  * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
3540 #define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)
3541 
3542 extern uint32_t c_segments_available;
3543 
3544 struct sysctl_buf_header {
3545 	uint32_t magic;
3546 } __attribute__((packed));
3547 
3548 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3549  * Instead of doing everything here, it calls into a helper in vm_compressor.c. */
3550 static int
3551 sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
3552 {
3553 	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3554 	if (!buf) {
3555 		return ENOMEM;
3556 	}
3557 	size_t offset = 0;
3558 	int error = 0;
3559 	int segno = 0;
3560 	/* 4-byte header identifying the version of the data format.
3561 	 * The magic should be bumped whenever c_segment_info or c_slot_info change. */
3562 	((struct sysctl_buf_header*)buf)->magic = sysctl_compressor_seg_magic(with_data);
3563 	offset += sizeof(uint32_t);
3564 
3565 	while (segno < c_segments_available) {
3566 		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3567 		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
3568 		if (kr == KERN_NO_SPACE) {
3569 			/* failed to add another segment, push the current buffer out and try again */
3570 			if (offset == 0) {
3571 				error = EINVAL; /* no space to write but I didn't write anything, shouldn't really happen */
3572 				goto out;
3573 			}
3574 			/* write out chunk */
3575 			error = SYSCTL_OUT(req, buf, offset);
3576 			if (error) {
3577 				goto out;
3578 			}
3579 			offset = 0;
3580 			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3581 			/* don't increment segno, need to try again saving the current one */
3582 		} else if (kr != KERN_SUCCESS) {
3583 			error = EINVAL;
3584 			goto out;
3585 		} else {
3586 			offset += left_sz;
3587 			++segno;
3588 			assert(offset <= SYSCTL_SEG_BUF_SIZE);
3589 		}
3590 	}
3591 
3592 	if (offset > 0) { /* write last chunk */
3593 		error = SYSCTL_OUT(req, buf, offset);
3594 	}
3595 
3596 out:
3597 	kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
3598 	return error;
3599 }
3600 
3601 static int
3602 sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3603 {
3604 	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
3605 }
3606 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
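
/*
 * Reader sketch (userspace, illustrative): the handler streams
 * variable-length records, so a consumer can probe for the size first and
 * treat the leading 4 bytes as the format magic:
 *
 *	size_t len = 0;
 *	sysctlbyname("vm.compressor_segments", NULL, &len, NULL, 0);	// probe size
 *	char *buf = malloc(len);
 *	if (buf && sysctlbyname("vm.compressor_segments", buf, &len, NULL, 0) == 0) {
 *		uint32_t magic;
 *		memcpy(&magic, buf, sizeof(magic));	// expect VM_C_SEGMENT_INFO_MAGIC
 *	}
 */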
3607 
3608 
3609 extern uint32_t vm_compressor_fragmentation_level(void);
3610 
3611 static int
3612 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3613 {
3614 	uint32_t value = vm_compressor_fragmentation_level();
3615 	return SYSCTL_OUT(req, &value, sizeof(value));
3616 }
3617 
3618 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3619 
3620 extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);
3621 
3622 static int
3623 sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3624 {
3625 	uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
3626 	return SYSCTL_OUT(req, &value, sizeof(value));
3627 }
3628 
3629 SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");
3630 
3631 
3632 
3633 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3634 
3635 
3636 /* This sysctl iterates over all the entries of a given process's vm_map and writes some info about the vm_object each entry points to.
3637  * This can be used to map where all of a process's pages are located in the compressor.
3638  */
3639 static int
3640 sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3641 {
3642 	int error = 0;
3643 	char *buf = NULL;
3644 	proc_t p = PROC_NULL;
3645 	task_t task = TASK_NULL;
3646 	vm_map_t map = VM_MAP_NULL;
3647 	__block size_t offset = 0;
3648 
3649 	/* Go from pid to proc to task to vm_map; see sysctl_procargsx() for another example of this progression. */
3650 	int *name = arg1;
3651 	int namelen = arg2;
3652 	if (namelen < 1) {
3653 		return EINVAL;
3654 	}
3655 	int pid = name[0];
3656 	p = proc_find(pid);  /* this increments a reference to the proc */
3657 	if (p == PROC_NULL) {
3658 		return EINVAL;
3659 	}
3660 	task = proc_task(p);
3661 	if (task != TASK_NULL) {
3662 		/* convert the proc reference to a task reference while the proc ref still pins the task */
3663 		task_reference(task);
3664 	}
3665 	proc_rele(p);  /* decrement ref of proc */
3666 	p = PROC_NULL;
3667 	if (task == TASK_NULL) {
3668 		return EINVAL;
3669 	}
3668 	/* task reference to map reference */
3669 	map = get_task_map_reference(task);
3670 	task_deallocate(task);
3671 
3672 	if (map == VM_MAP_NULL) {
3673 		return EINVAL;  /* nothing allocated yet */
3674 	}
3675 
3676 	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
3677 	if (!buf) {
3678 		error = ENOMEM;
3679 		goto out;
3680 	}
3681 
3682 	/* 4-byte header identifying the version of the data format.
3683 	 * The magic should be bumped whenever the map-entry info structures change. */
3684 	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
3685 	offset += sizeof(uint32_t);
3686 
3687 	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
3688 		/* write the header, happens only once at the beginning so we should have enough space */
3689 		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3690 		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
3691 		out_hdr->vmi_nentries = nentries;
3692 		offset += sizeof(struct vm_map_info_hdr);
3693 		return KERN_SUCCESS;
3694 	};
3695 
3696 	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
3697 		while (true) { /* try up to 2 times: first try writing into the current buffer, otherwise flush it and retry with an empty one */
3698 			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
3699 			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
3700 			if (kr == KERN_NO_SPACE) {
3701 				/* failed to write anything, flush the current buffer and try again */
3702 				if (offset == 0) {
3703 					return KERN_FAILURE; /* no space to write but I didn't write anything yet, shouldn't really happen */
3704 				}
3705 				/* write out chunk */
3706 				int out_error = SYSCTL_OUT(req, buf, offset);
3707 				if (out_error) {
3708 					return KERN_FAILURE;
3709 				}
3710 				offset = 0;
3711 				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3712 				continue; /* need to retry the entry dump again with the cleaned buffer */
3713 			} else if (kr != KERN_SUCCESS) {
3714 				return kr;
3715 			}
3716 			offset += left_sz;
3717 			break;
3718 		}
3719 		return KERN_SUCCESS;
3720 	};
3721 
3722 	/* This foreach first invokes the header callback with the number of entries, then the entry callback for every entry.
3723 	 * When the buffer is exhausted, it is flushed to the sysctl and refilled. */
3724 	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);
3725 
3726 	if (kr != KERN_SUCCESS) {
3727 		goto out;
3728 	}
3729 
3730 	if (offset > 0) { /* last chunk */
3731 		error = SYSCTL_OUT(req, buf, offset);
3732 	}
3733 
3734 out:
3735 	if (buf != NULL) {
3736 		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3737 	}
3738 	if (map != NULL) {
3739 		vm_map_deallocate(map);
3740 	}
3741 	return error;
3742 }
3743 
3744 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
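
/*
 * Usage sketch (userspace, illustrative): the OID is a node, so the target
 * pid travels as a trailing name element rather than via newp:
 *
 *	int mib[CTL_MAXNAME];
 *	size_t n = CTL_MAXNAME;
 *	sysctlnametomib("vm.task_vm_objects_slotmap", mib, &n);
 *	mib[n++] = pid;
 *	sysctl(mib, (u_int)n, buf, &len, NULL, 0);
 */
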
3745 static int
3746 sysctl_vm_reset_tag SYSCTL_HANDLER_ARGS
3747 {
3748 #pragma unused(oidp, arg1, arg2)
3749 	int error;
3750 	int tag;
3751 	kern_return_t kr;
3752 
3753 	/* Need to be root */
3754 	if (!kauth_cred_issuser(kauth_cred_get())) {
3755 		return EPERM;
3756 	}
3757 
3758 	error = SYSCTL_IN(req, &tag, sizeof(tag));
3759 	if (error) {
3760 		return error;
3761 	}
3762 
3763 	if (tag < 0 || tag > VM_MAX_TAG_VALUE) {
3764 		return EINVAL;
3765 	}
3766 
3767 	kr = vm_tag_reset_peak((vm_tag_t)tag);
3768 
3769 	return mach_to_bsd_errno(kr);
3770 }
3771 
3772 SYSCTL_PROC(_vm, OID_AUTO, reset_tag,
3773     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
3774     0, 0, &systctl_vm_reset_tag, "I", "");
3775 
3776 static int
3777 sysctl_vm_reset_all_tags SYSCTL_HANDLER_ARGS
3778 {
3779 #pragma unused(oidp, arg1, arg2)
3780 	/* Only reset the values if the sysctl is a write */
3781 	if (!req->newptr) {
3782 		return EINVAL;
3783 	}
3784 
3785 	/* Need to be root */
3786 	if (!kauth_cred_issuser(kauth_cred_get())) {
3787 		return EPERM;
3788 	}
3789 
3790 	vm_tag_reset_all_peaks();
3791 
3792 	return 0;
3793 }
3794 
3795 SYSCTL_PROC(_vm, OID_AUTO, reset_all_tags,
3796     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
3797     0, 0, &systctl_vm_reset_all_tags, "I", "");
3798 
3799 #endif /* DEVELOPMENT || DEBUG */
3800