xref: /xnu-12377.61.12/bsd/vm/vm_unix.c (revision 4d495c6e23c53686cf65f45067f79024cf5dcee8)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Mach Operating System
30  * Copyright (c) 1987 Carnegie-Mellon University
31  * All rights reserved.  The CMU software License Agreement specifies
32  * the terms and conditions for use and redistribution.
33  */
34 /*
35  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
36  * support for mandatory and extensible security protections.  This notice
37  * is included in support of clause 2.2 (b) of the Apple Public License,
38  * Version 2.0.
39  */
40 #include <vm/vm_options.h>
41 
42 #include <kern/ecc.h>
43 #include <kern/task.h>
44 #include <kern/thread.h>
45 #include <kern/debug.h>
46 #include <kern/extmod_statistics.h>
47 #include <mach/mach_traps.h>
48 #include <mach/port.h>
49 #include <mach/sdt.h>
50 #include <mach/task.h>
51 #include <mach/task_access.h>
52 #include <mach/task_special_ports.h>
53 #include <mach/time_value.h>
54 #include <mach/vm_map.h>
55 #include <mach/vm_param.h>
56 #include <mach/vm_prot.h>
57 #include <machine/machine_routines.h>
58 
59 #include <sys/file_internal.h>
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/dir.h>
63 #include <sys/namei.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/vm.h>
67 #include <sys/file.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/mount.h>
70 #include <sys/xattr.h>
71 #include <sys/trace.h>
72 #include <sys/kernel.h>
73 #include <sys/ubc_internal.h>
74 #include <sys/user.h>
75 #include <sys/syslog.h>
76 #include <sys/stat.h>
77 #include <sys/sysproto.h>
78 #include <sys/mman.h>
79 #include <sys/sysctl.h>
80 #include <sys/cprotect.h>
81 #include <sys/kpi_socket.h>
82 #include <sys/kas_info.h>
83 #include <sys/socket.h>
84 #include <sys/socketvar.h>
85 #include <sys/random.h>
86 #include <sys/code_signing.h>
87 #if NECP
88 #include <net/necp.h>
89 #endif /* NECP */
90 #if SKYWALK
91 #include <skywalk/os_channel.h>
92 #endif /* SKYWALK */
93 
94 #include <security/audit/audit.h>
95 #include <security/mac.h>
96 #include <bsm/audit_kevents.h>
97 
98 #include <kern/kalloc.h>
99 #include <kern/host_statistics.h>
100 
101 #include <vm/vm_map_internal.h>
102 #include <vm/vm_kern_xnu.h>
103 #include <vm/vm_pageout_xnu.h>
104 
105 #include <mach/shared_region.h>
106 #include <vm/vm_shared_region_internal.h>
107 
108 #include <vm/vm_dyld_pager_internal.h>
109 #include <vm/vm_protos_internal.h>
110 #include <vm/vm_compressor_info.h>         /* for c_segment_info */
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_compressor_xnu.h>          /* for vm_compressor_serialize_segment_debug_info() */
113 #include <vm/vm_object_xnu.h>              /* for vm_chead_select_t */
114 #include <vm/vm_memory_entry_xnu.h>
115 #include <vm/vm_iokit.h>
116 #include <vm/vm_reclaim_xnu.h>
117 #if HAS_MTE
118 #include <vm/vm_compressor_xnu.h>
119 #include <vm/vm_mteinfo_internal.h>
120 #endif /* HAS_MTE */
121 
122 #include <sys/kern_memorystatus.h>
123 #include <sys/kern_memorystatus_freeze.h>
124 #include <sys/proc_internal.h>
125 
126 #include <mach-o/fixup-chains.h>
127 
128 #if CONFIG_MACF
129 #include <security/mac_framework.h>
130 #endif
131 
132 #include <kern/bits.h>
133 
134 #if CONFIG_CSR
135 #include <sys/csr.h>
136 #endif /* CONFIG_CSR */
137 #include <sys/trust_caches.h>
138 #include <libkern/amfi/amfi.h>
139 #include <IOKit/IOBSD.h>
140 
141 #if VM_MAP_DEBUG_APPLE_PROTECT
142 SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
143 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
144 
145 #if DEVELOPMENT || DEBUG
146 
147 extern int vm_object_cache_evict_all(void);
148 static int
149 sysctl_vm_object_cache_evict SYSCTL_HANDLER_ARGS
150 {
151 #pragma unused(arg1, arg2, req)
152 	(void) vm_object_cache_evict_all();
153 	return 0;
154 }
155 
156 SYSCTL_PROC(_vm, OID_AUTO, object_cache_evict, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
157     0, 0, &sysctl_vm_object_cache_evict, "I", "");
158 
159 static int
160 sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
161 {
162 #pragma unused(arg1, arg2)
163 	vm_offset_t     kaddr;
164 	kern_return_t   kr;
165 	int     error = 0;
166 	int     size = 0;
167 
168 	error = sysctl_handle_int(oidp, &size, 0, req);
169 	if (error || !req->newptr) {
170 		return error;
171 	}
172 
173 	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
174 	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);
175 
176 	if (kr == KERN_SUCCESS) {
177 		kmem_free(kernel_map, kaddr, size);
178 	}
179 
180 	return error;
181 }
182 
183 SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
184     0, 0, &sysctl_kmem_alloc_contig, "I", "");
185 
186 extern int vm_region_footprint;
187 SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");
188 
189 static int
190 sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
191 {
192 #pragma unused(arg1, arg2, oidp)
193 	kmem_gobj_stats stats = kmem_get_gobj_stats();
194 
195 	return SYSCTL_OUT(req, &stats, sizeof(stats));
196 }
197 
198 SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
199     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
200     0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");
201 
202 #endif /* DEVELOPMENT || DEBUG */
203 
204 static int
205 sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
206 {
207 #pragma unused(arg1, arg2, oidp)
208 	int     error = 0;
209 	int     value;
210 
211 	value = task_self_region_footprint();
212 	error = SYSCTL_OUT(req, &value, sizeof(int));
213 	if (error) {
214 		return error;
215 	}
216 
217 	if (!req->newptr) {
218 		return 0;
219 	}
220 
221 	error = SYSCTL_IN(req, &value, sizeof(int));
222 	if (error) {
223 		return error;
224 	}
225 	task_self_region_footprint_set(value);
226 	return 0;
227 }
228 SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");
229 
230 static int
231 sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
232 {
233 #pragma unused(arg1, arg2, oidp)
234 	int     error = 0;
235 	int     value;
236 
237 	value = (1 << thread_self_region_page_shift());
238 	error = SYSCTL_OUT(req, &value, sizeof(int));
239 	if (error) {
240 		return error;
241 	}
242 
243 	if (!req->newptr) {
244 		return 0;
245 	}
246 
247 	error = SYSCTL_IN(req, &value, sizeof(int));
248 	if (error) {
249 		return error;
250 	}
251 
252 	if (value != 0 && value != 4096 && value != 16384) {
253 		return EINVAL;
254 	}
255 
256 #if !__ARM_MIXED_PAGE_SIZE__
257 	if (value != vm_map_page_size(current_map())) {
258 		return EINVAL;
259 	}
260 #endif /* !__ARM_MIXED_PAGE_SIZE__ */
261 
262 	thread_self_region_page_shift_set(bit_first(value));
263 	return 0;
264 }
265 SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
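
/*
 * Example (sketch, not part of xnu): a test that wants vm_region() output
 * in 4K units on a 16K-page system can opt its own thread in from user
 * space; the handler above accepts only 0, 4096 and 16384:
 *
 *	int pgsz = 4096;
 *	sysctlbyname("vm.self_region_page_size", NULL, NULL,
 *	    &pgsz, sizeof(pgsz));
 */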
266 
267 static int
268 sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
269 {
270 #pragma unused(arg1, arg2, oidp)
271 	int     error = 0;
272 	int     value;
273 	kern_return_t kr;
274 
275 	value = task_self_region_info_flags();
276 	error = SYSCTL_OUT(req, &value, sizeof(int));
277 	if (error) {
278 		return error;
279 	}
280 
281 	if (!req->newptr) {
282 		return 0;
283 	}
284 
285 	error = SYSCTL_IN(req, &value, sizeof(int));
286 	if (error) {
287 		return error;
288 	}
289 
290 	kr = task_self_region_info_flags_set(value);
291 	if (kr != KERN_SUCCESS) {
292 		return EINVAL;
293 	}
294 
295 	return 0;
296 }
297 SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");
298 
299 
300 #if DEVELOPMENT || DEBUG
301 extern int panic_on_unsigned_execute;
302 SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");
303 
304 extern int vm_log_xnu_user_debug;
305 SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
306 #endif /* DEVELOPMENT || DEBUG */
307 
308 extern int vm_log_map_delete_permanent_prot_none;
309 SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");
310 
311 extern int cs_executable_create_upl;
312 extern int cs_executable_wire;
313 SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
314 SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");
315 
316 extern int apple_protect_pager_count;
317 extern int apple_protect_pager_count_mapped;
318 extern unsigned int apple_protect_pager_cache_limit;
319 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
320 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
321 SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");
322 
323 #if DEVELOPMENT || DEBUG
324 extern int radar_20146450;
325 SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");
326 
327 extern int macho_printf;
328 SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");
329 
330 extern int apple_protect_pager_data_request_debug;
331 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");
332 
333 extern unsigned int vm_object_copy_delayed_paging_wait_disable;
334 EXPERIMENT_FACTOR_LEGACY_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");
335 
336 __enum_closed_decl(vm_submap_test_op, uint32_t, {
337 	vsto_make_submap = 1,  /* make submap from entries in current_map()
338 	                        * at start..end, offset ignored */
339 	vsto_remap_submap = 2, /* map in current_map() at start..end,
340 	                        * from parent address submap_base_address
341 	                        * and submap address offset */
342 	vsto_end
343 });
344 
345 static int
346 sysctl_vm_submap_test_ctl SYSCTL_HANDLER_ARGS
347 {
348 	int error;
349 	struct {
350 		vm_submap_test_op op;
351 		mach_vm_address_t submap_base_address;
352 		mach_vm_address_t start;
353 		mach_vm_address_t end;
354 		mach_vm_address_t offset;
355 	} args;
356 	if (req->newlen != sizeof(args)) {
357 		return EINVAL;
358 	}
359 	error = SYSCTL_IN(req, &args, sizeof(args));
360 	if (error) {
361 		return error;
362 	}
363 
364 	switch (args.op) {
365 	case vsto_make_submap:
366 		vm_map_testing_make_sealed_submap(current_map(), args.start, args.end);
367 		break;
368 	case vsto_remap_submap:
369 		vm_map_testing_remap_submap(current_map(),
370 		    args.submap_base_address, args.start, args.end, args.offset);
371 		break;
372 	default:
373 		return EINVAL;
374 	}
375 
376 	return 0;
377 }
378 SYSCTL_PROC(_vm, OID_AUTO, submap_test_ctl, CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, &sysctl_vm_submap_test_ctl, "-", "");
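
/*
 * Example (hedged sketch, not part of xnu): driving vm.submap_test_ctl from
 * a DEVELOPMENT/DEBUG test.  The handler rejects any write whose newlen
 * differs from sizeof(args), so a user-space caller must mirror the kernel
 * struct exactly, including the 4 bytes of padding the compiler inserts
 * after the 32-bit `op` field:
 *
 *	struct {                                // hypothetical mirror
 *		uint32_t          op;           // vsto_make_submap == 1
 *		uint32_t          _pad;         // implicit padding above
 *		mach_vm_address_t submap_base_address;
 *		mach_vm_address_t start;
 *		mach_vm_address_t end;
 *		mach_vm_address_t offset;       // ignored for op 1
 *	} args = { .op = 1, .start = base, .end = base + size };
 *	sysctlbyname("vm.submap_test_ctl", NULL, NULL, &args, sizeof(args));
 */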
379 
380 #if __arm64__
381 /* These are meant to support the page table accounting unit test. */
382 extern unsigned int arm_hardware_page_size;
383 extern unsigned int arm_pt_desc_size;
384 extern unsigned int arm_pt_root_size;
385 extern unsigned int inuse_user_tteroot_count;
386 extern unsigned int inuse_kernel_tteroot_count;
387 extern unsigned int inuse_user_ttepages_count;
388 extern unsigned int inuse_kernel_ttepages_count;
389 extern unsigned int inuse_user_ptepages_count;
390 extern unsigned int inuse_kernel_ptepages_count;
391 SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
392 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
393 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
394 SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
395 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
396 SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
397 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
398 SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
399 SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
400 #if !CONFIG_SPTM
401 extern unsigned int free_page_size_tt_count;
402 extern unsigned int free_tt_count;
403 SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
404 SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
405 #endif
406 #if DEVELOPMENT || DEBUG
407 extern unsigned long pmap_asid_flushes;
408 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
409 extern unsigned long pmap_asid_hits;
410 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
411 extern unsigned long pmap_asid_misses;
412 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
413 extern unsigned long pmap_speculation_restrictions;
414 SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
415 #endif
416 #endif /* __arm64__ */
417 #endif /* DEVELOPMENT || DEBUG */
418 
419 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
420 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
421 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
422 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
423 SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
424 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
425 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
426 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
427 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
428 #if VM_SCAN_FOR_SHADOW_CHAIN
429 static int vm_shadow_max_enabled = 0;    /* Disabled by default */
430 extern int proc_shadow_max(void);
431 static int
432 vm_shadow_max SYSCTL_HANDLER_ARGS
433 {
434 #pragma unused(arg1, arg2, oidp)
435 	int value = 0;
436 
437 	if (vm_shadow_max_enabled) {
438 		value = proc_shadow_max();
439 	}
440 
441 	return SYSCTL_OUT(req, &value, sizeof(value));
442 }
443 SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
444     0, 0, &vm_shadow_max, "I", "");
445 
446 SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");
447 
448 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
449 
450 SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");
451 
452 #if PAGE_SLEEP_WITH_INHERITOR
453 #if DEVELOPMENT || DEBUG
454 extern uint32_t page_worker_table_size;
455 SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
456 SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
457 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
458 SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
459 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
460 #endif /* DEVELOPMENT || DEBUG */
461 #endif /* PAGE_SLEEP_WITH_INHERITOR */
462 
463 #if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
464 extern uint32_t vm_cheads;
465 extern vm_chead_select_t vm_chead_select;
466 extern boolean_t vm_chead_rehint;
467 #if DEVELOPMENT || DEBUG
468 SYSCTL_UINT(_vm, OID_AUTO, compressor_heads, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cheads, 0, "");
469 SYSCTL_UINT(_vm, OID_AUTO, compressor_head_select, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_select, 0, "");
470 SYSCTL_INT(_vm, OID_AUTO, compressor_head_rehint, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_rehint, 0, "");
471 #endif /* DEVELOPMENT || DEBUG */
472 EXPERIMENT_FACTOR_UINT(compressor_heads, &vm_cheads, 1, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT, "");
473 EXPERIMENT_FACTOR_UINT(compressor_head_select, &vm_chead_select, CSEL_MIN, CSEL_MAX, "");
474 EXPERIMENT_FACTOR_INT(compressor_head_rehint, &vm_chead_rehint, 0, 1, "");
475 #endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */
476 
477 /*
478  * Sysctls related to data/stack execution.  See osfmk/vm/vm_map.c
479  */
480 
481 #if DEVELOPMENT || DEBUG
482 extern int allow_stack_exec, allow_data_exec;
483 
484 SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
485 SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");
486 
487 #endif /* DEVELOPMENT || DEBUG */
488 
489 static const char *prot_values[] = {
490 	"none",
491 	"read-only",
492 	"write-only",
493 	"read-write",
494 	"execute-only",
495 	"read-execute",
496 	"write-execute",
497 	"read-write-execute"
498 };
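
/*
 * The table above is indexed by (prot & VM_PROT_ALL), i.e. the low three
 * protection bits.  For example, VM_PROT_READ | VM_PROT_WRITE == 0x3
 * selects "read-write", and VM_PROT_ALL == 0x7 selects "read-write-execute".
 */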
499 
500 void
501 log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
502 {
503 	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
504 	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
505 }
506 
507 /*
508  * shared_region_unnest_logging: level of logging of unnesting events
509  * 0	- no logging
510  * 1	- throttled logging of unexpected unnesting events (default)
511  * 2	- unthrottled logging of unexpected unnesting events
512  * 3+	- unthrottled logging of all unnesting events
513  */
514 int shared_region_unnest_logging = 1;
515 
516 SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
517     &shared_region_unnest_logging, 0, "");
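
/*
 * Example (sketch, not part of xnu): the level can be raised from user
 * space when chasing an unexpected unnest, e.g.:
 *
 *	int level = 3;	// log every unnesting event, unthrottled
 *	sysctlbyname("vm.shared_region_unnest_logging", NULL, NULL,
 *	    &level, sizeof(level));
 */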
518 
519 int vm_shared_region_unnest_log_interval = 10;
520 int shared_region_unnest_log_count_threshold = 5;
521 
522 
523 #if XNU_TARGET_OS_OSX
524 
525 #if defined (__x86_64__)
526 static int scdir_enforce = 1;
527 #else /* defined (__x86_64__) */
528 static int scdir_enforce = 0;   /* AOT caches live elsewhere */
529 #endif /* defined (__x86_64__) */
530 
531 static char *scdir_path[] = {
532 	"/System/Library/dyld/",
533 	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
534 	"/System/Cryptexes/OS/System/Library/dyld",
535 	NULL
536 };
537 
538 #else /* XNU_TARGET_OS_OSX */
539 
540 static int scdir_enforce = 0;
541 static char *scdir_path[] = {
542 	"/System/Library/Caches/com.apple.dyld/",
543 	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
544 	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
545 	NULL
546 };
547 
548 #endif /* XNU_TARGET_OS_OSX */
549 
550 static char *driverkit_scdir_path[] = {
551 	"/System/DriverKit/System/Library/dyld/",
552 #if XNU_TARGET_OS_OSX
553 	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
554 #else
555 	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
556 #endif /* XNU_TARGET_OS_OSX */
557 	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
558 	NULL
559 };
560 
561 #ifndef SECURE_KERNEL
562 static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
563 {
564 #if CONFIG_CSR
565 	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
566 		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
567 		return EPERM;
568 	}
569 #endif /* CONFIG_CSR */
570 	return sysctl_handle_int(oidp, arg1, arg2, req);
571 }
572 
573 SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
574 #endif
575 
576 /* These log-rate-throttling state variables aren't thread-safe, but
577  * are sufficient unto the task.
578  */
579 static int64_t last_unnest_log_time = 0;
580 static int shared_region_unnest_log_count = 0;
581 
582 void
583 log_unnest_badness(
584 	vm_map_t        m,
585 	vm_map_offset_t s,
586 	vm_map_offset_t e,
587 	boolean_t       is_nested_map,
588 	vm_map_offset_t lowest_unnestable_addr)
589 {
590 	struct timeval  tv;
591 
592 	if (shared_region_unnest_logging == 0) {
593 		return;
594 	}
595 
596 	if (shared_region_unnest_logging <= 2 &&
597 	    is_nested_map &&
598 	    s >= lowest_unnestable_addr) {
599 		/*
600 		 * Unnesting of writable map entries is fine.
601 		 */
602 		return;
603 	}
604 
605 	if (shared_region_unnest_logging <= 1) {
606 		microtime(&tv);
607 		if ((tv.tv_sec - last_unnest_log_time) <
608 		    vm_shared_region_unnest_log_interval) {
609 			if (shared_region_unnest_log_count++ >
610 			    shared_region_unnest_log_count_threshold) {
611 				return;
612 			}
613 		} else {
614 			last_unnest_log_time = tv.tv_sec;
615 			shared_region_unnest_log_count = 0;
616 		}
617 	}
618 
619 	DTRACE_VM4(log_unnest_badness,
620 	    vm_map_t, m,
621 	    vm_map_offset_t, s,
622 	    vm_map_offset_t, e,
623 	    vm_map_offset_t, lowest_unnestable_addr);
624 	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
625 }
626 
627 uint64_t
628 vm_purge_filebacked_pagers(void)
629 {
630 	uint64_t pages_purged;
631 
632 	pages_purged = 0;
633 	pages_purged += apple_protect_pager_purge_all();
634 	pages_purged += shared_region_pager_purge_all();
635 	pages_purged += dyld_pager_purge_all();
636 #if DEVELOPMENT || DEBUG
637 	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
638 #endif /* DEVELOPMENT || DEBUG */
639 	return pages_purged;
640 }
641 
642 int
643 useracc(
644 	user_addr_ut    addr_u,
645 	user_size_ut    len_u,
646 	int             prot)
647 {
648 	vm_map_t        map;
649 	vm_prot_t       vm_prot = VM_PROT_WRITE;
650 
651 	map = current_map();
652 
653 	if (prot == B_READ) {
654 		vm_prot = VM_PROT_READ;
655 	}
656 
657 	return vm_map_check_protection(map, addr_u,
658 	           vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
659 	           VM_SANITIZE_CALLER_USERACC);
660 }
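
/*
 * Example (hedged sketch; the user_addr_ut/user_size_ut sanitizer wrapper
 * types are elided): historical BSD callers use useracc() to pre-flight a
 * user buffer before a transfer:
 *
 *	if (!useracc(user_buf, user_len, B_READ)) {
 *		return EFAULT;	// range is not readable by the caller
 *	}
 *
 * prot == B_READ checks for read access; any other value checks for write.
 */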
661 
662 #if XNU_PLATFORM_MacOSX
663 static __attribute__((always_inline, warn_unused_result))
664 kern_return_t
665 vslock_sanitize(
666 	vm_map_t                map,
667 	user_addr_ut            addr_u,
668 	user_size_ut            len_u,
669 	vm_sanitize_caller_t    vm_sanitize_caller,
670 	vm_map_offset_t        *start,
671 	vm_map_offset_t        *end,
672 	vm_map_size_t          *size)
673 {
674 	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
675 	           map,
676 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
677 	           size);
678 }
679 #endif /* XNU_PLATFORM_MacOSX */
680 
681 int
682 vslock(user_addr_ut addr, user_size_ut len)
683 {
684 	kern_return_t kret;
685 
686 #if XNU_PLATFORM_MacOSX
687 	/*
688 	 * Preserve previous behavior on macOS for overflows, for binary
689 	 * compatibility: i.e. return success for overflows without doing
690 	 * anything. For such overflow errors, sanitization returns
691 	 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr()
692 	 * converts to KERN_SUCCESS.
693 	 */
694 	vm_map_offset_t start, end;
695 	vm_map_size_t   size;
696 
697 	kret = vslock_sanitize(current_map(),
698 	    addr,
699 	    len,
700 	    VM_SANITIZE_CALLER_VSLOCK,
701 	    &start,
702 	    &end,
703 	    &size);
704 	if (__improbable(kret != KERN_SUCCESS)) {
705 		switch (vm_sanitize_get_kr(kret)) {
706 		case KERN_SUCCESS:
707 			return 0;
708 		case KERN_INVALID_ADDRESS:
709 		case KERN_NO_SPACE:
710 			return ENOMEM;
711 		case KERN_PROTECTION_FAILURE:
712 			return EACCES;
713 		default:
714 			return EINVAL;
715 		}
716 	}
717 #endif /* XNU_PLATFORM_MacOSX */
718 
719 	kret = vm_map_wire_kernel(current_map(), addr,
720 	    vm_sanitize_compute_ut_end(addr, len),
721 	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
722 	    VM_KERN_MEMORY_BSD,
723 	    FALSE);
724 
725 	switch (kret) {
726 	case KERN_SUCCESS:
727 		return 0;
728 	case KERN_INVALID_ADDRESS:
729 	case KERN_NO_SPACE:
730 		return ENOMEM;
731 	case KERN_PROTECTION_FAILURE:
732 		return EACCES;
733 	default:
734 		return EINVAL;
735 	}
736 }
737 
738 int
739 vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
740 {
741 #if FIXME  /* [ */
742 	pmap_t          pmap;
743 	vm_page_t       pg;
744 	vm_map_offset_t vaddr;
745 	ppnum_t         paddr;
746 #endif  /* FIXME ] */
747 	kern_return_t   kret;
748 	vm_map_t        map;
749 
750 	map = current_map();
751 
752 #if FIXME  /* [ */
753 	if (dirtied) {
754 		pmap = get_task_pmap(current_task());
755 		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
756 		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
757 		    vaddr += PAGE_SIZE) {
758 			paddr = pmap_find_phys(pmap, vaddr);
759 			pg = PHYS_TO_VM_PAGE(paddr);
760 			vm_page_set_modified(pg);
761 		}
762 	}
763 #endif  /* FIXME ] */
764 #ifdef  lint
765 	dirtied++;
766 #endif  /* lint */
767 
768 #if XNU_PLATFORM_MacOSX
769 	/*
770 	 * Preserve previous behavior on macOS for overflows, for binary
771 	 * compatibility: i.e. return success for overflows without doing
772 	 * anything. For such overflow errors, sanitization returns
773 	 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr()
774 	 * converts to KERN_SUCCESS.
775 	 */
776 	vm_map_offset_t start, end;
777 	vm_map_size_t   size;
778 
779 	kret = vslock_sanitize(map,
780 	    addr,
781 	    len,
782 	    VM_SANITIZE_CALLER_VSUNLOCK,
783 	    &start,
784 	    &end,
785 	    &size);
786 	if (__improbable(kret != KERN_SUCCESS)) {
787 		switch (vm_sanitize_get_kr(kret)) {
788 		case KERN_SUCCESS:
789 			return 0;
790 		case KERN_INVALID_ADDRESS:
791 		case KERN_NO_SPACE:
792 			return ENOMEM;
793 		case KERN_PROTECTION_FAILURE:
794 			return EACCES;
795 		default:
796 			return EINVAL;
797 		}
798 	}
799 #endif /* XNU_PLATFORM_MacOSX */
800 
801 	kret = vm_map_unwire(map, addr,
802 	    vm_sanitize_compute_ut_end(addr, len), false);
803 	switch (kret) {
804 	case KERN_SUCCESS:
805 		return 0;
806 	case KERN_INVALID_ADDRESS:
807 	case KERN_NO_SPACE:
808 		return ENOMEM;
809 	case KERN_PROTECTION_FAILURE:
810 		return EACCES;
811 	default:
812 		return EINVAL;
813 	}
814 }
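
/*
 * Example (sketch): vslock()/vsunlock() bracket a window where user memory
 * must stay resident and wired, as in the classic physio pattern:
 *
 *	if ((error = vslock(usr_addr, usr_len)) != 0) {
 *		return error;
 *	}
 *	// ... transfer into/out of the wired range ...
 *	error = vsunlock(usr_addr, usr_len, 1);	// 1: pages were dirtied
 *
 * Note that the `dirtied` argument is currently unused (see the FIXME
 * block above).
 */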
815 
816 int
817 subyte(
818 	user_addr_t addr,
819 	int byte)
820 {
821 	char character;
822 
823 	character = (char)byte;
824 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
825 }
826 
827 int
828 suibyte(
829 	user_addr_t addr,
830 	int byte)
831 {
832 	char character;
833 
834 	character = (char)byte;
835 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
836 }
837 
838 int
839 fubyte(user_addr_t addr)
840 {
841 	unsigned char byte;
842 
843 	if (copyin(addr, (void *) &byte, sizeof(char))) {
844 		return -1;
845 	}
846 	return byte;
847 }
848 
849 int
850 fuibyte(user_addr_t addr)
851 {
852 	unsigned char byte;
853 
854 	if (copyin(addr, (void *) &(byte), sizeof(char))) {
855 		return -1;
856 	}
857 	return byte;
858 }
859 
860 int
861 suword(
862 	user_addr_t addr,
863 	long word)
864 {
865 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
866 }
867 
868 long
869 fuword(user_addr_t addr)
870 {
871 	long word = 0;
872 
873 	if (copyin(addr, (void *) &word, sizeof(int))) {
874 		return -1;
875 	}
876 	return word;
877 }
878 
879 /* suiword and fuiword are the same as suword and fuword, respectively */
880 
881 int
882 suiword(
883 	user_addr_t addr,
884 	long word)
885 {
886 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
887 }
888 
889 long
890 fuiword(user_addr_t addr)
891 {
892 	long word = 0;
893 
894 	if (copyin(addr, (void *) &word, sizeof(int))) {
895 		return -1;
896 	}
897 	return word;
898 }
899 
900 /*
901  * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
902  * fetching and setting of process-sized size_t and pointer values.
903  */
904 int
905 sulong(user_addr_t addr, int64_t word)
906 {
907 	if (IS_64BIT_PROCESS(current_proc())) {
908 		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
909 	} else {
910 		return suiword(addr, (long)word);
911 	}
912 }
913 
914 int64_t
915 fulong(user_addr_t addr)
916 {
917 	int64_t longword;
918 
919 	if (IS_64BIT_PROCESS(current_proc())) {
920 		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
921 			return -1;
922 		}
923 		return longword;
924 	} else {
925 		return (int64_t)fuiword(addr);
926 	}
927 }
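
/*
 * Example (sketch): sulong()/fulong() move a value at the target task's
 * pointer width.  For a 64-bit process all 8 bytes transfer; for a 32-bit
 * process the value is truncated through suiword()/fuiword():
 *
 *	sulong(uaddr, 0x1122334455667788LL);
 *	// 64-bit task: fulong(uaddr) == 0x1122334455667788
 *	// 32-bit task: only the low 4 bytes are stored
 *
 * Note the in-band error convention: fulong() returns -1 on a fault, which
 * is indistinguishable from a legitimately stored -1.
 */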
928 
929 int
930 suulong(user_addr_t addr, uint64_t uword)
931 {
932 	if (IS_64BIT_PROCESS(current_proc())) {
933 		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
934 	} else {
935 		return suiword(addr, (uint32_t)uword);
936 	}
937 }
938 
939 uint64_t
940 fuulong(user_addr_t addr)
941 {
942 	uint64_t ulongword;
943 
944 	if (IS_64BIT_PROCESS(current_proc())) {
945 		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
946 			return -1ULL;
947 		}
948 		return ulongword;
949 	} else {
950 		return (uint64_t)fuiword(addr);
951 	}
952 }
953 
954 int
955 swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
956 {
957 	return ENOTSUP;
958 }
959 
960 #if defined(SECURE_KERNEL)
961 static int kern_secure_kernel = 1;
962 #else
963 static int kern_secure_kernel = 0;
964 #endif
965 
966 SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
967 SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
968     &shared_region_trace_level, 0, "");
969 SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
970     &shared_region_version, 0, "");
971 SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
972     &shared_region_persistence, 0, "");
973 
974 /*
975  * shared_region_check_np:
976  *
977  * This system call is intended for dyld.
978  *
979  * dyld calls this when any process starts to see if the process's shared
980  * region is already set up and ready to use.
981  * This call returns the base address of the first mapping in the
982  * process's shared region.
983  * dyld will then check what's mapped at that address.
984  *
985  * If the shared region is empty, dyld will then attempt to map the shared
986  * cache file in the shared region via the shared_region_map_and_slide_2_np()
987  * system call.
988  *
989  * If something's already mapped in the shared region, dyld will check if it
990  * matches the shared cache it would like to use for that process.
991  * If it matches, everything's ready and the process can proceed and use the
992  * shared region.
993  * If it doesn't match, dyld will unmap the shared region and map the shared
994  * cache into the process's address space via mmap().
995  *
996  * A NULL pointer argument can be used by dyld to indicate it has unmapped
997  * the shared region. We will remove the shared_region reference from the task.
998  *
999  * ERROR VALUES
1000  * EINVAL	no shared region
1001  * ENOMEM	shared region is empty
1002  * EFAULT	bad address for "start_address"
1003  */
1004 int
1005 shared_region_check_np(
1006 	__unused struct proc                    *p,
1007 	struct shared_region_check_np_args      *uap,
1008 	__unused int                            *retvalp)
1009 {
1010 	vm_shared_region_t      shared_region;
1011 	mach_vm_offset_t        start_address = 0;
1012 	int                     error = 0;
1013 	kern_return_t           kr = KERN_FAILURE;
1014 	task_t                  task = current_task();
1015 
1016 	SHARED_REGION_TRACE_DEBUG(
1017 		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
1018 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1019 		proc_getpid(p), p->p_comm,
1020 		(uint64_t)uap->start_address));
1021 
1022 	/*
1023 	 * Special value of start_address used to indicate that map_with_linking() should
1024 	 * no longer be allowed in this process
1025 	 */
1026 	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
1027 		p->p_disallow_map_with_linking = TRUE;
1028 		return 0;
1029 	}
1030 
1031 	/* retrieve the current task's shared region */
1032 	shared_region = vm_shared_region_get(task);
1033 	if (shared_region != NULL) {
1034 		/*
1035 		 * A NULL argument is used by dyld to indicate the task
1036 		 * has unmapped its shared region.
1037 		 */
1038 		if (uap->start_address == 0) {
1039 			/* unmap it first */
1040 			vm_shared_region_remove(task, shared_region);
1041 			vm_shared_region_set(task, NULL);
1042 		} else {
1043 			/* retrieve address of its first mapping... */
1044 			kr = vm_shared_region_start_address(shared_region, &start_address);
1045 			if (kr != KERN_SUCCESS) {
1046 				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1047 				    "check_np(0x%llx) "
1048 				    "vm_shared_region_start_address() returned 0x%x\n",
1049 				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1050 				    proc_getpid(p), p->p_comm,
1051 				    (uint64_t)uap->start_address, kr));
1052 				error = ENOMEM;
1053 			}
1054 			if (error == 0) {
1055 				/* Insert the shared region submap and various bits of debug info into the task. */
1056 				kr = vm_shared_region_update_task(task, shared_region, start_address);
1057 				if (kr != KERN_SUCCESS) {
1058 					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1059 					    "check_np(0x%llx) "
1060 					    "vm_shared_update_task() returned 0x%x\n",
1061 					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1062 					    proc_getpid(p), p->p_comm,
1063 					    (uint64_t)uap->start_address, kr));
1064 
1065 					error = ENOMEM;
1066 				}
1067 			}
1068 #if __has_feature(ptrauth_calls)
1069 			/*
1070 			 * Remap any section of the shared library that
1071 			 * has authenticated pointers into private memory.
1072 			 */
1073 			if ((error == 0) && (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS)) {
1074 				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1075 				    "check_np(0x%llx) "
1076 				    "vm_shared_region_auth_remap() failed\n",
1077 				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1078 				    proc_getpid(p), p->p_comm,
1079 				    (uint64_t)uap->start_address));
1080 				error = ENOMEM;
1081 			}
1082 #endif /* __has_feature(ptrauth_calls) */
1083 			/* Give the start address to the caller */
1084 			if (error == 0) {
1085 				error = copyout(&start_address,
1086 				    (user_addr_t) uap->start_address,
1087 				    sizeof(start_address));
1088 				if (error != 0) {
1089 					SHARED_REGION_TRACE_ERROR(
1090 						("shared_region: %p [%d(%s)] "
1091 						"check_np(0x%llx) "
1092 						"copyout(0x%llx) error %d\n",
1093 						(void *)VM_KERNEL_ADDRPERM(current_thread()),
1094 						proc_getpid(p), p->p_comm,
1095 						(uint64_t)uap->start_address, (uint64_t)start_address,
1096 						error));
1097 				}
1098 			}
1099 		}
1100 		vm_shared_region_deallocate(shared_region);
1101 	} else {
1102 		/* no shared region! */
1103 		error = EINVAL;
1104 	}
1105 
1106 	SHARED_REGION_TRACE_DEBUG(
1107 		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
1108 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1109 		proc_getpid(p), p->p_comm,
1110 		(uint64_t)uap->start_address, (uint64_t)start_address, error));
1111 
1112 	return error;
1113 }
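
/*
 * Example (hedged sketch, not dyld's actual code): the expected calling
 * pattern is to pass the address of a local where the kernel can copyout
 * the shared region's base address:
 *
 *	uint64_t sr_base = 0;
 *	if (__shared_region_check_np(&sr_base) == 0) {
 *		// inspect the shared cache header mapped at sr_base
 *	}
 *
 * (__shared_region_check_np() is the Libsyscall stub; a zero/NULL argument
 * instead tells the kernel the process has unmapped its shared region.)
 */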
1114 
1115 
1116 static int
1117 shared_region_copyin(
1118 	struct proc  *p,
1119 	user_addr_t  user_addr,
1120 	unsigned int count,
1121 	unsigned int element_size,
1122 	void         *kernel_data)
1123 {
1124 	int             error = 0;
1125 	vm_size_t       size = count * element_size;
1126 
1127 	error = copyin(user_addr, kernel_data, size);
1128 	if (error) {
1129 		SHARED_REGION_TRACE_ERROR(
1130 			("shared_region: %p [%d(%s)] map(): "
1131 			"copyin(0x%llx, %ld) failed (error=%d)\n",
1132 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1133 			proc_getpid(p), p->p_comm,
1134 			(uint64_t)user_addr, (long)size, error));
1135 	}
1136 	return error;
1137 }
1138 
1139 /*
1140  * A reasonable upper limit to prevent overflow of allocation/copyin.
1141  */
1142 #define _SR_FILE_MAPPINGS_MAX_FILES 256
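
/*
 * Worked bound (sketch; exact structure sizes are an assumption): capping
 * files_count at 256 keeps the kalloc_type() of _sr_file_mappings below to
 * a few tens of kilobytes at most, and guarantees that the
 * count * element_size multiplication in shared_region_copyin() cannot
 * overflow vm_size_t for any sane element size.
 */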
1143 
1144 /* forward declaration */
1145 __attribute__((noinline))
1146 static void shared_region_map_and_slide_cleanup(
1147 	struct proc              *p,
1148 	uint32_t                 files_count,
1149 	struct _sr_file_mappings *sr_file_mappings,
1150 	struct vm_shared_region  *shared_region);
1151 
1152 /*
1153  * Setup part of _shared_region_map_and_slide().
1154  * It had to be broken out of _shared_region_map_and_slide() to
1155  * prevent compiler inlining from blowing out the stack.
1156  */
1157 __attribute__((noinline))
1158 static int
1159 shared_region_map_and_slide_setup(
1160 	struct proc                         *p,
1161 	uint32_t                            files_count,
1162 	struct shared_file_np               *files,
1163 	uint32_t                            mappings_count,
1164 	struct shared_file_mapping_slide_np *mappings,
1165 	struct _sr_file_mappings            **sr_file_mappings,
1166 	struct vm_shared_region             **shared_region_ptr,
1167 	struct vnode                        *rdir_vp)
1168 {
1169 	int                             error = 0;
1170 	struct _sr_file_mappings        *srfmp;
1171 	uint32_t                        mappings_next;
1172 	struct vnode_attr               va;
1173 	off_t                           fs;
1174 #if CONFIG_MACF
1175 	vm_prot_t                       maxprot = VM_PROT_ALL;
1176 #endif
1177 	uint32_t                        i;
1178 	struct vm_shared_region         *shared_region = NULL;
1179 	boolean_t                       is_driverkit = task_is_driver(current_task());
1180 
1181 	SHARED_REGION_TRACE_DEBUG(
1182 		("shared_region: %p [%d(%s)] -> map_and_slide_setup\n",
1183 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1184 		proc_getpid(p), p->p_comm));
1185 
1186 	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
1187 		error = E2BIG;
1188 		goto done;
1189 	}
1190 	if (files_count == 0) {
1191 		error = EINVAL;
1192 		goto done;
1193 	}
1194 	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
1195 	    Z_WAITOK | Z_ZERO);
1196 	if (*sr_file_mappings == NULL) {
1197 		error = ENOMEM;
1198 		goto done;
1199 	}
1200 	mappings_next = 0;
1201 	for (i = 0; i < files_count; i++) {
1202 		srfmp = &(*sr_file_mappings)[i];
1203 		srfmp->fd = files[i].sf_fd;
1204 		srfmp->mappings_count = files[i].sf_mappings_count;
1205 		srfmp->mappings = &mappings[mappings_next];
1206 		mappings_next += srfmp->mappings_count;
1207 		if (mappings_next > mappings_count) {
1208 			error = EINVAL;
1209 			goto done;
1210 		}
1211 		srfmp->slide = files[i].sf_slide;
1212 	}
1213 
1214 	/* get the process's shared region (setup in vm_map_exec()) */
1215 	shared_region = vm_shared_region_get(current_task());
1216 	*shared_region_ptr = shared_region;
1217 	if (shared_region == NULL) {
1218 		SHARED_REGION_TRACE_ERROR(
1219 			("shared_region: %p [%d(%s)] map(): "
1220 			"no shared region\n",
1221 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1222 			proc_getpid(p), p->p_comm));
1223 		error = EINVAL;
1224 		goto done;
1225 	}
1226 
1227 	/*
1228 	 * Check that the shared region matches the current root
1229 	 * directory of this process.  If not, deny the mapping, to
1230 	 * avoid tainting the shared region with something that
1231 	 * doesn't belong in it.
1232 	 */
1233 	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
1234 	if (sr_vnode != NULL ?  rdir_vp != sr_vnode : rdir_vp != rootvnode) {
1235 		SHARED_REGION_TRACE_ERROR(
1236 			("shared_region: map(%p) root_dir mismatch\n",
1237 			(void *)VM_KERNEL_ADDRPERM(current_thread())));
1238 		error = EPERM;
1239 		goto done;
1240 	}
1241 
1242 
1243 	for (srfmp = &(*sr_file_mappings)[0];
1244 	    srfmp < &(*sr_file_mappings)[files_count];
1245 	    srfmp++) {
1246 		if (srfmp->mappings_count == 0) {
1247 			/* no mappings here... */
1248 			continue;
1249 		}
1250 
1251 		/*
1252 		 * A file descriptor of -1 is used to indicate that the data
1253 		 * to be put in the shared region for this mapping comes directly
1254 		 * from the process's address space. Ensure we have proper alignment.
1255 		 */
1256 		if (srfmp->fd == -1) {
1257 			/* only allow one mapping per fd */
1258 			if (srfmp->mappings_count > 1) {
1259 				SHARED_REGION_TRACE_ERROR(
1260 					("shared_region: %p [%d(%s)] map data >1 mapping\n",
1261 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1262 					proc_getpid(p), p->p_comm));
1263 				error = EINVAL;
1264 				goto done;
1265 			}
1266 
1267 			/*
1268 			 * The destination address and size must be page aligned.
1269 			 */
1270 			struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
1271 			mach_vm_address_t dest_addr = mapping->sms_address;
1272 			mach_vm_size_t    map_size = mapping->sms_size;
1273 			if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
1274 				SHARED_REGION_TRACE_ERROR(
1275 					("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
1276 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1277 					proc_getpid(p), p->p_comm, dest_addr));
1278 				error = EINVAL;
1279 				goto done;
1280 			}
1281 			if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
1282 				SHARED_REGION_TRACE_ERROR(
1283 					("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
1284 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1285 					proc_getpid(p), p->p_comm, map_size));
1286 				error = EINVAL;
1287 				goto done;
1288 			}
1289 			continue;
1290 		}
1291 
1292 		/* get file structure from file descriptor */
1293 		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
1294 		if (error) {
1295 			SHARED_REGION_TRACE_ERROR(
1296 				("shared_region: %p [%d(%s)] map: "
1297 				"fd=%d lookup failed (error=%d)\n",
1298 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1299 				proc_getpid(p), p->p_comm, srfmp->fd, error));
1300 			goto done;
1301 		}
1302 
1303 		/* we need at least read permission on the file */
1304 		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
1305 			SHARED_REGION_TRACE_ERROR(
1306 				("shared_region: %p [%d(%s)] map: "
1307 				"fd=%d not readable\n",
1308 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1309 				proc_getpid(p), p->p_comm, srfmp->fd));
1310 			error = EPERM;
1311 			goto done;
1312 		}
1313 
1314 		/* get vnode from file structure */
1315 		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
1316 		if (error) {
1317 			SHARED_REGION_TRACE_ERROR(
1318 				("shared_region: %p [%d(%s)] map: "
1319 				"fd=%d getwithref failed (error=%d)\n",
1320 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1321 				proc_getpid(p), p->p_comm, srfmp->fd, error));
1322 			goto done;
1323 		}
1324 		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);
1325 
1326 		/* make sure the vnode is a regular file */
1327 		if (srfmp->vp->v_type != VREG) {
1328 			SHARED_REGION_TRACE_ERROR(
1329 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1330 				"not a file (type=%d)\n",
1331 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1332 				proc_getpid(p), p->p_comm,
1333 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1334 				srfmp->vp->v_name, srfmp->vp->v_type));
1335 			error = EINVAL;
1336 			goto done;
1337 		}
1338 
1339 #if CONFIG_MACF
1340 		/* pass in 0 for the offset argument because AMFI does not need the offset
1341 		 * of the shared cache */
1342 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
1343 		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
1344 		if (error) {
1345 			goto done;
1346 		}
1347 #endif /* MAC */
1348 
1349 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1350 		/*
1351 		 * Check if the shared cache is in the trust cache;
1352 		 * if so, we can skip the root ownership check.
1353 		 */
1354 #if DEVELOPMENT || DEBUG
1355 		/*
1356 		 * Skip both root ownership and trust cache check if
1357 		 * enforcement is disabled.
1358 		 */
1359 		if (!cs_system_enforcement()) {
1360 			goto after_root_check;
1361 		}
1362 #endif /* DEVELOPMENT || DEBUG */
1363 		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
1364 		if (blob == NULL) {
1365 			SHARED_REGION_TRACE_ERROR(
1366 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1367 				"missing CS blob\n",
1368 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1369 				proc_getpid(p), p->p_comm,
1370 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1371 				srfmp->vp->v_name));
1372 			goto root_check;
1373 		}
1374 		const uint8_t *cdhash = csblob_get_cdhash(blob);
1375 		if (cdhash == NULL) {
1376 			SHARED_REGION_TRACE_ERROR(
1377 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1378 				"missing cdhash\n",
1379 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1380 				proc_getpid(p), p->p_comm,
1381 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1382 				srfmp->vp->v_name));
1383 			goto root_check;
1384 		}
1385 
1386 		bool in_trust_cache = false;
1387 		TrustCacheQueryToken_t qt;
1388 		if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
1389 			TCType_t tc_type = kTCTypeInvalid;
1390 			TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
1391 			in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
1392 			    (tc_type == kTCTypeCryptex1BootOS ||
1393 			    tc_type == kTCTypeStatic ||
1394 			    tc_type == kTCTypeEngineering));
1395 		}
1396 		if (!in_trust_cache) {
1397 			SHARED_REGION_TRACE_ERROR(
1398 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1399 				"not in trust cache\n",
1400 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1401 				proc_getpid(p), p->p_comm,
1402 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1403 				srfmp->vp->v_name));
1404 			goto root_check;
1405 		}
1406 		goto after_root_check;
1407 root_check:
1408 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1409 
1410 		/* The shared cache file must be owned by root */
1411 		VATTR_INIT(&va);
1412 		VATTR_WANTED(&va, va_uid);
1413 		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1414 		if (error) {
1415 			SHARED_REGION_TRACE_ERROR(
1416 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1417 				"vnode_getattr(%p) failed (error=%d)\n",
1418 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1419 				proc_getpid(p), p->p_comm,
1420 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1421 				srfmp->vp->v_name,
1422 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1423 				error));
1424 			goto done;
1425 		}
1426 		if (va.va_uid != 0) {
1427 			SHARED_REGION_TRACE_ERROR(
1428 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1429 				"owned by uid=%d instead of 0\n",
1430 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1431 				proc_getpid(p), p->p_comm,
1432 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1433 				srfmp->vp->v_name, va.va_uid));
1434 			error = EPERM;
1435 			goto done;
1436 		}
1437 
1438 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1439 after_root_check:
1440 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1441 
1442 #if CONFIG_CSR
1443 		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
1444 			VATTR_INIT(&va);
1445 			VATTR_WANTED(&va, va_flags);
1446 			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1447 			if (error) {
1448 				SHARED_REGION_TRACE_ERROR(
1449 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1450 					"vnode_getattr(%p) failed (error=%d)\n",
1451 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1452 					proc_getpid(p), p->p_comm,
1453 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1454 					srfmp->vp->v_name,
1455 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1456 					error));
1457 				goto done;
1458 			}
1459 
1460 			if (!(va.va_flags & SF_RESTRICTED)) {
1461 				/*
1462 				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
1463 				 * the shared cache file is NOT SIP-protected, so reject the
1464 				 * mapping request
1465 				 */
1466 				SHARED_REGION_TRACE_ERROR(
1467 					("shared_region: %p [%d(%s)] map(%p:'%s'), "
1468 					"vnode is not SIP-protected. \n",
1469 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1470 					proc_getpid(p), p->p_comm,
1471 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1472 					srfmp->vp->v_name));
1473 				error = EPERM;
1474 				goto done;
1475 			}
1476 		}
1477 #else /* CONFIG_CSR */
1478 
1479 		/*
1480 		 * Devices without SIP/ROSP need to make sure that the shared cache
1481 		 * is either on the root volume or in the preboot cryptex volume.
1482 		 */
1483 		assert(rdir_vp != NULL);
1484 		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
1485 			vnode_t preboot_vp = NULL;
1486 #if XNU_TARGET_OS_OSX
1487 #define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
1488 #else
1489 #define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
1490 #endif
1491 			error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
1492 			if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
1493 				SHARED_REGION_TRACE_ERROR(
1494 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1495 					"not on process' root volume nor preboot volume\n",
1496 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1497 					proc_getpid(p), p->p_comm,
1498 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1499 					srfmp->vp->v_name));
1500 				error = EPERM;
1501 				if (preboot_vp) {
1502 					(void)vnode_put(preboot_vp);
1503 				}
1504 				goto done;
1505 			} else if (preboot_vp) {
1506 				(void)vnode_put(preboot_vp);
1507 			}
1508 		}
1509 #endif /* CONFIG_CSR */
1510 
1511 		if (scdir_enforce) {
1512 			char **expected_scdir_path;
1513 			struct vnode *scdir_vp = NULL;
1514 			for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1515 			    *expected_scdir_path != NULL;
1516 			    expected_scdir_path++) {
1517 				/* get vnode for expected_scdir_path */
1518 				error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
1519 				if (error) {
1520 					SHARED_REGION_TRACE_ERROR(
1521 						("shared_region: %p [%d(%s)]: "
1522 						"vnode_lookup(%s) failed (error=%d)\n",
1523 						(void *)VM_KERNEL_ADDRPERM(current_thread()),
1524 						proc_getpid(p), p->p_comm,
1525 						*expected_scdir_path, error));
1526 					continue;
1527 				}
1528 
1529 				/* check if parent is scdir_vp */
1530 				assert(scdir_vp != NULL);
1531 				if (vnode_parent(srfmp->vp) == scdir_vp) {
1532 					(void)vnode_put(scdir_vp);
1533 					scdir_vp = NULL;
1534 					goto scdir_ok;
1535 				}
1536 				(void)vnode_put(scdir_vp);
1537 				scdir_vp = NULL;
1538 			}
1539 			/* nothing matches */
1540 			SHARED_REGION_TRACE_ERROR(
1541 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1542 				"shared cache file not in expected directory\n",
1543 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1544 				proc_getpid(p), p->p_comm,
1545 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1546 				srfmp->vp->v_name));
1547 			error = EPERM;
1548 			goto done;
1549 		}
1550 scdir_ok:
1551 
1552 		/* get vnode size */
1553 		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
1554 		if (error) {
1555 			SHARED_REGION_TRACE_ERROR(
1556 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1557 				"vnode_size(%p) failed (error=%d)\n",
1558 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1559 				proc_getpid(p), p->p_comm,
1560 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1561 				srfmp->vp->v_name,
1562 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
1563 			goto done;
1564 		}
1565 		srfmp->file_size = fs;
1566 
1567 		/* get the file's memory object handle */
1568 		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
1569 		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
1570 			SHARED_REGION_TRACE_ERROR(
1571 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1572 				"no memory object\n",
1573 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1574 				proc_getpid(p), p->p_comm,
1575 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1576 				srfmp->vp->v_name));
1577 			error = EINVAL;
1578 			goto done;
1579 		}
1580 
1581 		/* check that the mappings are properly covered by code signatures */
1582 		if (!cs_system_enforcement()) {
1583 			/* code signing is not enforced: no need to check */
1584 		} else {
1585 			for (i = 0; i < srfmp->mappings_count; i++) {
1586 				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
1587 					/* zero-filled mapping: not backed by the file */
1588 					continue;
1589 				}
1590 				if (ubc_cs_is_range_codesigned(srfmp->vp,
1591 				    srfmp->mappings[i].sms_file_offset,
1592 				    srfmp->mappings[i].sms_size)) {
1593 					/* this mapping is fully covered by code signatures */
1594 					continue;
1595 				}
1596 				SHARED_REGION_TRACE_ERROR(
1597 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1598 					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
1599 					"is not code-signed\n",
1600 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1601 					proc_getpid(p), p->p_comm,
1602 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1603 					srfmp->vp->v_name,
1604 					i, srfmp->mappings_count,
1605 					srfmp->mappings[i].sms_address,
1606 					srfmp->mappings[i].sms_size,
1607 					srfmp->mappings[i].sms_file_offset,
1608 					srfmp->mappings[i].sms_max_prot,
1609 					srfmp->mappings[i].sms_init_prot));
1610 				error = EINVAL;
1611 				goto done;
1612 			}
1613 		}
1614 	}
1615 done:
1616 	if (error != 0) {
1617 		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
1618 		*sr_file_mappings = NULL;
1619 		*shared_region_ptr = NULL;
1620 	}
1621 	SHARED_REGION_TRACE_DEBUG(
1622 		("shared_region: %p [%d(%s)] map_and_slide_setup <- %d\n",
1623 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1624 		proc_getpid(p), p->p_comm, error));
1625 	return error;
1626 }
1627 
1628 /*
1629  * shared_region_map_np()
1630  *
1631  * This system call is intended for dyld.
1632  *
1633  * dyld uses this to map a shared cache file into a shared region.
1634  * This is usually done only the first time a shared cache is needed.
1635  * Subsequent processes will just use the populated shared region without
1636  * requiring any further setup.
1637  */
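/*
 * Flow sketch (illustrative summary of the code below): _shared_region_map_and_slide()
 * pins the process root vnode, turns the user-supplied files/mappings into
 * _sr_file_mappings via shared_region_map_and_slide_setup(), hands those to
 * vm_shared_region_map_file(), converts the resulting kern_return_t to an
 * errno, and releases everything through shared_region_map_and_slide_cleanup().
 */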
1638 static int
1639 _shared_region_map_and_slide(
1640 	struct proc                         *p,
1641 	uint32_t                            files_count,
1642 	struct shared_file_np               *files,
1643 	uint32_t                            mappings_count,
1644 	struct shared_file_mapping_slide_np *mappings)
1645 {
1646 	int                             error = 0;
1647 	kern_return_t                   kr = KERN_SUCCESS;
1648 	struct _sr_file_mappings        *sr_file_mappings = NULL;
1649 	struct vnode                    *rdir_vp = NULL;
1650 	struct vm_shared_region         *shared_region = NULL;
1651 
1652 	/*
1653 	 * Get a reference to the current proc's root dir.
1654 	 * Need this to prevent racing with chroot.
1655 	 */
1656 	proc_fdlock(p);
1657 	rdir_vp = p->p_fd.fd_rdir;
1658 	if (rdir_vp == NULL) {
1659 		rdir_vp = rootvnode;
1660 	}
1661 	assert(rdir_vp != NULL);
1662 	vnode_get(rdir_vp);
1663 	proc_fdunlock(p);
1664 
1665 	/*
1666 	 * Turn files, mappings into sr_file_mappings and other setup.
1667 	 */
1668 	error = shared_region_map_and_slide_setup(p, files_count,
1669 	    files, mappings_count, mappings,
1670 	    &sr_file_mappings, &shared_region, rdir_vp);
1671 	if (error != 0) {
1672 		vnode_put(rdir_vp);
1673 		return error;
1674 	}
1675 
1676 	/* map the file(s) into that shared region's submap */
1677 	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
1678 	if (kr != KERN_SUCCESS) {
1679 		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
1680 		    "vm_shared_region_map_file() failed kr=0x%x\n",
1681 		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1682 		    proc_getpid(p), p->p_comm, kr));
1683 	}
1684 
1685 	/* convert kern_return_t to errno */
1686 	switch (kr) {
1687 	case KERN_SUCCESS:
1688 		error = 0;
1689 		break;
1690 	case KERN_INVALID_ADDRESS:
1691 		error = EFAULT;
1692 		break;
1693 	case KERN_PROTECTION_FAILURE:
1694 		error = EPERM;
1695 		break;
1696 	case KERN_NO_SPACE:
1697 		error = ENOMEM;
1698 		break;
1699 	case KERN_FAILURE:
1700 	case KERN_INVALID_ARGUMENT:
1701 	default:
1702 		error = EINVAL;
1703 		break;
1704 	}
1705 
1706 	/*
1707 	 * Mark that this process is now using split libraries.
1708 	 */
1709 	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
1710 		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
1711 	}
1712 
1713 	vnode_put(rdir_vp);
1714 	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);
1715 
1716 	SHARED_REGION_TRACE_DEBUG(
1717 		("shared_region: %p [%d(%s)] <- map\n",
1718 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1719 		proc_getpid(p), p->p_comm));
1720 
1721 	return error;
1722 }
1723 
1724 /*
1725  * Clean up part of _shared_region_map_and_slide()
1726  * It had to be broken out of _shared_region_map_and_slide() to
1727  * prevent compiler inlining from blowing out the stack.
1728  */
1729 __attribute__((noinline))
1730 static void
1731 shared_region_map_and_slide_cleanup(
1732 	struct proc              *p,
1733 	uint32_t                 files_count,
1734 	struct _sr_file_mappings *sr_file_mappings,
1735 	struct vm_shared_region  *shared_region)
1736 {
1737 	struct _sr_file_mappings *srfmp;
1738 	struct vnode_attr        va;
1739 
1740 	if (sr_file_mappings != NULL) {
1741 		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1742 			if (srfmp->vp != NULL) {
1743 				vnode_lock_spin(srfmp->vp);
1744 				srfmp->vp->v_flag |= VSHARED_DYLD;
1745 				vnode_unlock(srfmp->vp);
1746 
1747 				/* update the vnode's access time */
1748 				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1749 					VATTR_INIT(&va);
1750 					nanotime(&va.va_access_time);
1751 					VATTR_SET_ACTIVE(&va, va_access_time);
1752 					vnode_setattr(srfmp->vp, &va, vfs_context_current());
1753 				}
1754 
1755 #if NAMEDSTREAMS
1756 				/*
1757 				 * If the shared cache is compressed, it may
1758 				 * have a namedstream vnode instantiated
1759 				 * for it. That namedstream vnode will also
1760 				 * have to be marked with VSHARED_DYLD.
1761 				 */
1762 				if (vnode_hasnamedstreams(srfmp->vp)) {
1763 					vnode_t svp;
1764 					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1765 					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
1766 						vnode_lock_spin(svp);
1767 						svp->v_flag |= VSHARED_DYLD;
1768 						vnode_unlock(svp);
1769 						vnode_put(svp);
1770 					}
1771 				}
1772 #endif /* NAMEDSTREAMS */
1773 				/*
1774 				 * release the vnode...
1775 				 * ubc_map() still holds it for us in the non-error case
1776 				 */
1777 				(void) vnode_put(srfmp->vp);
1778 				srfmp->vp = NULL;
1779 			}
1780 			if (srfmp->fp != NULL) {
1781 				/* release the file descriptor */
1782 				fp_drop(p, srfmp->fd, srfmp->fp, 0);
1783 				srfmp->fp = NULL;
1784 			}
1785 		}
1786 		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1787 	}
1788 
1789 	if (shared_region != NULL) {
1790 		vm_shared_region_deallocate(shared_region);
1791 	}
1792 }
1793 
1794 /*
1795  * For each file mapped, we may have mappings for:
1796  *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1797  * so let's round up to 8 mappings per file.
1798  */
1799 #define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to be passed in */
1800 
1801 /*
1802  * This is the new interface for setting up shared region mappings.
1803  *
1804  * The slide used for shared regions setup using this interface is done differently
1805  * from the old interface. The slide value passed in the shared_files_np represents
1806  * a max value. The kernel will choose a random value based on that, then use it
1807  * for all shared regions.
1808  */
1809 #if defined (__x86_64__)
1810 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1811 #else
1812 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1813 #endif
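/*
 * Worked example (illustrative, assuming SIXTEENK_PAGE_MASK == 0x3FFF): on a
 * 16K-page arm64 device the mask is ~0x3FFF, so with random_val = 0x12345678
 * and max_slide = 0x4000000 the slide selection below computes
 * 0x12345678 % 0x4000000 = 0x02345678, which the mask truncates to the
 * page-aligned slide 0x02344000.
 */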
1814 
1815 static inline __result_use_check kern_return_t
1816 shared_region_map_and_slide_2_np_sanitize(
1817 	struct proc                         *p,
1818 	user_addr_t                         mappings_userspace_addr,
1819 	unsigned int                        count,
1820 	shared_file_mapping_slide_np_t      *mappings)
1821 {
1822 	kern_return_t kr;
1823 	vm_map_t map = current_map();
1824 	mach_vm_address_t addr, end;
1825 	mach_vm_offset_t offset, offset_end;
1826 	mach_vm_size_t size, offset_size;
1827 	user_addr_t slide_start, slide_end, slide_size;
1828 	vm_prot_t cur;
1829 	vm_prot_t max;
1830 
1831 	user_addr_t user_addr = mappings_userspace_addr;
1832 
1833 	for (size_t i = 0; i < count; i++) {
1834 		shared_file_mapping_slide_np_ut mapping_u;
1835 		/*
1836 		 * First we bring each mapping struct into our kernel stack to
1837 		 * avoid TOCTOU.
1838 		 */
1839 		kr = shared_region_copyin(
1840 			p,
1841 			user_addr,
1842 			1, // copy 1 element at a time
1843 			sizeof(shared_file_mapping_slide_np_ut),
1844 			&mapping_u);
1845 		if (__improbable(kr != KERN_SUCCESS)) {
1846 			return kr;
1847 		}
1848 
1849 		/*
1850 		 * Then, we sanitize the data on the kernel stack.
1851 		 */
1852 		kr = vm_sanitize_addr_size(
1853 			mapping_u.sms_address_u,
1854 			mapping_u.sms_size_u,
1855 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1856 			map,
1857 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1858 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
1859 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
1860 			&addr,
1861 			&end,
1862 			&size);
1863 		if (__improbable(kr != KERN_SUCCESS)) {
1864 			return kr;
1865 		}
1866 
1867 		kr = vm_sanitize_addr_size(
1868 			mapping_u.sms_file_offset_u,
1869 			mapping_u.sms_size_u,
1870 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1871 			PAGE_MASK,
1872 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1873 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1874 			&offset,
1875 			&offset_end,
1876 			&offset_size);
1877 		if (__improbable(kr != KERN_SUCCESS)) {
1878 			return kr;
1879 		}
1880 		if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
1881 			return KERN_INVALID_ARGUMENT;
1882 		}
1883 
1884 		/*
1885 		 * The unsafe address value is unwrapped and immediately
1886 		 * re-wrapped as a size before being sanitized below.
1887 		 */
1888 		mach_vm_size_ut sms_slide_size_u =
1889 		    vm_sanitize_wrap_size(
1890 			VM_SANITIZE_UNSAFE_UNWRAP(
1891 				mapping_u.sms_slide_size_u));
1892 
1893 		kr = vm_sanitize_addr_size(
1894 			mapping_u.sms_slide_start_u,
1895 			sms_slide_size_u,
1896 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1897 			map,
1898 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1899 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1900 			&slide_start,
1901 			&slide_end,
1902 			&slide_size);
1903 		if (__improbable(kr != KERN_SUCCESS)) {
1904 			return kr;
1905 		}
1906 
1907 		kr = vm_sanitize_cur_and_max_prots(
1908 			mapping_u.sms_init_prot_u,
1909 			mapping_u.sms_max_prot_u,
1910 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1911 			map,
1912 			VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
1913 			&cur,
1914 			&max);
1915 		if (__improbable(kr != KERN_SUCCESS)) {
1916 			return kr;
1917 		}
1918 
1919 		/*
1920 		 * Finally, we move the data from the kernel stack to our
1921 		 * caller-allocated kernel heap buffer.
1922 		 */
1923 		mappings[i].sms_address = addr;
1924 		mappings[i].sms_size = size;
1925 		mappings[i].sms_file_offset = offset;
1926 		mappings[i].sms_slide_size = slide_size;
1927 		mappings[i].sms_slide_start = slide_start;
1928 		mappings[i].sms_max_prot = max;
1929 		mappings[i].sms_init_prot = cur;
1930 
1931 		if (__improbable(os_add_overflow(
1932 			    user_addr,
1933 			    sizeof(shared_file_mapping_slide_np_ut),
1934 			    &user_addr))) {
1935 			return KERN_INVALID_ARGUMENT;
1936 		}
1937 	}
1938 
1939 	return KERN_SUCCESS;
1940 }
1941 
1942 int
1943 shared_region_map_and_slide_2_np(
1944 	struct proc                                  *p,
1945 	struct shared_region_map_and_slide_2_np_args *uap,
1946 	__unused int                                 *retvalp)
1947 {
1948 	unsigned int                  files_count;
1949 	struct shared_file_np         *shared_files = NULL;
1950 	unsigned int                  mappings_count;
1951 	struct shared_file_mapping_slide_np *mappings = NULL;
1952 	kern_return_t                 kr = KERN_SUCCESS;
1953 
1954 	files_count = uap->files_count;
1955 	mappings_count = uap->mappings_count;
1956 
1957 	SHARED_REGION_TRACE_DEBUG(
1958 		("shared_region: %p [%d(%s)] -> map_and_slide(0x%llx)\n",
1959 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1960 		proc_getpid(p), p->p_comm,
1961 		(uint64_t)uap->mappings_u));
1962 
1963 	if (files_count == 0) {
1964 		SHARED_REGION_TRACE_INFO(
1965 			("shared_region: %p [%d(%s)] map(): "
1966 			"no files\n",
1967 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1968 			proc_getpid(p), p->p_comm));
1969 		kr = 0; /* no files to map: we're done! */
1970 		goto done;
1971 	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1972 		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1973 		if (shared_files == NULL) {
1974 			kr = KERN_RESOURCE_SHORTAGE;
1975 			goto done;
1976 		}
1977 	} else {
1978 		SHARED_REGION_TRACE_ERROR(
1979 			("shared_region: %p [%d(%s)] map(): "
1980 			"too many files (%d) max %d\n",
1981 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1982 			proc_getpid(p), p->p_comm,
1983 			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1984 		kr = KERN_FAILURE;
1985 		goto done;
1986 	}
1987 
1988 	if (mappings_count == 0) {
1989 		SHARED_REGION_TRACE_INFO(
1990 			("shared_region: %p [%d(%s)] map(): "
1991 			"no mappings\n",
1992 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1993 			proc_getpid(p), p->p_comm));
1994 		kr = 0; /* no mappings: we're done! */
1995 		goto done;
1996 	} else if (mappings_count <= SFM_MAX) {
1997 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
1998 		if (mappings == NULL) {
1999 			kr = KERN_RESOURCE_SHORTAGE;
2000 			goto done;
2001 		}
2002 	} else {
2003 		SHARED_REGION_TRACE_ERROR(
2004 			("shared_region: %p [%d(%s)] map(): "
2005 			"too many mappings (%d) max %d\n",
2006 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2007 			proc_getpid(p), p->p_comm,
2008 			mappings_count, SFM_MAX));
2009 		kr = KERN_FAILURE;
2010 		goto done;
2011 	}
2012 
2013 	/*
2014 	 * struct shared_file_np does not have fields that are subject to
2015 	 * sanitization, so it is copied from userspace as-is.
2016 	 */
2017 	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
2018 	if (kr != KERN_SUCCESS) {
2019 		SHARED_REGION_TRACE_ERROR(
2020 			("shared_region: %p [%d(%s)] copyin() returned 0x%x\n",
2021 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2022 			proc_getpid(p), p->p_comm, kr));
2023 		goto done;
2024 	}
2025 
2026 	kr = shared_region_map_and_slide_2_np_sanitize(
2027 		p,
2028 		uap->mappings_u,
2029 		mappings_count,
2030 		mappings);
2031 	if (__improbable(kr != KERN_SUCCESS)) {
2032 		SHARED_REGION_TRACE_ERROR(
2033 			("shared_region: %p [%d(%s)] sanitize() returned 0x%x\n",
2034 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2035 			proc_getpid(p), p->p_comm, kr));
2036 		kr = vm_sanitize_get_kr(kr);
2037 		goto done;
2038 	}
2039 
2040 	uint32_t max_slide = shared_files[0].sf_slide;
2041 	uint32_t random_val;
2042 	uint32_t slide_amount;
2043 
2044 	if (max_slide != 0) {
2045 		read_random(&random_val, sizeof random_val);
2046 		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
2047 	} else {
2048 		slide_amount = 0;
2049 	}
2050 #if DEVELOPMENT || DEBUG
2051 	extern bool bootarg_disable_aslr;
2052 	if (bootarg_disable_aslr) {
2053 		slide_amount = 0;
2054 	}
2055 #endif /* DEVELOPMENT || DEBUG */
2056 
2057 	/*
2058 	 * Fix up the mappings to reflect the desired slide.
2059 	 */
2060 	unsigned int f;
2061 	unsigned int m = 0;
2062 	unsigned int i;
2063 	for (f = 0; f < files_count; ++f) {
2064 		shared_files[f].sf_slide = slide_amount;
2065 		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
2066 			if (m >= mappings_count) {
2067 				SHARED_REGION_TRACE_ERROR(
2068 					("shared_region: %p [%d(%s)] map(): "
2069 					"mapping count argument was too small\n",
2070 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
2071 					proc_getpid(p), p->p_comm));
2072 				kr = KERN_FAILURE;
2073 				goto done;
2074 			}
2075 			if (__improbable(
2076 				    os_add_overflow(
2077 					    mappings[m].sms_address,
2078 					    slide_amount,
2079 					    &mappings[m].sms_address))) {
2080 				kr = KERN_INVALID_ARGUMENT;
2081 				goto done;
2082 			}
2083 			if (mappings[m].sms_slide_size != 0) {
2084 				mach_vm_address_t discard;
2085 				/* Slide and check that new start/size pairs do not overflow. */
2086 				if (__improbable(
2087 					    os_add_overflow(
2088 						    mappings[m].sms_slide_start,
2089 						    slide_amount,
2090 						    &mappings[m].sms_slide_start) ||
2091 					    os_add_overflow(
2092 						    mappings[m].sms_slide_start,
2093 						    mappings[m].sms_slide_size,
2094 						    &discard))) {
2095 					kr = KERN_INVALID_ARGUMENT;
2096 					goto done;
2097 				}
2098 			}
2099 		}
2100 	}
2101 
2102 	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
2103 done:
2104 	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
2105 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
2106 
2107 	SHARED_REGION_TRACE_DEBUG(
2108 		("shared_region: %p [%d(%s)] map_and_slide(0x%llx) <- 0x%x\n",
2109 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
2110 		proc_getpid(p), p->p_comm,
2111 		(uint64_t)uap->mappings_u, kr));
2112 
2113 	return kr;
2114 }
2115 
2116 
2117 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_total_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_total_success, "");
2118 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_total_fail, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_total_fail, "");
2119 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_overflow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_overflow, "");
2120 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_offset, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_offset, "");
2121 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_addr, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_addr, "");
2122 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_prot, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_prot, "");
2123 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_file, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_file, "");
2124 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_shadows, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_shadows, "");
2125 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_cow, "");
2126 
2127 /*
2128  * A syscall for dyld to use to map data pages that need load time relocation fixups.
2129  * The fixups are performed by a custom pager during page-in, so the pages still appear
2130  * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2131  * on demand later, all w/o using the compressor.
2132  *
2133  * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages while
2134  * running, they are COW'd as normal.
2135  */
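/*
 * Layout sketch (illustrative, derived from the validation below): the
 * link_info blob begins with a struct mwl_info_hdr and must satisfy
 *
 *	mwli_binds_offset  + binds_size       <= link_info_size
 *	mwli_chains_offset + mwli_chains_size <= link_info_size
 *
 * where binds_size = mwli_binds_count * (4 or 8) depending on whether
 * mwli_pointer_format is DYLD_CHAINED_PTR_32. Each check is phrased as
 * "size > link_info_size - offset" only after "offset <= link_info_size"
 * has been established, which keeps the unsigned subtraction from wrapping.
 */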
2136 int
2137 map_with_linking_np(
2138 	struct proc                     *p,
2139 	struct map_with_linking_np_args *uap,
2140 	__unused int                    *retvalp)
2141 {
2142 	uint32_t                        region_count;
2143 	uint32_t                        r;
2144 	struct mwl_region               *regions = NULL;
2145 	struct mwl_region               *rp;
2146 	uint32_t                        link_info_size;
2147 	void                            *link_info = NULL;      /* starts with a struct mwl_info_hdr */
2148 	struct mwl_info_hdr             *info_hdr = NULL;
2149 	uint64_t                        binds_size;
2150 	int                             fd;
2151 	struct fileproc                 *fp = NULL;
2152 	struct vnode                    *vp = NULL;
2153 	size_t                          file_size;
2154 	off_t                           fs;
2155 	struct vnode_attr               va;
2156 	memory_object_control_t         file_control = NULL;
2157 	int                             error;
2158 	kern_return_t                   kr = KERN_SUCCESS;
2159 
2160 	/*
2161 	 * Check if dyld has told us it finished with this call.
2162 	 */
2163 	if (p->p_disallow_map_with_linking) {
2164 		printf("%s: [%d(%s)]: map_with_linking() was disabled\n",
2165 		    __func__, proc_getpid(p), p->p_comm);
2166 		kr = KERN_FAILURE;
2167 		goto done;
2168 	}
2169 
2170 	/*
2171 	 * First we do some sanity checking on what dyld has passed us.
2172 	 */
2173 	region_count = uap->region_count;
2174 	link_info_size = uap->link_info_size;
2175 	if (region_count == 0) {
2176 		printf("%s: [%d(%s)]: region_count == 0\n",
2177 		    __func__, proc_getpid(p), p->p_comm);
2178 		kr = KERN_FAILURE;
2179 		goto done;
2180 	}
2181 	if (region_count > MWL_MAX_REGION_COUNT) {
2182 		printf("%s: [%d(%s)]: region_count too big %d\n",
2183 		    __func__, proc_getpid(p), p->p_comm, region_count);
2184 		kr = KERN_FAILURE;
2185 		goto done;
2186 	}
2187 
2188 	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2189 		printf("%s: [%d(%s)]: link_info_size too small\n",
2190 		    __func__, proc_getpid(p), p->p_comm);
2191 		kr = KERN_FAILURE;
2192 		goto done;
2193 	}
2194 	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2195 		printf("%s: [%d(%s)]: link_info_size too big %d\n",
2196 		    __func__, proc_getpid(p), p->p_comm, link_info_size);
2197 		kr = KERN_FAILURE;
2198 		goto done;
2199 	}
2200 
2201 	/*
2202 	 * Allocate and copyin the regions and link info
2203 	 */
2204 	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2205 	if (regions == NULL) {
2206 		printf("%s: [%d(%s)]: failed to allocate regions\n",
2207 		    __func__, proc_getpid(p), p->p_comm);
2208 		kr = KERN_RESOURCE_SHORTAGE;
2209 		goto done;
2210 	}
2211 	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2212 	if (kr != KERN_SUCCESS) {
2213 		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2214 		    __func__, proc_getpid(p), p->p_comm, kr);
2215 		goto done;
2216 	}
2217 
2218 	link_info = kalloc_data(link_info_size, Z_WAITOK);
2219 	if (link_info == NULL) {
2220 		printf("%s: [%d(%s)]: failed to allocate link_info\n",
2221 		    __func__, proc_getpid(p), p->p_comm);
2222 		kr = KERN_RESOURCE_SHORTAGE;
2223 		goto done;
2224 	}
2225 	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2226 	if (kr != KERN_SUCCESS) {
2227 		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2228 		    __func__, proc_getpid(p), p->p_comm, kr);
2229 		goto done;
2230 	}
2231 
2232 	/*
2233 	 * Do some verification of the data structures.
2234 	 */
2235 	info_hdr = (struct mwl_info_hdr *)link_info;
2236 	if (info_hdr->mwli_version != MWL_INFO_VERS) {
2237 		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2238 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2239 		kr = KERN_FAILURE;
2240 		goto done;
2241 	}
2242 
2243 	if (info_hdr->mwli_binds_offset > link_info_size) {
2244 		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2245 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2246 		kr = KERN_FAILURE;
2247 		goto done;
2248 	}
2249 
2250 	/* some older devices have s/w page size > h/w page size; no need to support them */
2251 	if (info_hdr->mwli_page_size != PAGE_SIZE) {
2252 		/* no printf, since this is expected on some devices */
2253 		kr = KERN_INVALID_ARGUMENT;
2254 		goto done;
2255 	}
2256 
2257 	binds_size = (uint64_t)info_hdr->mwli_binds_count *
2258 	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2259 	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2260 		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2261 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2262 		kr = KERN_FAILURE;
2263 		goto done;
2264 	}
2265 
2266 	if (info_hdr->mwli_chains_offset > link_info_size) {
2267 		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2268 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2269 		kr = KERN_FAILURE;
2270 		goto done;
2271 	}
2272 
2273 
2274 	/*
2275 	 * Ensure the chained starts info is within the link info and make sure the
2276 	 * segment info offsets are within bounds.
2277 	 */
2278 	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2279 		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2280 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2281 		kr = KERN_FAILURE;
2282 		goto done;
2283 	}
2284 	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2285 		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2286 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2287 		kr = KERN_FAILURE;
2288 		goto done;
2289 	}
2290 
2291 	/* Note that more verification of offsets is done in the pager itself */
2292 
2293 	/*
2294 	 * Ensure we've only been given one FD and verify valid protections.
2295 	 */
2296 	fd = regions[0].mwlr_fd;
2297 	for (r = 0; r < region_count; ++r) {
2298 		if (regions[r].mwlr_fd != fd) {
2299 			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2300 			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2301 			kr = KERN_FAILURE;
2302 			goto done;
2303 		}
2304 
2305 		/*
2306 		 * Only allow data mappings and not zero fill. Permit TPRO
2307 		 * mappings only when VM_PROT_READ | VM_PROT_WRITE.
2308 		 */
2309 		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2310 			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2311 			    __func__, proc_getpid(p), p->p_comm);
2312 			kr = KERN_FAILURE;
2313 			goto done;
2314 		}
2315 		if (regions[r].mwlr_protections & VM_PROT_ZF) {
2316 			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2317 			    __func__, proc_getpid(p), p->p_comm, r);
2318 			kr = KERN_FAILURE;
2319 			goto done;
2320 		}
2321 		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2322 		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2323 			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2324 			    __func__, proc_getpid(p), p->p_comm, r);
2325 			kr = KERN_FAILURE;
2326 			goto done;
2327 		}
2328 	}
2329 
2330 
2331 	/* get file structure from file descriptor */
2332 	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2333 	if (error) {
2334 		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2335 		    __func__, proc_getpid(p), p->p_comm, error);
2336 		kr = KERN_FAILURE;
2337 		goto done;
2338 	}
2339 
2340 	/* We need at least read permission on the file */
2341 	if (!(fp->fp_glob->fg_flag & FREAD)) {
2342 		printf("%s: [%d(%s)]: not readable\n",
2343 		    __func__, proc_getpid(p), p->p_comm);
2344 		kr = KERN_FAILURE;
2345 		goto done;
2346 	}
2347 
2348 	/* Get the vnode from file structure */
2349 	vp = (struct vnode *)fp_get_data(fp);
2350 	error = vnode_getwithref(vp);
2351 	if (error) {
2352 		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2353 		    __func__, proc_getpid(p), p->p_comm, error);
2354 		kr = KERN_FAILURE;
2355 		vp = NULL; /* just to be sure */
2356 		goto done;
2357 	}
2358 
2359 	/* Make sure the vnode is a regular file */
2360 	if (vp->v_type != VREG) {
2361 		printf("%s: [%d(%s)]: vnode not VREG\n",
2362 		    __func__, proc_getpid(p), p->p_comm);
2363 		kr = KERN_FAILURE;
2364 		goto done;
2365 	}
2366 
2367 	/* get vnode size */
2368 	error = vnode_size(vp, &fs, vfs_context_current());
2369 	if (error) {
2370 		goto done;
2371 	}
2372 	file_size = fs;
2373 
2374 	/* get the file's memory object handle */
2375 	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2376 	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2377 		printf("%s: [%d(%s)]: no memory object\n",
2378 		    __func__, proc_getpid(p), p->p_comm);
2379 		kr = KERN_FAILURE;
2380 		goto done;
2381 	}
2382 
2383 	for (r = 0; r < region_count; ++r) {
2384 		rp = &regions[r];
2385 
2386 #if CONFIG_MACF
2387 		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2388 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2389 		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2390 		if (error) {
2391 			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2392 			    __func__, proc_getpid(p), p->p_comm, r, error);
2393 			kr = KERN_FAILURE;
2394 			goto done;
2395 		}
2396 #endif /* MAC */
2397 
2398 		/* check that the mappings are properly covered by code signatures */
2399 		if (cs_system_enforcement()) {
2400 			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2401 				printf("%s: [%d(%s)]: region %d, not code signed\n",
2402 				    __func__, proc_getpid(p), p->p_comm, r);
2403 				kr = KERN_FAILURE;
2404 				goto done;
2405 			}
2406 		}
2407 	}
2408 
2409 	/* update the vnode's access time */
2410 	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2411 		VATTR_INIT(&va);
2412 		nanotime(&va.va_access_time);
2413 		VATTR_SET_ACTIVE(&va, va_access_time);
2414 		vnode_setattr(vp, &va, vfs_context_current());
2415 	}
2416 
2417 	/* get the VM to do the work */
2418 	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2419 
2420 done:
2421 	if (fp != NULL) {
2422 		/* release the file descriptor */
2423 		fp_drop(p, fd, fp, 0);
2424 	}
2425 	if (vp != NULL) {
2426 		(void)vnode_put(vp);
2427 	}
2428 	if (regions != NULL) {
2429 		kfree_data(regions, region_count * sizeof(regions[0]));
2430 	}
2431 	/* link_info is NULL here if the pager took ownership of it, i.e. if things worked */
2432 	if (link_info != NULL) {
2433 		kfree_data(link_info, link_info_size);
2434 	}
2435 
2436 	switch (kr) {
2437 	case KERN_SUCCESS:
2438 		return 0;
2439 	case KERN_RESOURCE_SHORTAGE:
2440 		return ENOMEM;
2441 	default:
2442 		return EINVAL;
2443 	}
2444 }
2445 
2446 #if DEBUG || DEVELOPMENT
2447 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2448     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2449 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2450     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2451 #endif /* DEBUG || DEVELOPMENT */
2452 
2453 /* sysctl overflow room */
2454 
2455 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2456     (int *) &page_size, 0, "vm page size");
2457 
2458 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2459  *       allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2460  *       reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2461 extern unsigned int     vm_page_free_target;
2462 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2463     &vm_page_free_target, 0, "Pageout daemon free target");
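/*
 * Usage sketch (userspace, illustrative; error handling omitted): an
 * application can compare the current free count against the pageout
 * daemon's target to estimate how much it can allocate without forcing
 * inactive pages to be reclaimed. The sysctl names follow the
 * declarations in this file.
 *
 *	unsigned int target = 0, freecnt = 0;
 *	size_t len = sizeof(target);
 *	sysctlbyname("vm.vm_page_free_target", &target, &len, NULL, 0);
 *	len = sizeof(freecnt);
 *	sysctlbyname("vm.page_free_count", &freecnt, &len, NULL, 0);
 *	unsigned int surplus = (freecnt > target) ? freecnt - target : 0;
 */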
2464 
2465 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2466     &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2467 
2468 static int
2469 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2470 {
2471 #pragma unused(oidp, arg1, arg2)
2472 	unsigned int page_free_wanted;
2473 
2474 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2475 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2476 }
2477 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2478     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2479     0, 0, vm_ctl_page_free_wanted, "I", "");
2480 
2481 extern unsigned int     vm_page_purgeable_count;
2482 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2483     &vm_page_purgeable_count, 0, "Purgeable page count");
2484 
2485 extern unsigned int     vm_page_purgeable_wired_count;
2486 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2487     &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2488 
2489 extern unsigned int vm_page_kern_lpage_count;
2490 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2491     &vm_page_kern_lpage_count, 0, "kernel used large pages");
2492 
2493 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2494 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2495 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_kern);
2496 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_kern, vm_page_grab_count_kern, "Total pages grabbed (kernel)");
2497 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_iopl);
2498 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_iopl, vm_page_grab_count_iopl, "Total pages grabbed (iopl)");
2499 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_upl);
2500 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_upl, vm_page_grab_count_upl, "Total pages grabbed (upl)");
2501 
2502 
2503 #if DEVELOPMENT || DEBUG
2504 SCALABLE_COUNTER_DECLARE(vm_page_deactivate_behind_count);
2505 SYSCTL_SCALABLE_COUNTER(_vm, pages_deactivated_behind, vm_page_deactivate_behind_count,
2506     "Number of pages deactivated behind");
2507 #endif
2508 
2509 #if DEVELOPMENT || DEBUG
2510 #if __ARM_MIXED_PAGE_SIZE__
2511 static int vm_mixed_pagesize_supported = 1;
2512 #else
2513 static int vm_mixed_pagesize_supported = 0;
2514 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2515 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2516     &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2517 
2518 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2519     &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2520 
2521 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2522     &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2523 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2524     &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2525 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2526     &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2527 
2528 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2529     &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2530 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2531     &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2532 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2533     &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get re-deactivated) */
2534 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2535     &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2536 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2537     &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2538 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2539     &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");         /* sum of next two */
2540 #endif /* DEVELOPMENT || DEBUG */
2541 
2542 extern int madvise_free_debug;
2543 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2544     &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2545 extern int madvise_free_debug_sometimes;
2546 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2547     &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
2548 
2549 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2550     &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2551 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2552     &vm_page_stats_reusable.reusable_pages_success, "");
2553 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2554     &vm_page_stats_reusable.reusable_pages_failure, "");
2555 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2556     &vm_page_stats_reusable.reusable_pages_shared, "");
2557 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2558     &vm_page_stats_reusable.all_reusable_calls, "");
2559 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2560     &vm_page_stats_reusable.partial_reusable_calls, "");
2561 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2562     &vm_page_stats_reusable.reuse_pages_success, "");
2563 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2564     &vm_page_stats_reusable.reuse_pages_failure, "");
2565 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2566     &vm_page_stats_reusable.all_reuse_calls, "");
2567 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2568     &vm_page_stats_reusable.partial_reuse_calls, "");
2569 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2570     &vm_page_stats_reusable.can_reuse_success, "");
2571 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2572     &vm_page_stats_reusable.can_reuse_failure, "");
2573 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2574     &vm_page_stats_reusable.reusable_reclaimed, "");
2575 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2576     &vm_page_stats_reusable.reusable_nonwritable, "");
2577 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2578     &vm_page_stats_reusable.reusable_shared, "");
2579 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2580     &vm_page_stats_reusable.free_shared, "");
2581 
2582 
2583 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2584 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2585 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2586 
2587 extern unsigned int vm_page_cleaned_count;
2588 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2589 
2590 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2591 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2592 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2593 
2594 /* pageout counts */
2595 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2596 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2597 
2598 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2599 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2600 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2601 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2602 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2603 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2604 
2605 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2606 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2607 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2608 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2609 extern unsigned int vm_page_realtime_count;
2610 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2611 extern int vm_pageout_protect_realtime;
2612 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2613 
2614 /* counts of pages prefaulted when entering a memory object */
2615 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2616 extern int64_t vm_prefault_nb_no_page, vm_prefault_nb_wrong_page;
2617 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2618 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2619 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_no_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_no_page, "");
2620 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_wrong_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_wrong_page, "");
2621 
2622 #if defined (__x86_64__)
2623 extern unsigned int vm_clump_promote_threshold;
2624 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2625 #if DEVELOPMENT || DEBUG
2626 extern unsigned long vm_clump_stats[];
2627 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2628 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2629 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2630 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2631 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2632 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2633 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2634 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2635 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2636 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2637 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2638 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2639 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2640 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2641 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2642 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2643 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2644 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2645 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2646 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2647 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2648 #endif  /* if DEVELOPMENT || DEBUG */
2649 #endif  /* #if defined (__x86_64__) */
2650 
2651 #if CONFIG_SECLUDED_MEMORY
2652 
2653 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2654 extern unsigned int vm_page_secluded_target;
2655 extern unsigned int vm_page_secluded_count;
2656 extern unsigned int vm_page_secluded_count_free;
2657 extern unsigned int vm_page_secluded_count_inuse;
2658 extern unsigned int vm_page_secluded_count_over_target;
2659 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2660 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2661 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2662 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2663 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2664 
2665 extern struct vm_page_secluded_data vm_page_secluded;
2666 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2667 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2668 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2669 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2670 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2671 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2672 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2673 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2674 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2675 
2676 #endif /* CONFIG_SECLUDED_MEMORY */
2677 
2678 #if CONFIG_DEFERRED_RECLAIM
2679 #pragma mark Deferred Reclaim
2680 SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
2681 #if DEVELOPMENT || DEBUG
2682 /*
2683  * VM reclaim testing
2684  */
2685 extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);
2686 
2687 static int
2688 sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
2689 {
2690 	int error = EINVAL, pid = 0;
2691 	/*
2692 	 * Only act on a write
2693 	 */
2694 	error = sysctl_handle_int(oidp, &pid, 0, req);
2695 	if (error || !req->newptr) {
2696 		return error;
2697 	}
2698 	if (pid <= 0) {
2699 		return EINVAL;
2700 	}
2701 	proc_t p = proc_find(pid);
2702 	if (p == PROC_NULL) {
2703 		return ESRCH;
2704 	}
2705 	task_t t = proc_task(p);
2706 	if (t == TASK_NULL) {
2707 		proc_rele(p);
2708 		return ESRCH;
2709 	}
2710 	task_reference(t);
2711 	proc_rele(p);
2712 
2713 	bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
2714 	if (success) {
2715 		error = 0;
2716 	}
2717 	task_deallocate(t);
2718 
2719 	return error;
2720 }
2721 
2722 SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
2723     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2724     &sysctl_vm_reclaim_wait_for_pid, "I",
2725     "Block until the given pid has been drained by kernel GC");
2726 
2727 static int
2728 sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
2729 {
2730 	int error = EINVAL;
2731 	kern_return_t kr;
2732 	pid_t pid;
2733 	error = sysctl_handle_int(oidp, &pid, 0, req);
2734 	/* Only reclaim on write */
2735 	if (error || !req->newptr) {
2736 		return error;
2737 	}
2738 	if (pid <= 0) {
2739 		return EINVAL;
2740 	}
2741 	proc_t p = proc_find(pid);
2742 	if (p == PROC_NULL) {
2743 		return ESRCH;
2744 	}
2745 	task_t t = proc_task(p);
2746 	if (t == TASK_NULL) {
2747 		proc_rele(p);
2748 		return ESRCH;
2749 	}
2750 	task_reference(t);
2751 	proc_rele(p);
2752 	kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
2753 	task_deallocate(t);
2754 	return mach_to_bsd_errno(kr);
2755 }
2756 
2757 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
2758     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2759     &sysctl_vm_reclaim_drain_pid, "I",
2760     "Drain the deferred reclamation buffer for a pid");
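/*
 * Usage sketch (userspace, DEVELOPMENT || DEBUG kernels only; illustrative,
 * with a hypothetical target pid): the handler above only acts on a write,
 * so a test can force a drain of one process's reclaim buffer with:
 *
 *	int pid = 123;	// hypothetical pid
 *	sysctlbyname("vm.reclaim.drain_pid", NULL, NULL, &pid, sizeof(pid));
 */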
2761 
2762 static int
2763 proc_filter_reclaimable(proc_t p, __unused void *arg)
2764 {
2765 	task_t task = proc_task(p);
2766 	return vm_deferred_reclamation_task_has_ring(task);
2767 }
2768 
2769 static int
2770 proc_reclaim_drain(proc_t p, __unused void *arg)
2771 {
2772 	kern_return_t kr;
2773 	task_t task = proc_task(p);
2774 	kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
2775 	return mach_to_bsd_errno(kr);
2776 }
2777 
2778 static int
2779 sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
2780 {
2781 	int error;
2782 	int val;
2783 	if (!req->newptr) {
2784 		return EINVAL;
2785 	}
2786 	error = sysctl_handle_int(oidp, &val, 0, req);
2787 	if (error || val == FALSE) {
2788 		return error;
2789 	}
2790 	proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
2791 	    proc_filter_reclaimable, NULL);
2792 	return 0;
2793 }
2794 
2795 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
2796     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2797     &sysctl_vm_reclaim_drain_all, "I",
2798     "Fully reclaim from every deferred reclamation buffer on the system");
2799 
2800 extern uint32_t vm_reclaim_buffer_count;
2801 extern uint64_t vm_reclaim_gc_epoch;
2802 extern uint64_t vm_reclaim_gc_reclaim_count;
2803 extern uint64_t vm_reclaim_sampling_period_abs;
2804 extern uint64_t vm_reclaim_sampling_period_ns;
2805 extern bool vm_reclaim_debug;
2806 extern bool vm_reclaim_enabled;
2807 extern uint32_t vm_reclaim_autotrim_pct_normal;
2808 extern uint32_t vm_reclaim_autotrim_pct_pressure;
2809 extern uint32_t vm_reclaim_autotrim_pct_critical;
2810 extern uint32_t vm_reclaim_wma_weight_base;
2811 extern uint32_t vm_reclaim_wma_weight_cur;
2812 extern uint32_t vm_reclaim_wma_denom;
2813 extern uint64_t vm_reclaim_abandonment_threshold;
2814 
2815 SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
2816     CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
2817     "The number of deferred memory buffers currently alive");
2818 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
2819     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
2820     "Number of times the global GC thread has run");
2821 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
2822     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
2823     "Number of times the global GC thread has reclaimed from a buffer");
2824 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
2825     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
2826     "Debug logs for vm.reclaim");
2827 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
2828     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
2829     "Whether deferred memory reclamation is enabled on this system");
2830 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
2831     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
2832     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2833     "to engage auto-trim when the system is operating normally");
2834 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
2835     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
2836     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2837     "to engage auto-trim when the system is under memory pressure");
2838 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
2839     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
2840     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2841     "to engage auto-trim when the system is under critical memory pressure");
2842 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
2843     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
2844     "Weight applied to historical minimum buffer size samples");
2845 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
2846     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
2847     "Weight applied to current sampled minimum buffer size");
2848 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
2849     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
2850     "Denominator for weighted moving average calculation");
2851 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
2852     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
2853     "The number of sampling periods between accounting updates that may elapse "
2854     "before the buffer is considered \"abandoned\"");
2855 
2856 static int
2857 sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
2858 {
2859 	uint64_t new_val_ns;
2860 	uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
2861 	int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
2862 	    sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
2863 	if (err || !req->newptr) {
2864 		return err;
2865 	}
2866 	if (new_val_ns != old_val_ns) {
2867 		vm_reclaim_sampling_period_ns = new_val_ns;
2868 		nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
2869 	}
2870 	return 0;
2871 }
2872 
2873 SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
2874     CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "QU",
2875     "Interval (nanoseconds) at which to sample the minimum buffer size and "
2876     "consider trimming excess");
2877 #endif /* DEVELOPMENT || DEBUG */
2878 #endif /* CONFIG_DEFERRED_RECLAIM */
2879 
2880 #include <kern/thread.h>
2881 #include <sys/user.h>
2882 
2883 void vm_pageout_io_throttle(void);
2884 
2885 void
2886 vm_pageout_io_throttle(void)
2887 {
2888 	struct uthread *uthread = current_uthread();
2889 
2890 	/*
2891 	 * The thread is marked as a low-priority I/O type,
2892 	 * and the I/O we issued during this cleaning operation
2893 	 * collided with normal I/O operations. We delay
2894 	 * in order to mitigate the impact of this
2895 	 * task on the normal operation of the system.
2896 	 */
2897 
2898 	if (uthread->uu_lowpri_window) {
2899 		throttle_lowpri_io(1);
2900 	}
2901 }
2902 
2903 int
2904 vm_pressure_monitor(
2905 	__unused struct proc *p,
2906 	struct vm_pressure_monitor_args *uap,
2907 	int *retval)
2908 {
2909 	kern_return_t   kr;
2910 	uint32_t        pages_reclaimed;
2911 	uint32_t        pages_wanted;
2912 
2913 	kr = mach_vm_pressure_monitor(
2914 		(boolean_t) uap->wait_for_pressure,
2915 		uap->nsecs_monitored,
2916 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2917 		&pages_wanted);
2918 
2919 	switch (kr) {
2920 	case KERN_SUCCESS:
2921 		break;
2922 	case KERN_ABORTED:
2923 		return EINTR;
2924 	default:
2925 		return EINVAL;
2926 	}
2927 
2928 	if (uap->pages_reclaimed) {
2929 		if (copyout((void *)&pages_reclaimed,
2930 		    uap->pages_reclaimed,
2931 		    sizeof(pages_reclaimed)) != 0) {
2932 			return EFAULT;
2933 		}
2934 	}
2935 
2936 	*retval = (int) pages_wanted;
2937 	return 0;
2938 }
2939 
2940 int
2941 kas_info(struct proc *p,
2942     struct kas_info_args *uap,
2943     int *retval __unused)
2944 {
2945 #ifndef CONFIG_KAS_INFO
2946 	(void)p;
2947 	(void)uap;
2948 	return ENOTSUP;
2949 #else /* CONFIG_KAS_INFO */
2950 	int                     selector = uap->selector;
2951 	user_addr_t     valuep = uap->value;
2952 	user_addr_t     sizep = uap->size;
2953 	user_size_t size, rsize;
2954 	int                     error;
2955 
2956 	if (!kauth_cred_issuser(kauth_cred_get())) {
2957 		return EPERM;
2958 	}
2959 
2960 #if CONFIG_MACF
2961 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
2962 	if (error) {
2963 		return error;
2964 	}
2965 #endif
2966 
2967 	if (IS_64BIT_PROCESS(p)) {
2968 		user64_size_t size64;
2969 		error = copyin(sizep, &size64, sizeof(size64));
2970 		size = (user_size_t)size64;
2971 	} else {
2972 		user32_size_t size32;
2973 		error = copyin(sizep, &size32, sizeof(size32));
2974 		size = (user_size_t)size32;
2975 	}
2976 	if (error) {
2977 		return error;
2978 	}
2979 
2980 	switch (selector) {
2981 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2982 	{
2983 		uint64_t slide = vm_kernel_slide;
2984 
2985 		if (sizeof(slide) != size) {
2986 			return EINVAL;
2987 		}
2988 
2989 		error = copyout(&slide, valuep, sizeof(slide));
2990 		if (error) {
2991 			return error;
2992 		}
2993 		rsize = size;
2994 	}
2995 	break;
2996 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2997 	{
2998 		uint32_t i;
2999 		kernel_mach_header_t *mh = &_mh_execute_header;
3000 		struct load_command *cmd;
3001 		cmd = (struct load_command*) &mh[1];
3002 		uint64_t *bases;
3003 		rsize = mh->ncmds * sizeof(uint64_t);
3004 
3005 		/*
3006 		 * Return the size if no data was passed
3007 		 */
3008 		if (valuep == 0) {
3009 			break;
3010 		}
3011 
3012 		if (rsize > size) {
3013 			return EINVAL;
3014 		}
3015 
3016 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
3017 
3018 		for (i = 0; i < mh->ncmds; i++) {
3019 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
3020 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
3021 				bases[i] = (uint64_t)sg->vmaddr;
3022 			}
3023 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
3024 		}
3025 
3026 		error = copyout(bases, valuep, rsize);
3027 
3028 		kfree_data(bases, rsize);
3029 
3030 		if (error) {
3031 			return error;
3032 		}
3033 	}
3034 	break;
3035 	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
3036 	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
3037 	{
3038 #if CONFIG_SPTM
3039 		const uint64_t slide =
3040 		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
3041 #else
3042 		const uint64_t slide = 0;
3043 #endif
3044 
3045 		if (sizeof(slide) != size) {
3046 			return EINVAL;
3047 		}
3048 
3049 		error = copyout(&slide, valuep, sizeof(slide));
3050 		if (error) {
3051 			return error;
3052 		}
3053 		rsize = size;
3054 	}
3055 	break;
3056 	default:
3057 		return EINVAL;
3058 	}
3059 
3060 	if (IS_64BIT_PROCESS(p)) {
3061 		user64_size_t size64 = (user64_size_t)rsize;
3062 		error = copyout(&size64, sizep, sizeof(size64));
3063 	} else {
3064 		user32_size_t size32 = (user32_size_t)rsize;
3065 		error = copyout(&size32, sizep, sizeof(size32));
3066 	}
3067 
3068 	return error;
3069 #endif /* CONFIG_KAS_INFO */
3070 }
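/*
 * Illustrative (root-only) userspace sketch for the selector path above;
 * the prototype and selector constants come from <sys/kas_info.h>. Not
 * part of the kernel build.
 *
 *	#include <sys/kas_info.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		uint64_t slide = 0;
 *		size_t size = sizeof(slide);
 *
 *		if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR,
 *		    &slide, &size) != 0) {
 *			perror("kas_info");	// EPERM unless running as root
 *			return 1;
 *		}
 *		printf("kernel text slide: 0x%llx\n",
 *		    (unsigned long long)slide);
 *		return 0;
 *	}
 */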
3071 
3072 #pragma clang diagnostic push
3073 #pragma clang diagnostic ignored "-Wcast-qual"
3074 #pragma clang diagnostic ignored "-Wunused-function"
3075 
3076 static void
3077 asserts()
3078 {
3079 	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
3080 	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
3081 }
3082 
3083 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
3084 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
3085 #pragma clang diagnostic pop
3086 
3087 extern uint32_t vm_page_pages;
3088 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
3089 
3090 extern uint32_t vm_page_busy_absent_skipped;
3091 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
3092 
3093 extern uint32_t vm_page_upl_tainted;
3094 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
3095 
3096 extern uint32_t vm_page_iopl_tainted;
3097 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
3098 
3099 #if __arm64__ && (DEVELOPMENT || DEBUG)
3100 extern int vm_footprint_suspend_allowed;
3101 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
3102 
3103 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
3104 static int
3105 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
3106 {
3107 #pragma unused(oidp, arg1, arg2)
3108 	int error = 0;
3109 	int new_value;
3110 
3111 	if (req->newptr == USER_ADDR_NULL) {
3112 		return 0;
3113 	}
3114 	error = SYSCTL_IN(req, &new_value, sizeof(int));
3115 	if (error) {
3116 		return error;
3117 	}
3118 	if (!vm_footprint_suspend_allowed) {
3119 		if (new_value != 0) {
3120 			/* suspends are not allowed... */
3121 			return 0;
3122 		}
3123 		/* ... but let resumes proceed */
3124 	}
3125 	DTRACE_VM2(footprint_suspend,
3126 	    vm_map_t, current_map(),
3127 	    int, new_value);
3128 
3129 	pmap_footprint_suspend(current_map(), new_value);
3130 
3131 	return 0;
3132 }
3133 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
3134     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3135     0, 0, &sysctl_vm_footprint_suspend, "I", "");
3136 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
3137 
3138 extern uint64_t vm_map_corpse_footprint_count;
3139 extern uint64_t vm_map_corpse_footprint_size_avg;
3140 extern uint64_t vm_map_corpse_footprint_size_max;
3141 extern uint64_t vm_map_corpse_footprint_full;
3142 extern uint64_t vm_map_corpse_footprint_no_buf;
3143 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
3144     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
3145 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
3146     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
3147 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
3148     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
3149 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
3150     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
3151 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
3152     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
3153 
3154 #if CODE_SIGNING_MONITOR
3155 extern uint64_t vm_cs_defer_to_csm;
3156 extern uint64_t vm_cs_defer_to_csm_not;
3157 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
3158     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
3159 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
3160     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
3161 #endif /* CODE_SIGNING_MONITOR */
3162 
3163 extern uint64_t shared_region_pager_copied;
3164 extern uint64_t shared_region_pager_slid;
3165 extern uint64_t shared_region_pager_slid_error;
3166 extern uint64_t shared_region_pager_reclaimed;
3167 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
3168     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
3169 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
3170     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
3171 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
3172     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
3173 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
3174     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
3175 extern int shared_region_destroy_delay;
3176 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
3177     CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
3178 
3179 #if MACH_ASSERT
3180 extern int pmap_ledgers_panic_leeway;
3181 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
3182 #endif /* MACH_ASSERT */
3183 
3184 
3185 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
3186 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
3187 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
3188 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
3189 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
3190 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
3191 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
3192 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
3193 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
3194 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
3195 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
3196 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
3197 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
3198 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
3199     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
3200 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
3201     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
3202 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
3203     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
3204 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
3205     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
3206 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
3207     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
3208 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
3209     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
3210 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
3211     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
3212 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
3213     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
3214 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
3215     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
3216 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
3217     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
3218 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
3219     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
3220 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
3221     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
3222 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
3223     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
3224 
3225 extern int vm_protect_privileged_from_untrusted;
3226 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
3227     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
3228 extern uint64_t vm_copied_on_read;
3229 extern uint64_t vm_copied_on_read_kernel_map;
3230 extern uint64_t vm_copied_on_read_platform_map;
3231 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
3232     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
3233 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_kernel_map,
3234     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_kernel_map, "");
3235 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_platform_map,
3236     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_platform_map, "");
3237 
3238 extern int vm_shared_region_count;
3239 extern int vm_shared_region_peak;
3240 SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
3241     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
3242 SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
3243     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
3244 #if DEVELOPMENT || DEBUG
3245 extern unsigned int shared_region_pagers_resident_count;
3246 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
3247     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
3248 extern unsigned int shared_region_pagers_resident_peak;
3249 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
3250     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
3251 extern int shared_region_pager_count;
3252 SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
3253     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
3254 #if __has_feature(ptrauth_calls)
3255 extern int shared_region_key_count;
3256 SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
3257     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
3258 extern int vm_shared_region_reslide_count;
3259 SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
3260     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
3261 #endif /* __has_feature(ptrauth_calls) */
3262 #endif /* DEVELOPMENT || DEBUG */
3263 
3264 #if MACH_ASSERT
3265 extern int debug4k_filter;
3266 SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
3267 extern int debug4k_panic_on_terminate;
3268 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
3269 extern int debug4k_panic_on_exception;
3270 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
3271 extern int debug4k_panic_on_misaligned_sharing;
3272 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
3273 #endif /* MACH_ASSERT */
3274 
3275 extern uint64_t vm_map_set_size_limit_count;
3276 extern uint64_t vm_map_set_data_limit_count;
3277 extern uint64_t vm_map_enter_RLIMIT_AS_count;
3278 extern uint64_t vm_map_enter_RLIMIT_DATA_count;
3279 SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
3280 SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
3281 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
3282 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
3283 
3284 extern uint64_t vm_fault_resilient_media_initiate;
3285 extern uint64_t vm_fault_resilient_media_retry;
3286 extern uint64_t vm_fault_resilient_media_proceed;
3287 extern uint64_t vm_fault_resilient_media_release;
3288 extern uint64_t vm_fault_resilient_media_abort1;
3289 extern uint64_t vm_fault_resilient_media_abort2;
3290 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
3291 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
3292 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
3293 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
3294 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
3295 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
3296 #if MACH_ASSERT
3297 extern int vm_fault_resilient_media_inject_error1_rate;
3298 extern int vm_fault_resilient_media_inject_error1;
3299 extern int vm_fault_resilient_media_inject_error2_rate;
3300 extern int vm_fault_resilient_media_inject_error2;
3301 extern int vm_fault_resilient_media_inject_error3_rate;
3302 extern int vm_fault_resilient_media_inject_error3;
3303 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
3304 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
3305 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
3306 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
3307 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
3308 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
3309 #endif /* MACH_ASSERT */
3310 
3311 extern uint64_t pmap_query_page_info_retries;
3312 SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
3313 
3314 /*
3315  * A sysctl which causes all existing shared regions to become stale. They
3316  * will no longer be used by anything new and will be torn down as soon as
3317  * the last existing user exits. A write of non-zero value causes that to happen.
3318  * This should only be used by launchd, so we check that this is initproc.
3319  */
3320 static int
3321 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3322 {
3323 	unsigned int value = 0;
3324 	int changed = 0;
3325 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3326 	if (error || !changed) {
3327 		return error;
3328 	}
3329 	if (current_proc() != initproc) {
3330 		return EPERM;
3331 	}
3332 
3333 	vm_shared_region_pivot();
3334 
3335 	return 0;
3336 }
3337 
3338 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
3339     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
3340     0, 0, shared_region_pivot, "I", "");
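/*
 * Illustrative usage sketch: only initproc (launchd) passes the check in
 * the handler above, so this write is assumed to run in launchd's
 * context; any non-zero value triggers the pivot.
 *
 *	int one = 1;
 *	(void)sysctlbyname("vm.shared_region_pivot",
 *	    NULL, NULL, &one, sizeof(one));
 */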
3341 
3342 extern uint64_t vm_object_shadow_forced;
3343 extern uint64_t vm_object_shadow_skipped;
3344 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
3345     &vm_object_shadow_forced, "");
3346 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
3347     &vm_object_shadow_skipped, "");
3348 
3349 extern uint64_t vm_object_upl_throttle_cnt;
3350 SYSCTL_QUAD(_vm, OID_AUTO, object_upl_throttle_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
3351     &vm_object_upl_throttle_cnt,
3352     "The number of times in which a UPL write was throttled due to pageout starvation");
3353 
3354 #if HAS_MTE
3355 #pragma mark MTE
3356 
3357 SYSCTL_NODE(_vm, OID_AUTO, mte, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "mte");
3358 
3359 /* sysctls for vm.mte.* counters. */
3360 
3361 SYSCTL_UINT(_vm_mte, OID_AUTO, tagged, CTLFLAG_RD,
3362     &vm_page_tagged_count, 0, "tagged pages in use");
3363 
3364 SYSCTL_QUAD(_vm_mte, OID_AUTO, refill_thread_wakeups, CTLFLAG_RD,
3365     &vm_mte_refill_thread_wakeups,
3366     "the number of times the refill thread was woken up");
3367 
3368 /* sysctls for vm.mte.free.* counters. */
3369 
3370 SYSCTL_NODE(_vm_mte, OID_AUTO, free, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "free counts");
3371 
3372 SYSCTL_UINT(_vm_mte_free, OID_AUTO, total, CTLFLAG_RD,
3373     &vm_page_free_count, 0,
3374     "total free pages (same as vm.page_free_count)");
3375 SYSCTL_UINT(_vm_mte_free, OID_AUTO, taggable, CTLFLAG_RD,
3376     &vm_page_free_taggable_count, 0,
3377     "free taggable pages in the MTE free queue");
3378 SYSCTL_UINT(_vm_mte_free, OID_AUTO, claimable, CTLFLAG_RD,
3379     &mte_claimable_queue.vmpfq_count, 0,
3380     "free tag storage pages on the MTE claimable queue");
3381 
3382 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_untagged, vm_cpu_free_count,
3383     "free untagged pages in CPU lists");
3384 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_claimed, vm_cpu_free_claimed_count,
3385     "free claimed pages in CPU lists");
3386 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_tagged, vm_cpu_free_tagged_count,
3387     "free tagged pages in CPU lists");
3388 
3389 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_0, CTLFLAG_RD,
3390     &mte_free_queues[MTE_FREE_UNTAGGABLE_0].vmpfq_count, 0,
3391     "disabled/pinned/deactivating/claimed (with 16 free pages or less) tag storage pages")
3392 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_1, CTLFLAG_RD,
3393     &mte_free_queues[MTE_FREE_UNTAGGABLE_1].vmpfq_count, 0,
3394     "claimed (with 17 free pages or more) or disabled (with 16 pages or less) tag storage pages")
3395 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_2, CTLFLAG_RD,
3396     &mte_free_queues[MTE_FREE_UNTAGGABLE_2].vmpfq_count, 0,
3397     "disabled (with 17 pages or more) tag storage pages")
3398 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_0, CTLFLAG_RD,
3399     &mte_free_queues[MTE_FREE_ACTIVE_0].vmpfq_count, 0,
3400     "active tag storages with free covered pages (bucket 0)");
3401 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_1, CTLFLAG_RD,
3402     &mte_free_queues[MTE_FREE_ACTIVE_1].vmpfq_count, 0,
3403     "active tag storages with free covered pages (bucket 1)");
3404 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_2, CTLFLAG_RD,
3405     &mte_free_queues[MTE_FREE_ACTIVE_2].vmpfq_count, 0,
3406     "active tag storages with free covered pages (bucket 2)");
3407 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_3, CTLFLAG_RD,
3408     &mte_free_queues[MTE_FREE_ACTIVE_3].vmpfq_count, 0,
3409     "active tag storages with free covered pages (bucket 3)");
3410 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_activating, CTLFLAG_RD,
3411     &mte_free_queues[MTE_FREE_UNTAGGABLE_ACTIVATING].vmpfq_count, 0,
3412     "activating/reclaiming tag storages with free covered pages");
3413 
3414 /* sysctls for vm.mte.cell.* counters. */
3415 
3416 SYSCTL_NODE(_vm_mte, OID_AUTO, cell, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "mte cell");
3417 
3418 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled, CTLFLAG_RD,
3419     &mte_info_lists[MTE_LIST_DISABLED_IDX].count, 0,
3420     "free inactive tag storage pages");
3421 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled_recursive, CTLFLAG_RD,
3422     &vm_page_recursive_tag_storage_count, 0,
3423     "recursive tag storage pages");
3424 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled_unmanaged, CTLFLAG_RD,
3425     &vm_page_unmanaged_tag_storage_count, 0,
3426     "unmanaged tag storage pages");
3427 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, retired, CTLFLAG_RD,
3428     &vm_page_retired_tag_storage_count, 0,
3429     "retired tag storage pages");
3430 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, pinned, CTLFLAG_RD,
3431     &mte_info_lists[MTE_LIST_PINNED_IDX].count, 0,
3432     "unreclaimable tag storage pages");
3433 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, deactivating, CTLFLAG_RD,
3434     &mte_info_lists[MTE_LIST_DEACTIVATING_IDX].count, 0,
3435     "deactivating tag storage pages");
3436 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, claimed, CTLFLAG_RD,
3437     &mte_info_lists[MTE_LIST_CLAIMED_IDX].count, 0,
3438     "claimed tag storage pages");
3439 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, inactive, CTLFLAG_RD,
3440     &mte_info_lists[MTE_LIST_INACTIVE_IDX].count, 0,
3441     "free inactive tag storage pages");
3442 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, reclaiming, CTLFLAG_RD,
3443     &mte_info_lists[MTE_LIST_RECLAIMING_IDX].count, 0,
3444     "reclaiming tag storage pages");
3445 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, activating, CTLFLAG_RD,
3446     &mte_info_lists[MTE_LIST_ACTIVATING_IDX].count, 0,
3447     "activating tag storage pages");
3448 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, active_0, CTLFLAG_RD,
3449     &mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count, 0,
3450     "active tag storage pages with no used page tagged");
3451 static int
3452 tag_storage_active SYSCTL_HANDLER_ARGS
3453 {
3454 #pragma unused(arg1, arg2, oidp)
3455 	uint32_t value = mteinfo_tag_storage_active(false);
3456 
3457 	return SYSCTL_OUT(req, &value, sizeof(value));
3458 }
3459 SYSCTL_PROC(_vm_mte_cell, OID_AUTO, active,
3460     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3461     0, 0, &tag_storage_active, "I",
3462     "active tag storage pages");
3463 
3464 /* sysctls for vm.mte.tag_storage.* counters. */
3465 
3466 SYSCTL_NODE(_vm_mte, OID_AUTO, tag_storage, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "mte tag storage");
3467 
3468 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, reserved, CTLFLAG_RD,
3469     &vm_page_tag_storage_reserved, 0,
3470     "free tag storage pages reserve");
3471 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, wired, CTLFLAG_RD,
3472     &vm_page_wired_tag_storage_count, 0,
3473     "wired tag storage pages");
3474 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, activations, CTLFLAG_RD,
3475     &vm_page_tag_storage_activation_count,
3476     "tag storage activations (inactive/claimed -> active)");
3477 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, deactivations, CTLFLAG_RD,
3478     &vm_page_tag_storage_deactivation_count,
3479     "tag storage deactivations (active -> inactive)");
3480 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaims, CTLFLAG_RD,
3481     &vm_page_tag_storage_reclaim_success_count,
3482     "successful tag storage reclamations");
3483 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaims_from_cpu, CTLFLAG_RD,
3484     &vm_page_tag_storage_reclaim_from_cpu_count,
3485     "successful tag storage reclamations from the cpu free lists");
3486 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_failures, CTLFLAG_RD,
3487     &vm_page_tag_storage_reclaim_failure_count,
3488     "failed tag storage reclamations");
3489 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_wired_failures, CTLFLAG_RD,
3490     &vm_page_tag_storage_reclaim_wired_failure_count,
3491     "failed tag storage reclamations due to tag storage being wired");
3492 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, wire_relocations, CTLFLAG_RD,
3493     &vm_page_tag_storage_wire_relocation_count,
3494     "tag storage relocations due to wiring");
3495 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_compressor_failures, CTLFLAG_RD,
3496     &vm_page_tag_storage_reclaim_compressor_failure_count,
3497     "failed tag storage reclamations due to tag storage used in compressor pool");
3498 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, compressor_relocations, CTLFLAG_RD,
3499     &vm_page_tag_storage_compressor_relocation_count,
3500     "tag storage relocations due to compressor pool");
3501 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, free_unmanaged, CTLFLAG_RD,
3502     &vm_page_free_unmanaged_tag_storage_count, 0,
3503     "number of free unmanaged tag storage pages");
3504 
3505 SYSCTL_SCALABLE_COUNTER(_vm_mte_tag_storage, cpu_allocated_claimed,
3506     vm_cpu_claimed_count, "claimed tag storage pages allocated");
3507 
3508 static int
3509 tag_storage_fragmentation SYSCTL_HANDLER_ARGS
3510 {
3511 #pragma unused(arg1, arg2, oidp)
3512 	uint32_t value = mteinfo_tag_storage_fragmentation(false);
3513 
3514 	return SYSCTL_OUT(req, &value, sizeof(value));
3515 }
3516 SYSCTL_PROC(_vm_mte_tag_storage, OID_AUTO, fragmentation,
3517     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3518     0, 0, &tag_storage_fragmentation, "I",
3519     "the achievable the fragmentation of the tag storage space (in parts per thousand)");
3520 
3521 static int
3522 tag_storage_fragmentation_actual SYSCTL_HANDLER_ARGS
3523 {
3524 #pragma unused(arg1, arg2, oidp)
3525 	uint32_t value = mteinfo_tag_storage_fragmentation(true);
3526 
3527 	return SYSCTL_OUT(req, &value, sizeof(value));
3528 }
3529 SYSCTL_PROC(_vm_mte_tag_storage, OID_AUTO, fragmentation_actual,
3530     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3531     0, 0, &tag_storage_fragmentation_actual, "I",
3532     "the actual the fragmentation of the tag storage space (in parts per thousand)");
3533 
3534 /* sysctls for vm.mte.compressor_* */
3535 
3536 extern unsigned int vm_object_no_compressor_pager_for_mte_count;
3537 SYSCTL_INT(_vm_mte, OID_AUTO, no_compressor_pager_for_mte, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_no_compressor_pager_for_mte_count, 0, "");
3538 
3539 /* sysctls for MTE compression stats */
3540 
3541 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_compressed, compressor_tagged_pages_compressed, "");
3542 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_decompressed, compressor_tagged_pages_decompressed, "");
3543 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_freed, compressor_tagged_pages_freed, "");
3544 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_corrupted, compressor_tagged_pages_corrupted, "");
3545 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_overhead_bytes, compressor_tags_overhead_bytes, "");
3546 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages, compressor_tagged_pages, "");
3547 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_ts_pages_used, compressor_tag_storage_pages_in_pool,
3548     "the number of tag storage pages used in the compressor");
3549 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_non_ts_pages_used, compressor_non_tag_storage_pages_in_pool,
3550     "the number of non-tag storage pages used in the compressor");
3551 #if DEVELOPMENT || DEBUG
3552 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_all_zero, compressor_tags_all_zero, "");
3553 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_same_value, compressor_tags_same_value, "");
3554 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_below_align, compressor_tags_below_align, "");
3555 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_above_align, compressor_tags_above_align, "");
3556 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_incompressible, compressor_tags_incompressible, "");
3557 #endif /* DEVELOPMENT || DEBUG */
3558 
3559 #endif /* HAS_MTE */
3560 
3561 SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
3562     &vmtc_total, 0, "total text page corruptions detected");
3563 
3564 
3565 #if DEBUG || DEVELOPMENT
3566 /*
3567  * A sysctl that can be used to corrupt a text page with an illegal instruction.
3568  * Used for testing text page self healing.
3569  */
3570 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
3571 static int
3572 corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3573 {
3574 	uint64_t value = 0;
3575 	int error = sysctl_handle_quad(oidp, &value, 0, req);
3576 	if (error || !req->newptr) {
3577 		return error;
3578 	}
3579 
3580 	if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
3581 		return 0;
3582 	} else {
3583 		return EINVAL;
3584 	}
3585 }
3586 
3587 SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
3588     CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3589     0, 0, corrupt_text_addr, "-", "");
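/*
 * Illustrative test usage (DEBUG || DEVELOPMENT): a self-healing test
 * might pass the address of an instruction in its own __TEXT segment.
 * `some_function` is a hypothetical target, not a symbol defined here.
 *
 *	uint64_t addr = (uint64_t)(uintptr_t)&some_function;
 *	if (sysctlbyname("vm.corrupt_text_addr",
 *	    NULL, NULL, &addr, sizeof(addr)) != 0) {
 *		// EINVAL: the address could not be corrupted
 *	}
 */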
3590 #endif /* DEBUG || DEVELOPMENT */
3591 
3592 #if CONFIG_MAP_RANGES
3593 /*
3594  * vm.malloc_ranges
3595  *
3596  * space-separated list of <left:right> hexadecimal addresses.
3597  */
3598 static int
3599 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
3600 {
3601 	vm_map_t map = current_map();
3602 	struct mach_vm_range r1, r2;
3603 	char str[20 * 4];
3604 	int len;
3605 	mach_vm_offset_t right_hole_max;
3606 
3607 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
3608 		return ENOENT;
3609 	}
3610 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
3611 		return ENOENT;
3612 	}
3613 
3614 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
3615 	right_hole_max = MACH_VM_JUMBO_ADDRESS;
3616 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
3617 	right_hole_max = get_map_max(map);
3618 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
3619 
3620 	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
3621 	    r1.max_address, r2.min_address,
3622 	    r2.max_address, right_hole_max);
3623 
3624 	return SYSCTL_OUT(req, str, len);
3625 }
3626 
3627 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
3628     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3629     0, 0, &vm_map_malloc_ranges, "A", "");
3630 
3631 #if DEBUG || DEVELOPMENT
3632 static int
3633 vm_map_user_range_default SYSCTL_HANDLER_ARGS
3634 {
3635 #pragma unused(arg1, arg2, oidp)
3636 	struct mach_vm_range range;
3637 
3638 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
3639 	    != KERN_SUCCESS) {
3640 		return EINVAL;
3641 	}
3642 
3643 	return SYSCTL_OUT(req, &range, sizeof(range));
3644 }
3645 
3646 static int
3647 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3648 {
3649 #pragma unused(arg1, arg2, oidp)
3650 	struct mach_vm_range range;
3651 
3652 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3653 	    != KERN_SUCCESS) {
3654 		return EINVAL;
3655 	}
3656 
3657 	return SYSCTL_OUT(req, &range, sizeof(range));
3658 }
3659 
3660 static int
3661 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3662 {
3663 #pragma unused(arg1, arg2, oidp)
3664 	struct mach_vm_range range;
3665 
3666 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3667 	    != KERN_SUCCESS) {
3668 		return EINVAL;
3669 	}
3670 
3671 	return SYSCTL_OUT(req, &range, sizeof(range));
3672 }
3673 
3674 /*
3675  * A sysctl that can be used to return ranges for the current VM map.
3676  * Used for testing VM ranges.
3677  */
3678 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3679     0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
3680 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3681     0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
3682 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3683     0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
3684 
3685 #endif /* DEBUG || DEVELOPMENT */
3686 #endif /* CONFIG_MAP_RANGES */
3687 
3690 
3691 extern uint64_t vm_map_range_overflows_count;
3692 SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
3693 extern boolean_t vm_map_range_overflows_log;
3694 SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3695 
3696 extern uint64_t c_seg_filled_no_contention;
3697 extern uint64_t c_seg_filled_contention;
3698 extern clock_sec_t c_seg_filled_contention_sec_max;
3699 extern clock_nsec_t c_seg_filled_contention_nsec_max;
3700 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
3701 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
3702 SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
3703 SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
3704 #if (XNU_TARGET_OS_OSX && __arm64__)
3705 extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
3706 extern int c_process_major_yield_after; /* yield after moving ? segments */
3707 extern uint64_t c_process_major_reports;
3708 extern clock_sec_t c_process_major_max_sec;
3709 extern clock_nsec_t c_process_major_max_nsec;
3710 extern uint32_t c_process_major_peak_segcount;
3711 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
3712 SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
3713 SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
3714 SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
3715 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
3716 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
3717 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3718 
3719 #if DEVELOPMENT || DEBUG
3720 extern int panic_object_not_alive;
3721 SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
3722 #endif /* DEVELOPMENT || DEBUG */
3723 
3724 #if FBDP_DEBUG_OBJECT_NO_PAGER
3725 extern int fbdp_no_panic;
3726 SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
3727 #endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
3728 
3729 extern uint64_t cluster_direct_write_wired;
3730 SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");
3731 
3732 extern uint64_t vm_object_pageout_not_on_queue;
3733 extern uint64_t vm_object_pageout_not_pageable;
3734 extern uint64_t vm_object_pageout_pageable;
3735 extern uint64_t vm_object_pageout_active_local;
3736 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_on_queue, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_on_queue, "");
3737 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_pageable, "");
3738 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_pageable, "");
3739 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_active_local, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_active_local, "");
3740 
3741 
3742 #if DEVELOPMENT || DEBUG
3743 
3744 static uint32_t
3745 sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
3746 {
3747 #if HAS_MTE
3748 	if (with_data == VM_C_SERIALIZE_DATA_TAGS) {
3749 		return VM_C_SEGMENT_INFO_MAGIC_WITH_TAGS;
3750 	}
3751 #else
3752 #pragma unused(with_data)
3753 #endif /* HAS_MTE */
3754 	return VM_C_SEGMENT_INFO_MAGIC;
3755 }
3756 
3757 /* The largest possible single segment + its slots is
3758  * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
3759 #define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)
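/* i.e. 8 KiB of headroom for the info/slot records plus 64 KiB, assumed
 * here to bound a single segment's data, for a 72 KiB chunk buffer. */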
3760 
3761 extern uint32_t c_segments_available;
3762 
3763 struct sysctl_buf_header {
3764 	uint32_t magic;
3765 } __attribute__((packed));
3766 
3767 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3768  * Instead of doing everything here, it calls into a helper in vm_compressor.c. */
3769 static int
3770 sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
3771 {
3772 	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3773 	if (!buf) {
3774 		return ENOMEM;
3775 	}
3776 	size_t offset = 0;
3777 	int error = 0;
3778 	int segno = 0;
3779 	/* 4 byte header to identify the version of the formatting of the data.
3780 	 * This should be incremented if c_segment_info or c_slot_info are changed */
3781 	((struct sysctl_buf_header*)buf)->magic = sysctl_compressor_seg_magic(with_data);
3782 	offset += sizeof(uint32_t);
3783 
3784 	while (segno < c_segments_available) {
3785 		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3786 		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
3787 		if (kr == KERN_NO_SPACE) {
3788 			/* failed to add another segment, push the current buffer out and try again */
3789 			if (offset == 0) {
3790 				error = EINVAL; /* no space to write but I didn't write anything, shouldn't really happen */
3791 				goto out;
3792 			}
3793 			/* write out chunk */
3794 			error = SYSCTL_OUT(req, buf, offset);
3795 			if (error) {
3796 				goto out;
3797 			}
3798 			offset = 0;
3799 			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3800 			/* don't increment segno, need to try again saving the current one */
3801 		} else if (kr != KERN_SUCCESS) {
3802 			error = EINVAL;
3803 			goto out;
3804 		} else {
3805 			offset += left_sz;
3806 			++segno;
3807 			assert(offset <= SYSCTL_SEG_BUF_SIZE);
3808 		}
3809 	}
3810 
3811 	if (offset > 0) { /* write last chunk */
3812 		error = SYSCTL_OUT(req, buf, offset);
3813 	}
3814 
3815 out:
3816 	kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
3817 	return error;
3818 }
3819 
3820 static int
3821 sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3822 {
3823 	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
3824 }
3825 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
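/*
 * Illustrative consumer sketch: the stream begins with the 4-byte format
 * magic and continues with back-to-back serialized segment records. A
 * reader can probe the required length first (oldp == NULL); the length
 * may grow between the probe and the actual read, so treat it as a hint.
 *
 *	size_t len = 0;
 *	if (sysctlbyname("vm.compressor_segments", NULL, &len, NULL, 0) != 0) {
 *		return;
 *	}
 *	char *data = malloc(len);
 *	if (data && sysctlbyname("vm.compressor_segments",
 *	    data, &len, NULL, 0) == 0) {
 *		uint32_t magic;
 *		memcpy(&magic, data, sizeof(magic));
 *		// validate magic against VM_C_SEGMENT_INFO_MAGIC before parsing
 *	}
 *	free(data);
 */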
3826 
3827 #if HAS_MTE
3828 static int
3829 sysctl_compressor_segments_data(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3830 {
3831 	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_TAGS);
3832 }
3833 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments_data, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments_data, "S", "");
3834 #endif /* HAS_MTE */
3835 
3836 extern uint32_t vm_compressor_fragmentation_level(void);
3837 
3838 static int
3839 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3840 {
3841 	uint32_t value = vm_compressor_fragmentation_level();
3842 	return SYSCTL_OUT(req, &value, sizeof(value));
3843 }
3844 
3845 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3846 
3847 extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);
3848 
3849 static int
3850 sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3851 {
3852 	uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
3853 	return SYSCTL_OUT(req, &value, sizeof(value));
3854 }
3855 
3856 SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");
3857 
3858 
3859 
3860 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3861 
3862 
3863 /* This sysctl iterates over all the entries of the vm_map of a given process and writes some info about the vm_object pointed to by each entry.
3864  * This can be used to map where all of a process's pages are located in the compressor.
3865  */
3866 static int
3867 sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3868 {
3869 	int error = 0;
3870 	char *buf = NULL;
3871 	proc_t p = PROC_NULL;
3872 	task_t task = TASK_NULL;
3873 	vm_map_t map = VM_MAP_NULL;
3874 	__block size_t offset = 0;
3875 
3876 	/* go from pid to proc to task to vm_map. see sysctl_procargsx() for another example of this procession */
3877 	int *name = arg1;
3878 	int namelen = arg2;
3879 	if (namelen < 1) {
3880 		return EINVAL;
3881 	}
3882 	int pid = name[0];
3883 	p = proc_find(pid);  /* this increments a reference to the proc */
3884 	if (p == PROC_NULL) {
3885 		return EINVAL;
3886 	}
3887 	task = proc_task(p);
3888 	proc_rele(p);  /* decrement ref of proc */
3889 	p = PROC_NULL;
3890 	if (task == TASK_NULL) {
3891 		return EINVAL;
3892 	}
3893 	/* convert proc reference to task reference */
3894 	task_reference(task);
3895 	/* task reference to map reference */
3896 	map = get_task_map_reference(task);
3897 	task_deallocate(task);
3898 
3899 	if (map == VM_MAP_NULL) {
3900 		return EINVAL;  /* nothing allocated yet */
3901 	}
3902 
3903 	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
3904 	if (!buf) {
3905 		error = ENOMEM;
3906 		goto out;
3907 	}
3908 
3909 	/* 4 byte header to identify the version of the formatting of the data.
3910 	 * This should be incremented if vm_map_info_hdr or the per-entry record layout changes */
3911 	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
3912 	offset += sizeof(uint32_t);
3913 
3914 	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
3915 		/* write the header, happens only once at the beginning so we should have enough space */
3916 		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3917 		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
3918 		out_hdr->vmi_nentries = nentries;
3919 		offset += sizeof(struct vm_map_info_hdr);
3920 		return KERN_SUCCESS;
3921 	};
3922 
3923 	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
3924 		while (true) { /* try up to 2 times: first try writing into the current buffer, otherwise flush and retry with an empty one */
3925 			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
3926 			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
3927 			if (kr == KERN_NO_SPACE) {
3928 				/* failed to write anything, flush the current buffer and try again */
3929 				if (offset == 0) {
3930 					return KERN_FAILURE; /* no space to write but I didn't write anything yet, shouldn't really happen */
3931 				}
3932 				/* write out chunk */
3933 				int out_error = SYSCTL_OUT(req, buf, offset);
3934 				if (out_error) {
3935 					return KERN_FAILURE;
3936 				}
3937 				offset = 0;
3938 				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3939 				continue; /* need to retry the entry dump again with the cleaned buffer */
3940 			} else if (kr != KERN_SUCCESS) {
3941 				return kr;
3942 			}
3943 			offset += left_sz;
3944 			break;
3945 		}
3946 		return KERN_SUCCESS;
3947 	};
3948 
3949 	/* this foreach first calls the first callback with the number of entries, then calls the second for every entry;
3950 	 * when the buffer is exhausted, it is flushed to the sysctl and reused */
3951 	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);
3952 
3953 	if (kr != KERN_SUCCESS) {
3954 		error = EINVAL; /* surface the failure instead of silently returning success */
		goto out;
3955 	}
3956 
3957 	if (offset > 0) { /* last chunk */
3958 		error = SYSCTL_OUT(req, buf, offset);
3959 	}
3960 
3961 out:
3962 	if (buf != NULL) {
3963 		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3964 	}
3965 	if (map != NULL) {
3966 		vm_map_deallocate(map);
3967 	}
3968 	return error;
3969 }
3970 
3971 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
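/*
 * Illustrative caller sketch: this is a CTLTYPE_NODE handler, so the
 * target pid travels as the next MIB name element rather than in the
 * request body; `pid` below stands for whichever process is inspected.
 *
 *	int mib[CTL_MAXNAME];
 *	size_t miblen = CTL_MAXNAME;
 *	if (sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen) != 0) {
 *		return;
 *	}
 *	mib[miblen] = pid;
 *	size_t len = 0;
 *	// probe for the total length, then allocate and fetch the stream
 *	(void)sysctl(mib, (u_int)miblen + 1, NULL, &len, NULL, 0);
 */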
3972 
3973 #pragma mark VM Host Statistics
3974 
3975 SYSCTL_NODE(_vm, OID_AUTO, stat, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Host memory statistics");
3976 
3977 SYSCTL_SCALABLE_COUNTER(_vm_stat, zero_fills, vm_statistics_zero_fill_count, "Pages zero-filled");
3978 SYSCTL_SCALABLE_COUNTER(_vm_stat, reactivations, vm_statistics_reactivations, "Pages reactivated");
3979 SYSCTL_SCALABLE_COUNTER(_vm_stat, pageins, vm_statistics_pageins, "Pages paged-in (including speculation)");
3980 SYSCTL_SCALABLE_COUNTER(_vm_stat, pageins_requested, vm_statistics_pageins_requested, "Page-ins requested");
3981 SYSCTL_SCALABLE_COUNTER(_vm_stat, pageins_aborted, vm_statistics_pageins_aborted, "Pages aborted during page-in");
3982 SYSCTL_SCALABLE_COUNTER(_vm_stat, pageouts, vm_statistics_pageouts, "Pages paged-out");
3983 SYSCTL_SCALABLE_COUNTER(_vm_stat, faults, vm_statistics_faults, "Pages faulted");
3984 SYSCTL_SCALABLE_COUNTER(_vm_stat, cow_faults, vm_statistics_cow_faults, "Pages faulted due to copy-on-write");
3985 SYSCTL_SCALABLE_COUNTER(_vm_stat, obj_cache_lookups, vm_statistics_lookups, "Pages looked up in the object-cache");
3986 SYSCTL_SCALABLE_COUNTER(_vm_stat, obj_cache_hits, vm_statistics_hits, "Object-cache lookup hits");
3987 SYSCTL_SCALABLE_COUNTER(_vm_stat, purges, vm_statistics_purges, "Pages purged");
3988 SYSCTL_SCALABLE_COUNTER(_vm_stat, decompressions, vm_statistics_decompressions, "Pages decompressed");
3989 SYSCTL_SCALABLE_COUNTER(_vm_stat, compressions, vm_statistics_compressions, "Pages compressed");
3990 SYSCTL_SCALABLE_COUNTER(_vm_stat, swapins, vm_statistics_swapins, "Pages swapped in");
3991 SYSCTL_SCALABLE_COUNTER(_vm_stat, swapouts, vm_statistics_swapouts, "Pages swapped out");
3992 
3993 static int
3994 sysctl_vm_reset_tag SYSCTL_HANDLER_ARGS
3995 {
3996 #pragma unused(oidp, arg1, arg2)
3997 	int error;
3998 	int tag;
3999 	kern_return_t kr;
4000 
4001 	/* Need to be root */
4002 	if (!kauth_cred_issuser(kauth_cred_get())) {
4003 		return EPERM;
4004 	}
4005 
4006 	error = SYSCTL_IN(req, &tag, sizeof(tag));
4007 	if (error) {
4008 		return error;
4009 	}
4010 
4011 	if (tag < 0 || tag > VM_MAX_TAG_VALUE) {
4012 		return EINVAL;
4013 	}
4014 
4015 	kr = vm_tag_reset_peak((vm_tag_t)tag);
4016 
4017 	return mach_to_bsd_errno(kr);
4018 }
4019 
4020 SYSCTL_PROC(_vm, OID_AUTO, reset_tag,
4021     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
4022     0, 0, &systctl_vm_reset_tag, "I", "");
4023 
4024 static int
4025 sysctl_vm_reset_all_tags SYSCTL_HANDLER_ARGS
4026 {
4027 #pragma unused(oidp, arg1, arg2)
4028 	/* Only reset the values if the sysctl is a write */
4029 	if (!req->newptr) {
4030 		return EINVAL;
4031 	}
4032 
4033 	/* Need to be root */
4034 	if (!kauth_cred_issuser(kauth_cred_get())) {
4035 		return EPERM;
4036 	}
4037 
4038 	vm_tag_reset_all_peaks();
4039 
4040 	return 0;
4041 }
4042 
4043 SYSCTL_PROC(_vm, OID_AUTO, reset_all_tags,
4044     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
4045     0, 0, &systctl_vm_reset_all_tags, "I", "");
4046 
4047 #endif /* DEVELOPMENT || DEBUG */
4048 
4049 SYSCTL_NODE(_vm, OID_AUTO, compressor, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor");
4050 
4051 SYSCTL_INT(_vm_compressor, OID_AUTO, mode, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_mode, 0, "");
4052 SYSCTL_INT(_vm_compressor, OID_AUTO, is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_is_active, 0, "");
4053 SYSCTL_INT(_vm_compressor, OID_AUTO, is_available, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_available, 0, "");
4054 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed, CTLFLAG_RD | CTLFLAG_LOCKED,
4055     &c_segment_pages_compressed, 0, "The amount of uncompressed data stored in the compressor (in pages)");
4056 #if CONFIG_FREEZE
4057 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_incore, CTLFLAG_RD | CTLFLAG_LOCKED,
4058     &c_segment_pages_compressed_incore, 0, "The amount of uncompressed data stored in the in-core compressor (in pages)");
4059 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_incore_late_swapout, CTLFLAG_RD | CTLFLAG_LOCKED,
4060     &c_segment_pages_compressed_incore_late_swapout, 0, "The amount of uncompressed data stored in the in-core compressor and queued for swapout (in pages)");
4061 #endif
4062 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_limit, CTLFLAG_RD | CTLFLAG_LOCKED,
4063     &c_segment_pages_compressed_limit, 0, "The limit on the amount of uncompressed data the compressor will store (in pages)");
4064 
4065 SYSCTL_NODE(_vm_compressor, OID_AUTO, segment, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Segment Counts");
4066 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, total, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_count, 0, "Number of allocated segments");
4067 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, aging, CTLFLAG_RD | CTLFLAG_LOCKED, &c_age_count, 0, "Number of aging segments");
4068 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_early, CTLFLAG_RD | CTLFLAG_LOCKED, &c_early_swappedin_count, 0, "Number of (early) swapped-in segments");
4069 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_regular, CTLFLAG_RD | CTLFLAG_LOCKED, &c_regular_swappedin_count, 0, "Number of (regular) swapped-in segments");
4070 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_late, CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swappedin_count, 0, "Number of (late) swapped-in segments");
4071 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_early, CTLFLAG_RD | CTLFLAG_LOCKED, &c_early_swapout_count, 0, "Number of (early) ready-to-swap segments");
4072 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_regular, CTLFLAG_RD | CTLFLAG_LOCKED, &c_regular_swapout_count, 0, "Number of (regular) ready-to-swap segments");
4073 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_late, CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swapout_count, 0, "Number of (late) ready-to-swap segments");
4074 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapio, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swapio_count, 0, "Number of swapping-out segments");
4075 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedout, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swappedout_count, 0, "Number of (non-sparse) swapped-out segments");
4076 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedout_sparse, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swappedout_sparse_count, 0, "Number of (sparse) swapped-out segments");
4077 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, majorcompact, CTLFLAG_RD | CTLFLAG_LOCKED, &c_major_count, 0, "Number of recently-compacted segments");
4078 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, minorcompact, CTLFLAG_RD | CTLFLAG_LOCKED, &c_minor_count, 0, "Number of segments queued for deferred minor compaction");
4079 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, filling, CTLFLAG_RD | CTLFLAG_LOCKED, &c_filling_count, 0, "Number of filling segments");
4080 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, empty, CTLFLAG_RD | CTLFLAG_LOCKED, &c_empty_count, 0, "Number of empty segments");
4081 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, bad, CTLFLAG_RD | CTLFLAG_LOCKED, &c_bad_count, 0, "Number of bad segments");
4082 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, limit, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segments_limit, 0, "Limit on the number of allocated segments");
4083 
4084 SYSCTL_NODE(_vm_compressor, OID_AUTO, svp, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Single-Value");
4085 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, in_hash, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_in_hash, 0, "");
4086 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, hash_succeeded, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_hash_succeeded, 0, "");
4087 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, hash_failed, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_hash_failed, 0, "");
4088 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, zval_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_zero_compressions, 0, "");
4089 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, zval_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_zero_decompressions, 0, "");
4090 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, nzval_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_nonzero_compressions, 0, "");
4091 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, nzval_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_nonzero_decompressions, 0, "");
4092 
4093 SYSCTL_NODE(_vm_compressor, OID_AUTO, compactor, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Compactor");
4094 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_completed, CTLFLAG_RD | CTLFLAG_LOCKED,
4095     &vm_pageout_vminfo.vm_compactor_major_compactions_completed, "Major compactions completed");
4096 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_considered, CTLFLAG_RD | CTLFLAG_LOCKED,
4097     &vm_pageout_vminfo.vm_compactor_major_compactions_considered, "Major compactions considered");
4098 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_bailed, CTLFLAG_RD | CTLFLAG_LOCKED,
4099     &vm_pageout_vminfo.vm_compactor_major_compactions_bailed, "Major compactions bailed (due to contention)");
4100 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_bytes_moved, CTLFLAG_RD | CTLFLAG_LOCKED,
4101     &vm_pageout_vminfo.vm_compactor_major_compaction_bytes_moved, "Bytes moved between segments during major compactions");
4102 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_slots_moved, CTLFLAG_RD | CTLFLAG_LOCKED,
4103     &vm_pageout_vminfo.vm_compactor_major_compaction_slots_moved, "Slots moved between segments during major compactions");
4104 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_bytes_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
4105     &vm_pageout_vminfo.vm_compactor_major_compaction_bytes_freed, "Bytes freed as a result of major compaction");
4106 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_segments_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
4107     &vm_pageout_vminfo.vm_compactor_major_compaction_segments_freed, "Segments freed as a result of major compaction");
4108 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, swapouts_queued, CTLFLAG_RD | CTLFLAG_LOCKED,
4109     &vm_pageout_vminfo.vm_compactor_swapouts_queued, "The number of segments queued for swapout after a major compaction");
4110 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, swapout_bytes_wasted, CTLFLAG_RD | CTLFLAG_LOCKED,
4111     &vm_pageout_vminfo.vm_compactor_swapout_bytes_wasted, "The number of unused bytes in segments queued for swapout");
4112