xref: /xnu-12377.41.6/bsd/vm/vm_unix.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Mach Operating System
30  * Copyright (c) 1987 Carnegie-Mellon University
31  * All rights reserved.  The CMU software License Agreement specifies
32  * the terms and conditions for use and redistribution.
33  */
34 /*
35  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
36  * support for mandatory and extensible security protections.  This notice
37  * is included in support of clause 2.2 (b) of the Apple Public License,
38  * Version 2.0.
39  */
40 #include <vm/vm_options.h>
41 
42 #include <kern/ecc.h>
43 #include <kern/task.h>
44 #include <kern/thread.h>
45 #include <kern/debug.h>
46 #include <kern/extmod_statistics.h>
47 #include <mach/mach_traps.h>
48 #include <mach/port.h>
49 #include <mach/sdt.h>
50 #include <mach/task.h>
51 #include <mach/task_access.h>
52 #include <mach/task_special_ports.h>
53 #include <mach/time_value.h>
54 #include <mach/vm_map.h>
55 #include <mach/vm_param.h>
56 #include <mach/vm_prot.h>
57 #include <machine/machine_routines.h>
58 
59 #include <sys/file_internal.h>
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/dir.h>
63 #include <sys/namei.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/vm.h>
67 #include <sys/file.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/mount.h>
70 #include <sys/xattr.h>
71 #include <sys/trace.h>
72 #include <sys/kernel.h>
73 #include <sys/ubc_internal.h>
74 #include <sys/user.h>
75 #include <sys/syslog.h>
76 #include <sys/stat.h>
77 #include <sys/sysproto.h>
78 #include <sys/mman.h>
79 #include <sys/sysctl.h>
80 #include <sys/cprotect.h>
81 #include <sys/kpi_socket.h>
82 #include <sys/kas_info.h>
83 #include <sys/socket.h>
84 #include <sys/socketvar.h>
85 #include <sys/random.h>
86 #include <sys/code_signing.h>
87 #if NECP
88 #include <net/necp.h>
89 #endif /* NECP */
90 #if SKYWALK
91 #include <skywalk/os_channel.h>
92 #endif /* SKYWALK */
93 
94 #include <security/audit/audit.h>
95 #include <security/mac.h>
96 #include <bsm/audit_kevents.h>
97 
98 #include <kern/kalloc.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_kern_xnu.h>
101 #include <vm/vm_pageout_xnu.h>
102 
103 #include <mach/shared_region.h>
104 #include <vm/vm_shared_region_internal.h>
105 
106 #include <vm/vm_dyld_pager_internal.h>
107 #include <vm/vm_protos_internal.h>
108 #include <vm/vm_compressor_info.h>         /* for c_segment_info */
109 #include <vm/vm_compressor_internal.h>
110 #include <vm/vm_compressor_xnu.h>          /* for vm_compressor_serialize_segment_debug_info() */
111 #include <vm/vm_object_xnu.h>              /* for vm_chead_select_t */
112 #include <vm/vm_memory_entry_xnu.h>
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_reclaim_xnu.h>
115 #if HAS_MTE
116 #include <vm/vm_compressor_xnu.h>
117 #include <vm/vm_mteinfo_internal.h>
118 #endif /* HAS_MTE */
119 
120 #include <sys/kern_memorystatus.h>
121 #include <sys/kern_memorystatus_freeze.h>
122 #include <sys/proc_internal.h>
123 
124 #include <mach-o/fixup-chains.h>
125 
126 #if CONFIG_MACF
127 #include <security/mac_framework.h>
128 #endif
129 
130 #include <kern/bits.h>
131 
132 #if CONFIG_CSR
133 #include <sys/csr.h>
134 #endif /* CONFIG_CSR */
135 #include <sys/trust_caches.h>
136 #include <libkern/amfi/amfi.h>
137 #include <IOKit/IOBSD.h>
138 
#if VM_MAP_DEBUG_APPLE_PROTECT
/* Exposes the vm_map_debug_apple_protect debug flag (declared in osfmk/vm). */
SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
142 
143 #if DEVELOPMENT || DEBUG
144 
145 extern int vm_object_cache_evict_all(void);
146 static int
147 sysctl_vm_object_cache_evict SYSCTL_HANDLER_ARGS
148 {
149 #pragma unused(arg1, arg2, req)
150 	(void) vm_object_cache_evict_all();
151 	return 0;
152 }
153 
154 SYSCTL_PROC(_vm, OID_AUTO, object_cache_evict, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
155     0, 0, &sysctl_vm_object_cache_evict, "I", "");
156 
157 static int
158 sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
159 {
160 #pragma unused(arg1, arg2)
161 	vm_offset_t     kaddr;
162 	kern_return_t   kr;
163 	int     error = 0;
164 	int     size = 0;
165 
166 	error = sysctl_handle_int(oidp, &size, 0, req);
167 	if (error || !req->newptr) {
168 		return error;
169 	}
170 
171 	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
172 	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);
173 
174 	if (kr == KERN_SUCCESS) {
175 		kmem_free(kernel_map, kaddr, size);
176 	}
177 
178 	return error;
179 }
180 
181 SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
182     0, 0, &sysctl_kmem_alloc_contig, "I", "");
183 
/* System-wide default for "footprint" accounting in VM region info queries. */
extern int vm_region_footprint;
SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");
186 
187 static int
188 sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
189 {
190 #pragma unused(arg1, arg2, oidp)
191 	kmem_gobj_stats stats = kmem_get_gobj_stats();
192 
193 	return SYSCTL_OUT(req, &stats, sizeof(stats));
194 }
195 
196 SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
197     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
198     0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");
199 
200 #endif /* DEVELOPMENT || DEBUG */
201 
202 static int
203 sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
204 {
205 #pragma unused(arg1, arg2, oidp)
206 	int     error = 0;
207 	int     value;
208 
209 	value = task_self_region_footprint();
210 	error = SYSCTL_OUT(req, &value, sizeof(int));
211 	if (error) {
212 		return error;
213 	}
214 
215 	if (!req->newptr) {
216 		return 0;
217 	}
218 
219 	error = SYSCTL_IN(req, &value, sizeof(int));
220 	if (error) {
221 		return error;
222 	}
223 	task_self_region_footprint_set(value);
224 	return 0;
225 }
226 SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");
227 
228 static int
229 sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
230 {
231 #pragma unused(arg1, arg2, oidp)
232 	int     error = 0;
233 	int     value;
234 
235 	value = (1 << thread_self_region_page_shift());
236 	error = SYSCTL_OUT(req, &value, sizeof(int));
237 	if (error) {
238 		return error;
239 	}
240 
241 	if (!req->newptr) {
242 		return 0;
243 	}
244 
245 	error = SYSCTL_IN(req, &value, sizeof(int));
246 	if (error) {
247 		return error;
248 	}
249 
250 	if (value != 0 && value != 4096 && value != 16384) {
251 		return EINVAL;
252 	}
253 
254 #if !__ARM_MIXED_PAGE_SIZE__
255 	if (value != vm_map_page_size(current_map())) {
256 		return EINVAL;
257 	}
258 #endif /* !__ARM_MIXED_PAGE_SIZE__ */
259 
260 	thread_self_region_page_shift_set(bit_first(value));
261 	return 0;
262 }
263 SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
264 
265 static int
266 sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
267 {
268 #pragma unused(arg1, arg2, oidp)
269 	int     error = 0;
270 	int     value;
271 	kern_return_t kr;
272 
273 	value = task_self_region_info_flags();
274 	error = SYSCTL_OUT(req, &value, sizeof(int));
275 	if (error) {
276 		return error;
277 	}
278 
279 	if (!req->newptr) {
280 		return 0;
281 	}
282 
283 	error = SYSCTL_IN(req, &value, sizeof(int));
284 	if (error) {
285 		return error;
286 	}
287 
288 	kr = task_self_region_info_flags_set(value);
289 	if (kr != KERN_SUCCESS) {
290 		return EINVAL;
291 	}
292 
293 	return 0;
294 }
295 SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");
296 
297 
#if DEVELOPMENT || DEBUG
extern int panic_on_unsigned_execute;
SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");

extern int vm_log_xnu_user_debug;
SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
#endif /* DEVELOPMENT || DEBUG */

extern int vm_log_map_delete_permanent_prot_none;
SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");

/* Read-only counters for code-signed executable memory paths. */
extern int cs_executable_create_upl;
extern int cs_executable_wire;
SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");

/* apple_protect pager population counters and cache-size control. */
extern int apple_protect_pager_count;
extern int apple_protect_pager_count_mapped;
extern unsigned int apple_protect_pager_cache_limit;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");

#if DEVELOPMENT || DEBUG
extern int radar_20146450;
SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");

extern int macho_printf;
SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");

extern int apple_protect_pager_data_request_debug;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");

extern unsigned int vm_object_copy_delayed_paging_wait_disable;
EXPERIMENT_FACTOR_LEGACY_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");
333 
334 __enum_closed_decl(vm_submap_test_op, uint32_t, {
335 	vsto_make_submap = 1,  /* make submap from entries in current_map()
336 	                        * at start..end, offset ignored */
337 	vsto_remap_submap = 2, /* map in current_map() at start..end,
338 	                        * from parent address submap_base_address
339 	                        * and submap address offset */
340 	vsto_end
341 });
342 
343 static int
344 sysctl_vm_submap_test_ctl SYSCTL_HANDLER_ARGS
345 {
346 	int error;
347 	struct {
348 		vm_submap_test_op op;
349 		mach_vm_address_t submap_base_address;
350 		mach_vm_address_t start;
351 		mach_vm_address_t end;
352 		mach_vm_address_t offset;
353 	} args;
354 	if (req->newlen != sizeof(args)) {
355 		return EINVAL;
356 	}
357 	error = SYSCTL_IN(req, &args, sizeof(args));
358 	if (error) {
359 		return error;
360 	}
361 
362 	switch (args.op) {
363 	case vsto_make_submap:
364 		vm_map_testing_make_sealed_submap(current_map(), args.start, args.end);
365 		break;
366 	case vsto_remap_submap:
367 		vm_map_testing_remap_submap(current_map(),
368 		    args.submap_base_address, args.start, args.end, args.offset);
369 		break;
370 	default:
371 		return EINVAL;
372 	}
373 
374 	return 0;
375 }
376 SYSCTL_PROC(_vm, OID_AUTO, submap_test_ctl, CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, &sysctl_vm_submap_test_ctl, "-", "");
377 
#if __arm64__
/* These are meant to support the page table accounting unit test. */
extern unsigned int arm_hardware_page_size;
extern unsigned int arm_pt_desc_size;
extern unsigned int arm_pt_root_size;
extern unsigned int inuse_user_tteroot_count;
extern unsigned int inuse_kernel_tteroot_count;
extern unsigned int inuse_user_ttepages_count;
extern unsigned int inuse_kernel_ttepages_count;
extern unsigned int inuse_user_ptepages_count;
extern unsigned int inuse_kernel_ptepages_count;
SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
#if !CONFIG_SPTM
/* Free-list counters only exist when page tables are managed by the pmap itself. */
extern unsigned int free_page_size_tt_count;
extern unsigned int free_tt_count;
SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
#endif
#if DEVELOPMENT || DEBUG
/* pmap ASID-related counters (see the arm64 pmap) — read-only instrumentation. */
extern unsigned long pmap_asid_flushes;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
extern unsigned long pmap_asid_hits;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
extern unsigned long pmap_asid_misses;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
extern unsigned long pmap_speculation_restrictions;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
#endif
#endif /* __arm64__ */
#endif /* DEVELOPMENT || DEBUG */
416 
/* Read-only exports of the vm_counters event counters (object collapse / UPL creation). */
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
#if VM_SCAN_FOR_SHADOW_CHAIN
static int vm_shadow_max_enabled = 0;    /* Disabled by default */
extern int proc_shadow_max(void);

/*
 * vm.vm_shadow_max (read-only): reports proc_shadow_max() — the scan is
 * only performed when vm.vm_shadow_max_enabled has been set, otherwise
 * 0 is reported.
 */
static int
vm_shadow_max SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int depth = 0;

	if (vm_shadow_max_enabled) {
		depth = proc_shadow_max();
	}

	return SYSCTL_OUT(req, &depth, sizeof(depth));
}
SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_shadow_max, "I", "");

SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");

#endif /* VM_SCAN_FOR_SHADOW_CHAIN */
447 
/* Exposes the vm_debug_events flag. */
SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");
449 
#if PAGE_SLEEP_WITH_INHERITOR
#if DEVELOPMENT || DEBUG
/* Instrumentation for the page-worker hash table used by inheritor-based page sleeps. */
extern uint32_t page_worker_table_size;
SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
#endif /* DEVELOPMENT || DEBUG */
#endif /* PAGE_SLEEP_WITH_INHERITOR */
460 
#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
/* Multiple compressor-head ("chead") configuration knobs and experiment factors. */
extern uint32_t vm_cheads;
extern vm_chead_select_t vm_chead_select;
extern boolean_t vm_chead_rehint;
#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_vm, OID_AUTO, compressor_heads, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cheads, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, compressor_head_select, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_select, 0, "");
SYSCTL_INT(_vm, OID_AUTO, compressor_head_rehint, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_rehint, 0, "");
#endif /* DEVELOPMENT || DEBUG */
EXPERIMENT_FACTOR_UINT(compressor_heads, &vm_cheads, 1, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT, "");
EXPERIMENT_FACTOR_UINT(compressor_head_select, &vm_chead_select, CSEL_MIN, CSEL_MAX, "");
EXPERIMENT_FACTOR_INT(compressor_head_rehint, &vm_chead_rehint, 0, 1, "");
#endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */
474 
/*
 * Sysctls related to data/stack execution.  See osfmk/vm/vm_map.c
 */

#if DEVELOPMENT || DEBUG
extern int allow_stack_exec, allow_data_exec;

SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");

#endif /* DEVELOPMENT || DEBUG */
486 
/* Human-readable names for protection combinations, indexed by (prot & VM_PROT_ALL). */
static const char *prot_values[] = {
	"none",
	"read-only",
	"write-only",
	"read-write",
	"execute-only",
	"read-execute",
	"write-execute",
	"read-write-execute"
};

/*
 * Log a denied attempt to execute from a data or stack page, naming the
 * current process, the faulting virtual address and the page's
 * protections.
 */
void
log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
{
	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
}
504 
/*
 * shared_region_unnest_logging: level of logging of unnesting events
 * 0	- no logging
 * 1	- throttled logging of unexpected unnesting events (default)
 * 2	- unthrottled logging of unexpected unnesting events
 * 3+	- unthrottled logging of all unnesting events
 */
int shared_region_unnest_logging = 1;

SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_unnest_logging, 0, "");

/* Throttle parameters used by log_unnest_badness(). */
int vm_shared_region_unnest_log_interval = 10;          /* seconds per logging window */
int shared_region_unnest_log_count_threshold = 5;       /* max messages per window */
519 
520 
#if XNU_TARGET_OS_OSX

#if defined (__x86_64__)
static int scdir_enforce = 1;
#else /* defined (__x86_64__) */
static int scdir_enforce = 0;   /* AOT caches live elsewhere */
#endif /* defined (__x86_64__) */

/* Directories from which the dyld shared cache may be mapped when enforcement is on. */
static char *scdir_path[] = {
	"/System/Library/dyld/",
	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
	"/System/Cryptexes/OS/System/Library/dyld",
	NULL
};

#else /* XNU_TARGET_OS_OSX */

static int scdir_enforce = 0;
static char *scdir_path[] = {
	"/System/Library/Caches/com.apple.dyld/",
	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	NULL
};

#endif /* XNU_TARGET_OS_OSX */

/* Same restriction, for the DriverKit shared cache. */
static char *driverkit_scdir_path[] = {
	"/System/DriverKit/System/Library/dyld/",
#if XNU_TARGET_OS_OSX
	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#else
	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#endif /* XNU_TARGET_OS_OSX */
	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
	NULL
};
558 
#ifndef SECURE_KERNEL
/*
 * vm.enforce_shared_cache_dir: gated behind CSR — unless the system
 * allows unrestricted filesystem access, the setting cannot be changed.
 * NOTE(review): the csr_check() also runs for reads, so reads are
 * rejected too when CSR denies; confirm that this is intended.
 */
static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
{
#if CONFIG_CSR
	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
		return EPERM;
	}
#endif /* CONFIG_CSR */
	return sysctl_handle_int(oidp, arg1, arg2, req);
}

SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
#endif
573 
/* These log rate throttling state variables aren't thread safe, but
 * are sufficient unto the task.
 */
static int64_t last_unnest_log_time = 0;        /* start of the current logging window (seconds) */
static int shared_region_unnest_log_count = 0;  /* messages emitted in the current window */
579 
/*
 * Log (with optional rate throttling) an unnesting of part of the DYLD
 * shared region in a task's VM map.  Behavior is controlled by
 * shared_region_unnest_logging:
 *   0  - no logging
 *   1  - throttled logging of unexpected unnesting only (default)
 *   2  - unthrottled logging of unexpected unnesting only
 *   3+ - unthrottled logging of all unnesting events
 * Also fires the log_unnest_badness DTrace probe.
 */
void
log_unnest_badness(
	vm_map_t        m,
	vm_map_offset_t s,
	vm_map_offset_t e,
	boolean_t       is_nested_map,
	vm_map_offset_t lowest_unnestable_addr)
{
	struct timeval  tv;

	if (shared_region_unnest_logging == 0) {
		return;
	}

	if (shared_region_unnest_logging <= 2 &&
	    is_nested_map &&
	    s >= lowest_unnestable_addr) {
		/*
		 * Unnesting of writable map entries is fine.
		 */
		return;
	}

	if (shared_region_unnest_logging <= 1) {
		/*
		 * Throttle: at most shared_region_unnest_log_count_threshold
		 * messages per vm_shared_region_unnest_log_interval seconds.
		 * (The state updates below are knowingly racy; see the comment
		 * on the state variables above.)
		 */
		microtime(&tv);
		if ((tv.tv_sec - last_unnest_log_time) <
		    vm_shared_region_unnest_log_interval) {
			if (shared_region_unnest_log_count++ >
			    shared_region_unnest_log_count_threshold) {
				return;
			}
		} else {
			/* New window: reset the counter. */
			last_unnest_log_time = tv.tv_sec;
			shared_region_unnest_log_count = 0;
		}
	}

	DTRACE_VM4(log_unnest_badness,
	    vm_map_t, m,
	    vm_map_offset_t, s,
	    vm_map_offset_t, e,
	    vm_map_offset_t, lowest_unnestable_addr);
	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
}
624 
625 uint64_t
vm_purge_filebacked_pagers(void)626 vm_purge_filebacked_pagers(void)
627 {
628 	uint64_t pages_purged;
629 
630 	pages_purged = 0;
631 	pages_purged += apple_protect_pager_purge_all();
632 	pages_purged += shared_region_pager_purge_all();
633 	pages_purged += dyld_pager_purge_all();
634 #if DEVELOPMENT || DEBUG
635 	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
636 #endif /* DEVELOPMENT || DEBUG */
637 	return pages_purged;
638 }
639 
640 int
useracc(user_addr_ut addr_u,user_size_ut len_u,int prot)641 useracc(
642 	user_addr_ut    addr_u,
643 	user_size_ut    len_u,
644 	int             prot)
645 {
646 	vm_map_t        map;
647 	vm_prot_t       vm_prot = VM_PROT_WRITE;
648 
649 	map = current_map();
650 
651 	if (prot == B_READ) {
652 		vm_prot = VM_PROT_READ;
653 	}
654 
655 	return vm_map_check_protection(map, addr_u,
656 	           vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
657 	           VM_SANITIZE_CALLER_USERACC);
658 }
659 
#if XNU_PLATFORM_MacOSX
/*
 * Sanitize a user-supplied (addr, len) pair for vslock()/vsunlock(),
 * producing the validated start/end/size.  Zero-length ranges succeed
 * (VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS); other failures are reported
 * via the sanitizer's kern_return_t encoding.
 */
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vslock_sanitize(
	vm_map_t                map,
	user_addr_ut            addr_u,
	user_size_ut            len_u,
	vm_sanitize_caller_t    vm_sanitize_caller,
	vm_map_offset_t        *start,
	vm_map_offset_t        *end,
	vm_map_size_t          *size)
{
	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
	           map,
	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
	           size);
}
#endif /* XNU_PLATFORM_MacOSX */
678 
/*
 * vslock: wire the given range of the current task's address space so
 * its pages stay resident (read/write) until a matching vsunlock().
 *
 * Returns a BSD errno: 0 on success, ENOMEM for invalid/unmapped
 * addresses, EACCES for protection failures, EINVAL otherwise.
 */
int
vslock(user_addr_ut addr, user_size_ut len)
{
	kern_return_t kret;

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows due to bin
	 * compatibility i.e. return success for overflows without doing
	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
	 * for overflow errors which gets converted to KERN_SUCCESS by
	 * vm_sanitize_get_kr.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(current_map(),
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		/* Map the (possibly compat-rewritten) kern_return_t to an errno. */
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_wire_kernel(current_map(), addr,
	    vm_sanitize_compute_ut_end(addr, len),
	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
	    VM_KERN_MEMORY_BSD,
	    FALSE);

	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}
735 
/*
 * vsunlock: unwire a range previously wired with vslock().  The
 * 'dirtied' argument is currently unused (see the FIXME block, which is
 * compiled out); historically it marked the pages modified.
 *
 * Returns a BSD errno: 0 on success, ENOMEM for invalid/unmapped
 * addresses, EACCES for protection failures, EINVAL otherwise.
 */
int
vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
{
#if FIXME  /* [ */
	pmap_t          pmap;
	vm_page_t       pg;
	vm_map_offset_t vaddr;
	ppnum_t         paddr;
#endif  /* FIXME ] */
	kern_return_t   kret;
	vm_map_t        map;

	map = current_map();

#if FIXME  /* [ */
	if (dirtied) {
		pmap = get_task_pmap(current_task());
		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
		    vaddr += PAGE_SIZE) {
			paddr = pmap_find_phys(pmap, vaddr);
			pg = PHYS_TO_VM_PAGE(paddr);
			vm_page_set_modified(pg);
		}
	}
#endif  /* FIXME ] */
#ifdef  lint
	dirtied++;
#endif  /* lint */

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows due to bin
	 * compatibility i.e. return success for overflows without doing
	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
	 * for overflow errors which gets converted to KERN_SUCCESS by
	 * vm_sanitize_get_kr.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t   size;

	kret = vslock_sanitize(map,
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSUNLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		/* Map the (possibly compat-rewritten) kern_return_t to an errno. */
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_unwire(map, addr,
	    vm_sanitize_compute_ut_end(addr, len), false);
	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}
813 
814 int
subyte(user_addr_t addr,int byte)815 subyte(
816 	user_addr_t addr,
817 	int byte)
818 {
819 	char character;
820 
821 	character = (char)byte;
822 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
823 }
824 
825 int
suibyte(user_addr_t addr,int byte)826 suibyte(
827 	user_addr_t addr,
828 	int byte)
829 {
830 	char character;
831 
832 	character = (char)byte;
833 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
834 }
835 
836 int
fubyte(user_addr_t addr)837 fubyte(user_addr_t addr)
838 {
839 	unsigned char byte;
840 
841 	if (copyin(addr, (void *) &byte, sizeof(char))) {
842 		return -1;
843 	}
844 	return byte;
845 }
846 
847 int
fuibyte(user_addr_t addr)848 fuibyte(user_addr_t addr)
849 {
850 	unsigned char byte;
851 
852 	if (copyin(addr, (void *) &(byte), sizeof(char))) {
853 		return -1;
854 	}
855 	return byte;
856 }
857 
858 int
suword(user_addr_t addr,long word)859 suword(
860 	user_addr_t addr,
861 	long word)
862 {
863 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
864 }
865 
866 long
fuword(user_addr_t addr)867 fuword(user_addr_t addr)
868 {
869 	long word = 0;
870 
871 	if (copyin(addr, (void *) &word, sizeof(int))) {
872 		return -1;
873 	}
874 	return word;
875 }
876 
877 /* suiword and fuiword are the same as suword and fuword, respectively */
878 
879 int
suiword(user_addr_t addr,long word)880 suiword(
881 	user_addr_t addr,
882 	long word)
883 {
884 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
885 }
886 
887 long
fuiword(user_addr_t addr)888 fuiword(user_addr_t addr)
889 {
890 	long word = 0;
891 
892 	if (copyin(addr, (void *) &word, sizeof(int))) {
893 		return -1;
894 	}
895 	return word;
896 }
897 
898 /*
899  * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
900  * fetching and setting of process-sized size_t and pointer values.
901  */
902 int
sulong(user_addr_t addr,int64_t word)903 sulong(user_addr_t addr, int64_t word)
904 {
905 	if (IS_64BIT_PROCESS(current_proc())) {
906 		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
907 	} else {
908 		return suiword(addr, (long)word);
909 	}
910 }
911 
/*
 * fulong: fetch a process-pointer-sized signed value from "addr".
 *
 * For a 64-bit process a full 64-bit copyin is performed; for a 32-bit
 * process the 32-bit fuiword() result is sign-extended to 64 bits.
 * Returns -1 on failure; -1 is also a valid fetched value, so the
 * error indication is ambiguous (historical interface).
 */
int64_t
fulong(user_addr_t addr)
{
	int64_t longword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
			return -1;
		}
		return longword;
	} else {
		return (int64_t)fuiword(addr);
	}
}
926 
/*
 * suulong: store a process-pointer-sized unsigned value at "addr".
 *
 * For a 64-bit process the full 64-bit value is copied out; for a
 * 32-bit process the value is truncated to 32 bits and stored via
 * suiword().  Returns 0 on success, -1 on failure.
 */
int
suulong(user_addr_t addr, uint64_t uword)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (uint32_t)uword);
	}
}
936 
/*
 * fuulong: fetch a process-pointer-sized unsigned value from "addr".
 *
 * For a 64-bit process a full 64-bit copyin is performed; for a 32-bit
 * process the 32-bit fuiword() result is widened.  Returns -1ULL on
 * failure; as with fulong(), that sentinel is also a valid value.
 */
uint64_t
fuulong(user_addr_t addr)
{
	uint64_t ulongword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
			return -1ULL;
		}
		return ulongword;
	} else {
		return (uint64_t)fuiword(addr);
	}
}
951 
/*
 * swapon: stub system call.
 *
 * Always fails with ENOTSUP; swap space cannot be configured through
 * this legacy interface on this kernel.
 */
int
swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
{
	return ENOTSUP;
}
957 
/*
 * Backing store for the read-only kern.secure_kernel sysctl:
 * 1 when the kernel was built with SECURE_KERNEL, 0 otherwise.
 */
#if defined(SECURE_KERNEL)
static int kern_secure_kernel = 1;
#else
static int kern_secure_kernel = 0;
#endif
963 
/* kern.secure_kernel: read-only build-configuration flag (see above). */
SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
/* vm.shared_region_trace_level: tunable verbosity of SHARED_REGION_TRACE_* logging. */
SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_trace_level, 0, "");
/* vm.shared_region_version: read-only shared region format version. */
SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
    &shared_region_version, 0, "");
/* vm.shared_region_persistence: tunable persistence behavior of shared regions. */
SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_persistence, 0, "");
971 
972 /*
973  * shared_region_check_np:
974  *
975  * This system call is intended for dyld.
976  *
977  * dyld calls this when any process starts to see if the process's shared
978  * region is already set up and ready to use.
979  * This call returns the base address of the first mapping in the
980  * process's shared region's first mapping.
981  * dyld will then check what's mapped at that address.
982  *
983  * If the shared region is empty, dyld will then attempt to map the shared
984  * cache file in the shared region via the shared_region_map_and_slide_2_np()
985  * system call.
986  *
987  * If something's already mapped in the shared region, dyld will check if it
988  * matches the shared cache it would like to use for that process.
 * If it matches, everything's ready and the process can proceed and use the
990  * shared region.
991  * If it doesn't match, dyld will unmap the shared region and map the shared
992  * cache into the process's address space via mmap().
993  *
994  * A NULL pointer argument can be used by dyld to indicate it has unmapped
995  * the shared region. We will remove the shared_region reference from the task.
996  *
997  * ERROR VALUES
998  * EINVAL	no shared region
999  * ENOMEM	shared region is empty
1000  * EFAULT	bad address for "start_address"
1001  */
/*
 * (Contract described in the block comment above.)
 *
 * Returns 0 on success; EINVAL if the task has no shared region;
 * ENOMEM if its start address or task update fails; or the copyout()
 * errno if "start_address" is a bad user pointer.
 *
 * NOTE(review): "p" is marked __unused but is in fact dereferenced
 * (proc_getpid(p), p->p_comm, p->p_disallow_map_with_linking) — the
 * attribute is tolerated but misleading; confirm before removing it.
 */
int
shared_region_check_np(
	__unused struct proc                    *p,
	struct shared_region_check_np_args      *uap,
	__unused int                            *retvalp)
{
	vm_shared_region_t      shared_region;
	mach_vm_offset_t        start_address = 0;
	int                     error = 0;
	kern_return_t           kr = KERN_FAILURE;
	task_t                  task = current_task();

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address));

	/*
	 * Special value of start_address used to indicate that map_with_linking() should
	 * no longer be allowed in this process.  The sentinel is truncated to
	 * 32 bits when the task is not 64-bit so the comparison matches what a
	 * 32-bit dyld can actually pass in.
	 */
	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
		p->p_disallow_map_with_linking = TRUE;
		return 0;
	}

	/* retrieve the current task's shared region */
	shared_region = vm_shared_region_get(task);
	if (shared_region != NULL) {
		/*
		 * A NULL argument is used by dyld to indicate the task
		 * has unmapped its shared region.
		 */
		if (uap->start_address == 0) {
			/* unmap it first */
			vm_shared_region_remove(task, shared_region);
			vm_shared_region_set(task, NULL);
		} else {
			/* retrieve address of its first mapping... */
			kr = vm_shared_region_start_address(shared_region, &start_address);
			if (kr != KERN_SUCCESS) {
				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
				    "check_np(0x%llx) "
				    "vm_shared_region_start_address() returned 0x%x\n",
				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
				    proc_getpid(p), p->p_comm,
				    (uint64_t)uap->start_address, kr));
				error = ENOMEM;
			}
			if (error == 0) {
				/* Insert the shared region submap and various bits of debug info into the task. */
				kr = vm_shared_region_update_task(task, shared_region, start_address);
				if (kr != KERN_SUCCESS) {
					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
					    "check_np(0x%llx) "
					    "vm_shared_update_task() returned 0x%x\n",
					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
					    proc_getpid(p), p->p_comm,
					    (uint64_t)uap->start_address, kr));

					error = ENOMEM;
				}
			}
#if __has_feature(ptrauth_calls)
			/*
			 * Remap any section of the shared library that
			 * has authenticated pointers into private memory.
			 */
			if ((error == 0) && (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS)) {
				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
				    "check_np(0x%llx) "
				    "vm_shared_region_auth_remap() failed\n",
				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
				    proc_getpid(p), p->p_comm,
				    (uint64_t)uap->start_address));
				error = ENOMEM;
			}
#endif /* __has_feature(ptrauth_calls) */
			/* Give the start address to the caller */
			if (error == 0) {
				error = copyout(&start_address,
				    (user_addr_t) uap->start_address,
				    sizeof(start_address));
				if (error != 0) {
					SHARED_REGION_TRACE_ERROR(
						("shared_region: %p [%d(%s)] "
						"check_np(0x%llx) "
						"copyout(0x%llx) error %d\n",
						(void *)VM_KERNEL_ADDRPERM(current_thread()),
						proc_getpid(p), p->p_comm,
						(uint64_t)uap->start_address, (uint64_t)start_address,
						error));
				}
			}
		}
		/* drop the reference taken by vm_shared_region_get() */
		vm_shared_region_deallocate(shared_region);
	} else {
		/* no shared region ! */
		error = EINVAL;
	}

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address, (uint64_t)start_address, error));

	return error;
}
1112 
1113 
1114 static int
shared_region_copyin(struct proc * p,user_addr_t user_addr,unsigned int count,unsigned int element_size,void * kernel_data)1115 shared_region_copyin(
1116 	struct proc  *p,
1117 	user_addr_t  user_addr,
1118 	unsigned int count,
1119 	unsigned int element_size,
1120 	void         *kernel_data)
1121 {
1122 	int             error = 0;
1123 	vm_size_t       size = count * element_size;
1124 
1125 	error = copyin(user_addr, kernel_data, size);
1126 	if (error) {
1127 		SHARED_REGION_TRACE_ERROR(
1128 			("shared_region: %p [%d(%s)] map(): "
1129 			"copyin(0x%llx, %ld) failed (error=%d)\n",
1130 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1131 			proc_getpid(p), p->p_comm,
1132 			(uint64_t)user_addr, (long)size, error));
1133 	}
1134 	return error;
1135 }
1136 
1137 /*
1138  * A reasonable upper limit to prevent overflow of allocation/copyin.
1139  */
1140 #define _SR_FILE_MAPPINGS_MAX_FILES 256
1141 
1142 /* forward declaration */
1143 __attribute__((noinline))
1144 static void shared_region_map_and_slide_cleanup(
1145 	struct proc              *p,
1146 	uint32_t                 files_count,
1147 	struct _sr_file_mappings *sr_file_mappings,
1148 	struct vm_shared_region  *shared_region);
1149 
1150 /*
1151  * Setup part of _shared_region_map_and_slide().
1152  * It had to be broken out of _shared_region_map_and_slide() to
1153  * prevent compiler inlining from blowing out the stack.
1154  */
/*
 * Parameters:
 *   p                 - calling process
 *   files_count/files - per-file fd, slide and mapping-count descriptors
 *   mappings_count/mappings - flat array of all mappings, partitioned
 *                       among the files by sf_mappings_count
 *   sr_file_mappings  - out: kalloc'ed per-file mapping state
 *   shared_region_ptr - out: referenced shared region of the task
 *   rdir_vp           - caller-held reference to the process root dir
 *
 * Returns 0 on success.  On any error, everything acquired so far is
 * released via shared_region_map_and_slide_cleanup() and both output
 * pointers are reset to NULL.
 */
__attribute__((noinline))
static int
shared_region_map_and_slide_setup(
	struct proc                         *p,
	uint32_t                            files_count,
	struct shared_file_np               *files,
	uint32_t                            mappings_count,
	struct shared_file_mapping_slide_np *mappings,
	struct _sr_file_mappings            **sr_file_mappings,
	struct vm_shared_region             **shared_region_ptr,
	struct vnode                        *rdir_vp)
{
	int                             error = 0;
	struct _sr_file_mappings        *srfmp;
	uint32_t                        mappings_next;
	struct vnode_attr               va;
	off_t                           fs;
#if CONFIG_MACF
	vm_prot_t                       maxprot = VM_PROT_ALL;
#endif
	uint32_t                        i;
	struct vm_shared_region         *shared_region = NULL;
	boolean_t                       is_driverkit = task_is_driver(current_task());

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> map_and_slide_setup\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm));

	/* bound files_count so the kalloc_type() below can't be abused */
	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
		error = E2BIG;
		goto done;
	}
	if (files_count == 0) {
		error = EINVAL;
		goto done;
	}
	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
	    Z_WAITOK | Z_ZERO);
	if (*sr_file_mappings == NULL) {
		error = ENOMEM;
		goto done;
	}
	/*
	 * Partition the flat "mappings" array among the files.
	 * NOTE(review): this relies on callers bounding mappings_count and the
	 * per-file sf_mappings_count; a 32-bit wrap of mappings_next would
	 * defeat the range check below — confirm callers sanitize these.
	 */
	mappings_next = 0;
	for (i = 0; i < files_count; i++) {
		srfmp = &(*sr_file_mappings)[i];
		srfmp->fd = files[i].sf_fd;
		srfmp->mappings_count = files[i].sf_mappings_count;
		srfmp->mappings = &mappings[mappings_next];
		mappings_next += srfmp->mappings_count;
		if (mappings_next > mappings_count) {
			/* per-file counts claim more mappings than were passed in */
			error = EINVAL;
			goto done;
		}
		srfmp->slide = files[i].sf_slide;
	}

	/* get the process's shared region (setup in vm_map_exec()) */
	shared_region = vm_shared_region_get(current_task());
	*shared_region_ptr = shared_region;
	if (shared_region == NULL) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"no shared region\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		error = EINVAL;
		goto done;
	}

	/*
	 * Check the shared region matches the current root
	 * directory of this process.  Deny the mapping to
	 * avoid tainting the shared region with something that
	 * doesn't quite belong into it.
	 */
	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
	if (sr_vnode != NULL ?  rdir_vp != sr_vnode : rdir_vp != rootvnode) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: map(%p) root_dir mismatch\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread())));
		error = EPERM;
		goto done;
	}


	/* validate each file and its mappings in turn */
	for (srfmp = &(*sr_file_mappings)[0];
	    srfmp < &(*sr_file_mappings)[files_count];
	    srfmp++) {
		if (srfmp->mappings_count == 0) {
			/* no mappings here... */
			continue;
		}

		/*
		 * A file descriptor of -1 is used to indicate that the data
		 * to be put in the shared region for this mapping comes directly
		 * from the processes address space. Ensure we have proper alignments.
		 */
		if (srfmp->fd == -1) {
			/* only allow one mapping per fd */
			if (srfmp->mappings_count > 1) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data >1 mapping\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm));
				error = EINVAL;
				goto done;
			}

			/*
			 * The destination address and size must be page aligned.
			 */
			struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
			mach_vm_address_t dest_addr = mapping->sms_address;
			mach_vm_size_t    map_size = mapping->sms_size;
			if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm, dest_addr));
				error = EINVAL;
				goto done;
			}
			if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm, map_size));
				error = EINVAL;
				goto done;
			}
			continue;
		}

		/* get file structure from file descriptor */
		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d lookup failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd, error));
			goto done;
		}

		/* we need at least read permission on the file */
		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d not readable\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd));
			error = EPERM;
			goto done;
		}

		/* get vnode from file structure */
		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map: "
				"fd=%d getwithref failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm, srfmp->fd, error));
			goto done;
		}
		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);

		/* make sure the vnode is a regular file */
		if (srfmp->vp->v_type != VREG) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"not a file (type=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name, srfmp->vp->v_type));
			error = EINVAL;
			goto done;
		}

#if CONFIG_MACF
		/* pass in 0 for the offset argument because AMFI does not need the offset
		 *       of the shared cache */
		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
		if (error) {
			goto done;
		}
#endif /* MAC */

#if XNU_TARGET_OS_OSX && defined(__arm64__)
		/*
		 * Check if the shared cache is in the trust cache;
		 * if so, we can skip the root ownership check.
		 */
#if DEVELOPMENT || DEBUG
		/*
		 * Skip both root ownership and trust cache check if
		 * enforcement is disabled.
		 */
		if (!cs_system_enforcement()) {
			goto after_root_check;
		}
#endif /* DEVELOPMENT || DEBUG */
		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
		if (blob == NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"missing CS blob\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}
		const uint8_t *cdhash = csblob_get_cdhash(blob);
		if (cdhash == NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"missing cdhash\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}

		/* only first-party trust-cache types bypass the root check */
		bool in_trust_cache = false;
		TrustCacheQueryToken_t qt;
		if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
			TCType_t tc_type = kTCTypeInvalid;
			TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
			in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
			    (tc_type == kTCTypeCryptex1BootOS ||
			    tc_type == kTCTypeStatic ||
			    tc_type == kTCTypeEngineering));
		}
		if (!in_trust_cache) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"not in trust cache\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			goto root_check;
		}
		goto after_root_check;
root_check:
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */

		/* The shared cache file must be owned by root */
		VATTR_INIT(&va);
		VATTR_WANTED(&va, va_uid);
		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"vnode_getattr(%p) failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				error));
			goto done;
		}
		if (va.va_uid != 0) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"owned by uid=%d instead of 0\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name, va.va_uid));
			error = EPERM;
			goto done;
		}

#if XNU_TARGET_OS_OSX && defined(__arm64__)
after_root_check:
#endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */

#if CONFIG_CSR
		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
			VATTR_INIT(&va);
			VATTR_WANTED(&va, va_flags);
			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
			if (error) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"vnode_getattr(%p) failed (error=%d)\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					error));
				goto done;
			}

			if (!(va.va_flags & SF_RESTRICTED)) {
				/*
				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
				 * the shared cache file is NOT SIP-protected, so reject the
				 * mapping request
				 */
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'), "
					"vnode is not SIP-protected. \n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name));
				error = EPERM;
				goto done;
			}
		}
#else /* CONFIG_CSR */

		/*
		 * Devices without SIP/ROSP need to make sure that the shared cache
		 * is either on the root volume or in the preboot cryptex volume.
		 */
		assert(rdir_vp != NULL);
		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
			vnode_t preboot_vp = NULL;
#if XNU_TARGET_OS_OSX
#define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
#else
#define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
#endif
			error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
			if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"not on process' root volume nor preboot volume\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name));
				error = EPERM;
				if (preboot_vp) {
					(void)vnode_put(preboot_vp);
				}
				goto done;
			} else if (preboot_vp) {
				(void)vnode_put(preboot_vp);
			}
		}
#endif /* CONFIG_CSR */

		/* the file must live directly under one of the expected directories */
		if (scdir_enforce) {
			char **expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
			struct vnode *scdir_vp = NULL;
			for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
			    *expected_scdir_path != NULL;
			    expected_scdir_path++) {
				/* get vnode for expected_scdir_path */
				error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
				if (error) {
					SHARED_REGION_TRACE_ERROR(
						("shared_region: %p [%d(%s)]: "
						"vnode_lookup(%s) failed (error=%d)\n",
						(void *)VM_KERNEL_ADDRPERM(current_thread()),
						proc_getpid(p), p->p_comm,
						*expected_scdir_path, error));
					continue;
				}

				/* check if parent is scdir_vp */
				assert(scdir_vp != NULL);
				if (vnode_parent(srfmp->vp) == scdir_vp) {
					(void)vnode_put(scdir_vp);
					scdir_vp = NULL;
					goto scdir_ok;
				}
				(void)vnode_put(scdir_vp);
				scdir_vp = NULL;
			}
			/* nothing matches */
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"shared cache file not in expected directory\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			error = EPERM;
			goto done;
		}
scdir_ok:

		/* get vnode size */
		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
		if (error) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"vnode_size(%p) failed (error=%d)\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
			goto done;
		}
		srfmp->file_size = fs;

		/* get the file's memory object handle */
		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
			SHARED_REGION_TRACE_ERROR(
				("shared_region: %p [%d(%s)] map(%p:'%s'): "
				"no memory object\n",
				(void *)VM_KERNEL_ADDRPERM(current_thread()),
				proc_getpid(p), p->p_comm,
				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
				srfmp->vp->v_name));
			error = EINVAL;
			goto done;
		}

		/* check that the mappings are properly covered by code signatures */
		if (!cs_system_enforcement()) {
			/* code signing is not enforced: no need to check */
		} else {
			for (i = 0; i < srfmp->mappings_count; i++) {
				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
					/* zero-filled mapping: not backed by the file */
					continue;
				}
				if (ubc_cs_is_range_codesigned(srfmp->vp,
				    srfmp->mappings[i].sms_file_offset,
				    srfmp->mappings[i].sms_size)) {
					/* this mapping is fully covered by code signatures */
					continue;
				}
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(%p:'%s'): "
					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
					"is not code-signed\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm,
					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
					srfmp->vp->v_name,
					i, srfmp->mappings_count,
					srfmp->mappings[i].sms_address,
					srfmp->mappings[i].sms_size,
					srfmp->mappings[i].sms_file_offset,
					srfmp->mappings[i].sms_max_prot,
					srfmp->mappings[i].sms_init_prot));
				error = EINVAL;
				goto done;
			}
		}
	}
done:
	if (error != 0) {
		/* release everything acquired above and reset the out-params */
		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
		*sr_file_mappings = NULL;
		*shared_region_ptr = NULL;
	}
	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] map_and_slide_setup <- %d\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm, error));
	return error;
}
1625 
1626 /*
1627  * shared_region_map_np()
1628  *
1629  * This system call is intended for dyld.
1630  *
1631  * dyld uses this to map a shared cache file into a shared region.
1632  * This is usually done only the first time a shared cache is needed.
1633  * Subsequent processes will just use the populated shared region without
1634  * requiring any further setup.
1635  */
/*
 * See the block comment above: maps the given file(s)/mappings into the
 * process's shared region.  Returns 0 on success or an errno derived
 * from vm_shared_region_map_file()'s kern_return_t.
 */
static int
_shared_region_map_and_slide(
	struct proc                         *p,
	uint32_t                            files_count,
	struct shared_file_np               *files,
	uint32_t                            mappings_count,
	struct shared_file_mapping_slide_np *mappings)
{
	int                             error = 0;
	kern_return_t                   kr = KERN_SUCCESS;
	struct _sr_file_mappings        *sr_file_mappings = NULL;
	struct vnode                    *rdir_vp = NULL;
	struct vm_shared_region         *shared_region = NULL;

	/*
	 * Get a reference to the current proc's root dir.
	 * Need this to prevent racing with chroot.
	 */
	proc_fdlock(p);
	rdir_vp = p->p_fd.fd_rdir;
	if (rdir_vp == NULL) {
		/* not chroot'ed: use the system root */
		rdir_vp = rootvnode;
	}
	assert(rdir_vp != NULL);
	vnode_get(rdir_vp);
	proc_fdunlock(p);

	/*
	 * Turn files, mappings into sr_file_mappings and other setup.
	 * On failure the setup routine has already cleaned up after itself.
	 */
	error = shared_region_map_and_slide_setup(p, files_count,
	    files, mappings_count, mappings,
	    &sr_file_mappings, &shared_region, rdir_vp);
	if (error != 0) {
		vnode_put(rdir_vp);
		return error;
	}

	/* map the file(s) into that shared region's submap */
	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
	if (kr != KERN_SUCCESS) {
		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
		    "vm_shared_region_map_file() failed kr=0x%x\n",
		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
		    proc_getpid(p), p->p_comm, kr));
	}

	/* convert kern_return_t to errno */
	switch (kr) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_INVALID_ADDRESS:
		error = EFAULT;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EPERM;
		break;
	case KERN_NO_SPACE:
		error = ENOMEM;
		break;
	case KERN_FAILURE:
	case KERN_INVALID_ARGUMENT:
	default:
		error = EINVAL;
		break;
	}

	/*
	 * Mark that this process is now using split libraries.
	 */
	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
	}

	/* release the root-dir reference and all per-file state */
	vnode_put(rdir_vp);
	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] <- map\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm));

	return error;
}
1721 
1722 /*
1723  * Clean up part of _shared_region_map_and_slide()
1724  * It had to be broken out of _shared_region_map_and_slide() to
1725  * prevent compiler inlining from blowing out the stack.
1726  */
/*
 * Release everything acquired by shared_region_map_and_slide_setup():
 * per-file vnode and fileproc references, the sr_file_mappings array
 * itself, and the shared region reference.  Also marks the mapped
 * vnodes (and any resource-fork namedstream) with VSHARED_DYLD and
 * refreshes their access time.  Safe to call with NULL arguments.
 */
__attribute__((noinline))
static void
shared_region_map_and_slide_cleanup(
	struct proc              *p,
	uint32_t                 files_count,
	struct _sr_file_mappings *sr_file_mappings,
	struct vm_shared_region  *shared_region)
{
	struct _sr_file_mappings *srfmp;
	struct vnode_attr        va;

	if (sr_file_mappings != NULL) {
		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
			if (srfmp->vp != NULL) {
				vnode_lock_spin(srfmp->vp);
				srfmp->vp->v_flag |= VSHARED_DYLD;
				vnode_unlock(srfmp->vp);

				/* update the vnode's access time */
				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
					VATTR_INIT(&va);
					nanotime(&va.va_access_time);
					VATTR_SET_ACTIVE(&va, va_access_time);
					vnode_setattr(srfmp->vp, &va, vfs_context_current());
				}

#if NAMEDSTREAMS
				/*
				 * If the shared cache is compressed, it may
				 * have a namedstream vnode instantiated for
				 * for it. That namedstream vnode will also
				 * have to be marked with VSHARED_DYLD.
				 */
				if (vnode_hasnamedstreams(srfmp->vp)) {
					vnode_t svp;
					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
						vnode_lock_spin(svp);
						svp->v_flag |= VSHARED_DYLD;
						vnode_unlock(svp);
						vnode_put(svp);
					}
				}
#endif /* NAMEDSTREAMS */
				/*
				 * release the vnode...
				 * ubc_map() still holds it for us in the non-error case
				 */
				(void) vnode_put(srfmp->vp);
				srfmp->vp = NULL;
			}
			if (srfmp->fp != NULL) {
				/* release the file descriptor */
				fp_drop(p, srfmp->fd, srfmp->fp, 0);
				srfmp->fp = NULL;
			}
		}
		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
	}

	if (shared_region != NULL) {
		/* drop the reference taken by vm_shared_region_get() */
		vm_shared_region_deallocate(shared_region);
	}
}
1791 
/*
 * For each file mapped, we may have mappings for:
 *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
 * so let's round up to 8 mappings per file.
 */
#define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to pass in */

/*
 * This is the new interface for setting up shared region mappings.
 *
 * The slide used for shared regions setup using this interface is done differently
 * from the old interface. The slide value passed in the shared_files_np represents
 * a max value. The kernel will choose a random value based on that, then use it
 * for all shared regions.
 */
/* Granularity of the random slide: 4K pages on x86_64, 16K pages elsewhere. */
#if defined (__x86_64__)
#define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
#else
#define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
#endif
1812 
/*
 * Copy "count" shared_file_mapping_slide_np structures from userspace
 * (starting at mappings_userspace_addr) into the caller-allocated kernel
 * buffer "mappings", sanitizing every field on the way.
 *
 * Each element is first copied onto the kernel stack (so userspace cannot
 * change it after validation — TOCTOU), then run through the vm_sanitize_*
 * helpers, and only then stored into the heap buffer.
 *
 * Returns KERN_SUCCESS, or the raw kern_return_t of the first failing
 * copyin/sanitize step; the caller is expected to translate it with
 * vm_sanitize_get_kr() before returning to userspace.
 */
static inline __result_use_check kern_return_t
shared_region_map_and_slide_2_np_sanitize(
	struct proc                         *p,
	user_addr_t                         mappings_userspace_addr,
	unsigned int                        count,
	shared_file_mapping_slide_np_t      *mappings)
{
	kern_return_t kr;
	vm_map_t map = current_map();
	mach_vm_address_t addr, end;
	mach_vm_offset_t offset, offset_end;
	mach_vm_size_t size, offset_size;
	/*
	 * NOTE(review): declared user_addr_t while the other sanitized
	 * outputs use mach_vm_* types — presumably compatible widths with
	 * vm_sanitize_addr_size()'s out-parameters; confirm against its
	 * prototype.
	 */
	user_addr_t slide_start, slide_end, slide_size;
	vm_prot_t cur;
	vm_prot_t max;

	/* cursor into the userspace array; advanced (with overflow check) per element */
	user_addr_t user_addr = mappings_userspace_addr;

	for (size_t i = 0; i < count; i++) {
		shared_file_mapping_slide_np_ut mapping_u;
		/*
		 * First we bring each mapping struct into our kernel stack to
		 * avoid TOCTOU.
		 */
		kr = shared_region_copyin(
			p,
			user_addr,
			1, // copy 1 element at a time
			sizeof(shared_file_mapping_slide_np_ut),
			&mapping_u);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}

		/*
		 * Then, we sanitize the data on the kernel stack.
		 */
		/* target address + size: must be aligned to the map's page size */
		kr = vm_sanitize_addr_size(
			mapping_u.sms_address_u,
			mapping_u.sms_size_u,
			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
			map,
			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
			&addr,
			&end,
			&size);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}

		/*
		 * file offset: range-checked against PAGE_MASK here, then
		 * explicitly checked below against the map's (possibly
		 * larger) page mask.
		 */
		kr = vm_sanitize_addr_size(
			mapping_u.sms_file_offset_u,
			mapping_u.sms_size_u,
			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
			PAGE_MASK,
			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
			&offset,
			&offset_end,
			&offset_size);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}
		if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
			return KERN_INVALID_ARGUMENT;
		}

		/*
		 * Unsafe access is immediately followed by wrap to
		 * convert from addr to size.
		 */
		mach_vm_size_ut sms_slide_size_u =
		    vm_sanitize_wrap_size(
			VM_SANITIZE_UNSAFE_UNWRAP(
				mapping_u.sms_slide_size_u));

		/* slide info range: unaligned values are acceptable here */
		kr = vm_sanitize_addr_size(
			mapping_u.sms_slide_start_u,
			sms_slide_size_u,
			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
			map,
			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
			&slide_start,
			&slide_end,
			&slide_size);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}

		/* protections may include the SFM extension bits and TPRO */
		kr = vm_sanitize_cur_and_max_prots(
			mapping_u.sms_init_prot_u,
			mapping_u.sms_max_prot_u,
			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
			map,
			VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
			&cur,
			&max);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}

		/*
		 * Finally, we move the data from the kernel stack to our
		 * caller-allocated kernel heap buffer.
		 */
		mappings[i].sms_address = addr;
		mappings[i].sms_size = size;
		mappings[i].sms_file_offset = offset;
		mappings[i].sms_slide_size = slide_size;
		mappings[i].sms_slide_start = slide_start;
		mappings[i].sms_max_prot = max;
		mappings[i].sms_init_prot = cur;

		/* advance to the next userspace element, guarding against wrap */
		if (__improbable(os_add_overflow(
			    user_addr,
			    sizeof(shared_file_mapping_slide_np_ut),
			    &user_addr))) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	return KERN_SUCCESS;
}
1939 
/*
 * System call: establish the shared region mappings for this process.
 *
 * "2" variant of shared_region_map_and_slide_np().  The slide supplied by
 * the caller (only shared_files[0].sf_slide is consulted) is a *maximum*:
 * the kernel draws a random value below it, masks it with
 * SLIDE_AMOUNT_MASK, and applies that single slide to every file and
 * mapping before handing the fixed-up arrays to
 * _shared_region_map_and_slide().
 *
 * Returns 0 on success; otherwise a kern_return_t value is returned
 * directly as the syscall result.
 */
int
shared_region_map_and_slide_2_np(
	struct proc                                  *p,
	struct shared_region_map_and_slide_2_np_args *uap,
	__unused int                                 *retvalp)
{
	unsigned int                  files_count;
	struct shared_file_np         *shared_files = NULL;
	unsigned int                  mappings_count;
	struct shared_file_mapping_slide_np *mappings = NULL;
	kern_return_t                 kr = KERN_SUCCESS;

	files_count = uap->files_count;
	mappings_count = uap->mappings_count;

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> map_and_slide(0x%llx)\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->mappings_u));

	/*
	 * Validate counts and allocate the kernel-side arrays.  Both counts
	 * are bounded (_SR_FILE_MAPPINGS_MAX_FILES / SFM_MAX) before the
	 * size multiplication, so the kalloc size cannot overflow.
	 */
	if (files_count == 0) {
		SHARED_REGION_TRACE_INFO(
			("shared_region: %p [%d(%s)] map(): "
			"no files\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		kr = 0; /* no files to map: we're done ! */
		goto done;
	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
		if (shared_files == NULL) {
			kr = KERN_RESOURCE_SHORTAGE;
			goto done;
		}
	} else {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"too many files (%d) max %d\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm,
			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
		kr = KERN_FAILURE;
		goto done;
	}

	if (mappings_count == 0) {
		SHARED_REGION_TRACE_INFO(
			("shared_region: %p [%d(%s)] map(): "
			"no mappings\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		kr = 0; /* no mappings: we're done ! */
		goto done;
	} else if (mappings_count <= SFM_MAX) {
		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
		if (mappings == NULL) {
			kr = KERN_RESOURCE_SHORTAGE;
			goto done;
		}
	} else {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"too many mappings (%d) max %d\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm,
			mappings_count, SFM_MAX));
		kr = KERN_FAILURE;
		goto done;
	}

	/*
	 * struct shared_file_np does not have fields that are subject to
	 * sanitization, it is thus copied from userspace as is.
	 */
	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
	if (kr != KERN_SUCCESS) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] copyin() returned 0x%x\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm, kr));
		goto done;
	}

	/* copyin + sanitize the mapping array (TOCTOU-safe, element by element) */
	kr = shared_region_map_and_slide_2_np_sanitize(
		p,
		uap->mappings_u,
		mappings_count,
		mappings);
	if (__improbable(kr != KERN_SUCCESS)) {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] sanitize() returned 0x%x\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm, kr));
		kr = vm_sanitize_get_kr(kr);
		goto done;
	}

	/*
	 * Choose one random slide for all files, below the caller-supplied
	 * maximum and rounded to SLIDE_AMOUNT_MASK granularity.
	 */
	uint32_t max_slide = shared_files[0].sf_slide;
	uint32_t random_val;
	uint32_t slide_amount;

	if (max_slide != 0) {
		read_random(&random_val, sizeof random_val);
		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
	} else {
		slide_amount = 0;
	}
#if DEVELOPMENT || DEBUG
	extern bool bootarg_disable_aslr;
	if (bootarg_disable_aslr) {
		slide_amount = 0;
	}
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * Fix up the mappings to reflect the desired slide.
	 */
	unsigned int f;
	unsigned int m = 0; /* running index into mappings[] across all files */
	unsigned int i;
	for (f = 0; f < files_count; ++f) {
		shared_files[f].sf_slide = slide_amount;
		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
			if (m >= mappings_count) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(): "
					"mapping count argument was too small\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm));
				kr = KERN_FAILURE;
				goto done;
			}
			/* slide the target address, rejecting arithmetic wrap */
			if (__improbable(
				    os_add_overflow(
					    mappings[m].sms_address,
					    slide_amount,
					    &mappings[m].sms_address))) {
				kr = KERN_INVALID_ARGUMENT;
				goto done;
			}
			if (mappings[m].sms_slide_size != 0) {
				mach_vm_address_t discard;
				/* Slide and check that new start/size pairs do not overflow. */
				if (__improbable(
					    os_add_overflow(
						    mappings[m].sms_slide_start,
						    slide_amount,
						    &mappings[m].sms_slide_start) ||
					    os_add_overflow(
						    mappings[m].sms_slide_start,
						    mappings[m].sms_slide_size,
						    &discard))) {
					kr = KERN_INVALID_ARGUMENT;
					goto done;
				}
			}
		}
	}

	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
done:
	/* kfree_data() tolerates NULL pointers from the early-exit paths */
	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
	kfree_data(mappings, mappings_count * sizeof(mappings[0]));

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] map_and_slide(0x%llx) <- 0x%x\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->mappings_u, kr));

	return kr;
}
2113 
2114 /*
2115  * A syscall for dyld to use to map data pages that need load time relocation fixups.
2116  * The fixups are performed by a custom pager during page-in, so the pages still appear
2117  * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2118  * on demand later, all w/o using the compressor.
2119  *
2120  * Note these page are treated as MAP_PRIVATE. So if the application dirties any pages while
2121  * running, they are COW'd as normal.
2122  */
2123 int
map_with_linking_np(struct proc * p,struct map_with_linking_np_args * uap,__unused int * retvalp)2124 map_with_linking_np(
2125 	struct proc                     *p,
2126 	struct map_with_linking_np_args *uap,
2127 	__unused int                    *retvalp)
2128 {
2129 	uint32_t                        region_count;
2130 	uint32_t                        r;
2131 	struct mwl_region               *regions = NULL;
2132 	struct mwl_region               *rp;
2133 	uint32_t                        link_info_size;
2134 	void                            *link_info = NULL;      /* starts with a struct mwl_info_hdr */
2135 	struct mwl_info_hdr             *info_hdr = NULL;
2136 	uint64_t                        binds_size;
2137 	int                             fd;
2138 	struct fileproc                 *fp = NULL;
2139 	struct vnode                    *vp = NULL;
2140 	size_t                          file_size;
2141 	off_t                           fs;
2142 	struct vnode_attr               va;
2143 	memory_object_control_t         file_control = NULL;
2144 	int                             error;
2145 	kern_return_t                   kr = KERN_SUCCESS;
2146 
2147 	/*
2148 	 * Check if dyld has told us it finished with this call.
2149 	 */
2150 	if (p->p_disallow_map_with_linking) {
2151 		printf("%s: [%d(%s)]: map__with_linking() was disabled\n",
2152 		    __func__, proc_getpid(p), p->p_comm);
2153 		kr = KERN_FAILURE;
2154 		goto done;
2155 	}
2156 
2157 	/*
2158 	 * First we do some sanity checking on what dyld has passed us.
2159 	 */
2160 	region_count = uap->region_count;
2161 	link_info_size = uap->link_info_size;
2162 	if (region_count == 0) {
2163 		printf("%s: [%d(%s)]: region_count == 0\n",
2164 		    __func__, proc_getpid(p), p->p_comm);
2165 		kr = KERN_FAILURE;
2166 		goto done;
2167 	}
2168 	if (region_count > MWL_MAX_REGION_COUNT) {
2169 		printf("%s: [%d(%s)]: region_count too big %d\n",
2170 		    __func__, proc_getpid(p), p->p_comm, region_count);
2171 		kr = KERN_FAILURE;
2172 		goto done;
2173 	}
2174 
2175 	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2176 		printf("%s: [%d(%s)]: link_info_size too small\n",
2177 		    __func__, proc_getpid(p), p->p_comm);
2178 		kr = KERN_FAILURE;
2179 		goto done;
2180 	}
2181 	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2182 		printf("%s: [%d(%s)]: link_info_size too big %d\n",
2183 		    __func__, proc_getpid(p), p->p_comm, link_info_size);
2184 		kr = KERN_FAILURE;
2185 		goto done;
2186 	}
2187 
2188 	/*
2189 	 * Allocate and copyin the regions and link info
2190 	 */
2191 	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2192 	if (regions == NULL) {
2193 		printf("%s: [%d(%s)]: failed to allocate regions\n",
2194 		    __func__, proc_getpid(p), p->p_comm);
2195 		kr = KERN_RESOURCE_SHORTAGE;
2196 		goto done;
2197 	}
2198 	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2199 	if (kr != KERN_SUCCESS) {
2200 		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2201 		    __func__, proc_getpid(p), p->p_comm, kr);
2202 		goto done;
2203 	}
2204 
2205 	link_info = kalloc_data(link_info_size, Z_WAITOK);
2206 	if (link_info == NULL) {
2207 		printf("%s: [%d(%s)]: failed to allocate link_info\n",
2208 		    __func__, proc_getpid(p), p->p_comm);
2209 		kr = KERN_RESOURCE_SHORTAGE;
2210 		goto done;
2211 	}
2212 	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2213 	if (kr != KERN_SUCCESS) {
2214 		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2215 		    __func__, proc_getpid(p), p->p_comm, kr);
2216 		goto done;
2217 	}
2218 
2219 	/*
2220 	 * Do some verification the data structures.
2221 	 */
2222 	info_hdr = (struct mwl_info_hdr *)link_info;
2223 	if (info_hdr->mwli_version != MWL_INFO_VERS) {
2224 		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2225 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2226 		kr = KERN_FAILURE;
2227 		goto done;
2228 	}
2229 
2230 	if (info_hdr->mwli_binds_offset > link_info_size) {
2231 		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2232 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2233 		kr = KERN_FAILURE;
2234 		goto done;
2235 	}
2236 
2237 	/* some older devs have s/w page size > h/w page size, no need to support them */
2238 	if (info_hdr->mwli_page_size != PAGE_SIZE) {
2239 		/* no printf, since this is expected on some devices */
2240 		kr = KERN_INVALID_ARGUMENT;
2241 		goto done;
2242 	}
2243 
2244 	binds_size = (uint64_t)info_hdr->mwli_binds_count *
2245 	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2246 	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2247 		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2248 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2249 		kr = KERN_FAILURE;
2250 		goto done;
2251 	}
2252 
2253 	if (info_hdr->mwli_chains_offset > link_info_size) {
2254 		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2255 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2256 		kr = KERN_FAILURE;
2257 		goto done;
2258 	}
2259 
2260 
2261 	/*
2262 	 * Ensure the chained starts in the link info and make sure the
2263 	 * segment info offsets are within bounds.
2264 	 */
2265 	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2266 		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2267 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2268 		kr = KERN_FAILURE;
2269 		goto done;
2270 	}
2271 	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2272 		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2273 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2274 		kr = KERN_FAILURE;
2275 		goto done;
2276 	}
2277 
2278 	/* Note that more verification of offsets is done in the pager itself */
2279 
2280 	/*
2281 	 * Ensure we've only been given one FD and verify valid protections.
2282 	 */
2283 	fd = regions[0].mwlr_fd;
2284 	for (r = 0; r < region_count; ++r) {
2285 		if (regions[r].mwlr_fd != fd) {
2286 			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2287 			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2288 			kr = KERN_FAILURE;
2289 			goto done;
2290 		}
2291 
2292 		/*
2293 		 * Only allow data mappings and not zero fill. Permit TPRO
2294 		 * mappings only when VM_PROT_READ | VM_PROT_WRITE.
2295 		 */
2296 		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2297 			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2298 			    __func__, proc_getpid(p), p->p_comm);
2299 			kr = KERN_FAILURE;
2300 			goto done;
2301 		}
2302 		if (regions[r].mwlr_protections & VM_PROT_ZF) {
2303 			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2304 			    __func__, proc_getpid(p), p->p_comm, r);
2305 			kr = KERN_FAILURE;
2306 			goto done;
2307 		}
2308 		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2309 		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2310 			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2311 			    __func__, proc_getpid(p), p->p_comm, r);
2312 			kr = KERN_FAILURE;
2313 			goto done;
2314 		}
2315 	}
2316 
2317 
2318 	/* get file structure from file descriptor */
2319 	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2320 	if (error) {
2321 		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2322 		    __func__, proc_getpid(p), p->p_comm, error);
2323 		kr = KERN_FAILURE;
2324 		goto done;
2325 	}
2326 
2327 	/* We need at least read permission on the file */
2328 	if (!(fp->fp_glob->fg_flag & FREAD)) {
2329 		printf("%s: [%d(%s)]: not readable\n",
2330 		    __func__, proc_getpid(p), p->p_comm);
2331 		kr = KERN_FAILURE;
2332 		goto done;
2333 	}
2334 
2335 	/* Get the vnode from file structure */
2336 	vp = (struct vnode *)fp_get_data(fp);
2337 	error = vnode_getwithref(vp);
2338 	if (error) {
2339 		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2340 		    __func__, proc_getpid(p), p->p_comm, error);
2341 		kr = KERN_FAILURE;
2342 		vp = NULL; /* just to be sure */
2343 		goto done;
2344 	}
2345 
2346 	/* Make sure the vnode is a regular file */
2347 	if (vp->v_type != VREG) {
2348 		printf("%s: [%d(%s)]: vnode not VREG\n",
2349 		    __func__, proc_getpid(p), p->p_comm);
2350 		kr = KERN_FAILURE;
2351 		goto done;
2352 	}
2353 
2354 	/* get vnode size */
2355 	error = vnode_size(vp, &fs, vfs_context_current());
2356 	if (error) {
2357 		goto done;
2358 	}
2359 	file_size = fs;
2360 
2361 	/* get the file's memory object handle */
2362 	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2363 	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2364 		printf("%s: [%d(%s)]: no memory object\n",
2365 		    __func__, proc_getpid(p), p->p_comm);
2366 		kr = KERN_FAILURE;
2367 		goto done;
2368 	}
2369 
2370 	for (r = 0; r < region_count; ++r) {
2371 		rp = &regions[r];
2372 
2373 #if CONFIG_MACF
2374 		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2375 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2376 		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2377 		if (error) {
2378 			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2379 			    __func__, proc_getpid(p), p->p_comm, r, error);
2380 			kr = KERN_FAILURE;
2381 			goto done;
2382 		}
2383 #endif /* MAC */
2384 
2385 		/* check that the mappings are properly covered by code signatures */
2386 		if (cs_system_enforcement()) {
2387 			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2388 				printf("%s: [%d(%s)]: region %d, not code signed\n",
2389 				    __func__, proc_getpid(p), p->p_comm, r);
2390 				kr = KERN_FAILURE;
2391 				goto done;
2392 			}
2393 		}
2394 	}
2395 
2396 	/* update the vnode's access time */
2397 	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2398 		VATTR_INIT(&va);
2399 		nanotime(&va.va_access_time);
2400 		VATTR_SET_ACTIVE(&va, va_access_time);
2401 		vnode_setattr(vp, &va, vfs_context_current());
2402 	}
2403 
2404 	/* get the VM to do the work */
2405 	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2406 
2407 done:
2408 	if (fp != NULL) {
2409 		/* release the file descriptor */
2410 		fp_drop(p, fd, fp, 0);
2411 	}
2412 	if (vp != NULL) {
2413 		(void)vnode_put(vp);
2414 	}
2415 	if (regions != NULL) {
2416 		kfree_data(regions, region_count * sizeof(regions[0]));
2417 	}
2418 	/* link info is NULL if it is used in the pager, if things worked */
2419 	if (link_info != NULL) {
2420 		kfree_data(link_info, link_info_size);
2421 	}
2422 
2423 	switch (kr) {
2424 	case KERN_SUCCESS:
2425 		return 0;
2426 	case KERN_RESOURCE_SHORTAGE:
2427 		return ENOMEM;
2428 	default:
2429 		return EINVAL;
2430 	}
2431 }
2432 
#if DEBUG || DEVELOPMENT
/* read-only counters exported by the dyld pager code (current / high-water) */
SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
#endif /* DEBUG || DEVELOPMENT */
2439 
/* sysctl overflow room */

/* vm.pagesize: the software page size */
SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
    (int *) &page_size, 0, "vm page size");

/* vm_page_free_target is provided as a makeshift solution for applications that want to
 *       allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
 *       reclaimed. It allows the app to calculate how much memory is free outside the free target. */
extern unsigned int     vm_page_free_target;
SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_free_target, 0, "Pageout daemon free target");

SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2454 
2455 static int
2456 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2457 {
2458 #pragma unused(oidp, arg1, arg2)
2459 	unsigned int page_free_wanted;
2460 
2461 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2462 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2463 }
/* vm.page_free_wanted: computed on demand via the handler above it in the file */
SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, vm_ctl_page_free_wanted, "I", "");

extern unsigned int     vm_page_purgeable_count;
SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_purgeable_count, 0, "Purgeable page count");

extern unsigned int     vm_page_purgeable_wired_count;
SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");

extern unsigned int vm_page_kern_lpage_count;
SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_kern_lpage_count, 0, "kernel used large pages");

/* per-CPU scalable counters of page-grab activity, split by origin */
SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
SCALABLE_COUNTER_DECLARE(vm_page_grab_count_kern);
SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_kern, vm_page_grab_count_kern, "Total pages grabbed (kernel)");
SCALABLE_COUNTER_DECLARE(vm_page_grab_count_iopl);
SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_iopl, vm_page_grab_count_iopl, "Total pages grabbed (iopl)");
SCALABLE_COUNTER_DECLARE(vm_page_grab_count_upl);
SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_upl, vm_page_grab_count_upl, "Total pages grabbed (upl)");


#if DEVELOPMENT || DEBUG
SCALABLE_COUNTER_DECLARE(vm_page_deactivate_behind_count);
SYSCTL_SCALABLE_COUNTER(_vm, pages_deactivated_behind, vm_page_deactivate_behind_count,
    "Number of pages deactivated behind");
#endif
2495 
#if DEVELOPMENT || DEBUG
/* whether this kernel build supports VM maps with mixed page sizes */
#if __ARM_MIXED_PAGE_SIZE__
static int vm_mixed_pagesize_supported = 1;
#else
static int vm_mixed_pagesize_supported = 0;
#endif /*__ARM_MIXED_PAGE_SIZE__ */
SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");

SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");

/* pageout-daemon debug counters (vm_pageout_debug / vm_pageout_vminfo) */
SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");

SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated */
SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");         /* sum of next two */
#endif /* DEVELOPMENT || DEBUG */
2528 
/* debug knobs: force (or sometimes force) zero-fill on MADV_FREE* */
extern int madvise_free_debug;
SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
extern int madvise_free_debug_sometimes;
SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
    &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");

/* statistics for the reusable/reuse page machinery (vm_page_stats_reusable) */
SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_shared, "");
SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.all_reusable_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.partial_reusable_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reuse_pages_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reuse_pages_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.all_reuse_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.partial_reuse_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.can_reuse_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.can_reuse_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_reclaimed, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_nonwritable, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_shared, "");
SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.free_shared, "");
2568 
2569 
/* basic page-queue sizes */
extern unsigned int vm_page_free_count, vm_page_speculative_count;
SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");

extern unsigned int vm_page_cleaned_count;
SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");

extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");

/* pageout counts */
SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");

SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
/* NOTE: pageout_speculative_clean and pageout_freed_speculative both export vm_pageout_freed_speculative */
SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");

/* shared-cache / realtime page protection statistics */
SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
extern unsigned int vm_page_realtime_count;
SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
extern int vm_pageout_protect_realtime;
SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");

/* counts of pages prefaulted when entering a memory object */
extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
extern int64_t vm_prefault_nb_no_page, vm_prefault_nb_wrong_page;
SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_no_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_no_page, "");
2607 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_wrong_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_wrong_page, "");
2608 
#if defined (__x86_64__)
/*
 * x86_64 free-page "clump" allocator: vm_clump_promote_threshold controls
 * when a clump is promoted to the head of the free list; the per-size
 * vm_clump_stats histogram (indices 1..16, DEVELOPMENT/DEBUG only) records
 * which clump sizes allocations were satisfied from.
 */
extern unsigned int vm_clump_promote_threshold;
SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
#if DEVELOPMENT || DEBUG
extern unsigned long vm_clump_stats[];
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
#endif  /* if DEVELOPMENT || DEBUG */
#endif  /* #if defined (__x86_64__) */
2637 
#if CONFIG_SECLUDED_MEMORY

/*
 * Secluded-memory pool statistics: target/current sizes of the secluded
 * queue, plus grab success/failure breakdowns from vm_page_secluded.
 */
SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
extern unsigned int vm_page_secluded_target;
extern unsigned int vm_page_secluded_count;
extern unsigned int vm_page_secluded_count_free;
extern unsigned int vm_page_secluded_count_inuse;
extern unsigned int vm_page_secluded_count_over_target;
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");

extern struct vm_page_secluded_data vm_page_secluded;
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");

#endif /* CONFIG_SECLUDED_MEMORY */
2664 
#if CONFIG_DEFERRED_RECLAIM
#pragma mark Deferred Reclaim
/* Parent node for all vm.reclaim.* sysctls. */
SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
#if DEVELOPMENT || DEBUG
/*
 * VM reclaim testing
 */
/* Blocks the caller until the given task's reclaim ring has been drained by kernel GC. */
extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);
2673 
2674 static int
2675 sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
2676 {
2677 	int error = EINVAL, pid = 0;
2678 	/*
2679 	 * Only send on write
2680 	 */
2681 	error = sysctl_handle_int(oidp, &pid, 0, req);
2682 	if (error || !req->newptr) {
2683 		return error;
2684 	}
2685 	if (pid <= 0) {
2686 		return EINVAL;
2687 	}
2688 	proc_t p = proc_find(pid);
2689 	if (p == PROC_NULL) {
2690 		return ESRCH;
2691 	}
2692 	task_t t = proc_task(p);
2693 	if (t == TASK_NULL) {
2694 		proc_rele(p);
2695 		return ESRCH;
2696 	}
2697 	task_reference(t);
2698 	proc_rele(p);
2699 
2700 	bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
2701 	if (success) {
2702 		error = 0;
2703 	}
2704 	task_deallocate(t);
2705 
2706 	return error;
2707 }
2708 
/* vm.reclaim.wait_for_pid — write-only (CTLFLAG_WR), value never read back (CTLFLAG_MASKED). */
SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
    &sysctl_vm_reclaim_wait_for_pid, "I",
    "Block until the given pid has been drained by kernel GC");
2713 
2714 static int
2715 sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
2716 {
2717 	int error = EINVAL;
2718 	kern_return_t kr;
2719 	pid_t pid;
2720 	error = sysctl_handle_int(oidp, &pid, 0, req);
2721 	/* Only reclaim on write */
2722 	if (error || !req->newptr) {
2723 		return error;
2724 	}
2725 	if (pid <= 0) {
2726 		return EINVAL;
2727 	}
2728 	proc_t p = proc_find(pid);
2729 	if (p == PROC_NULL) {
2730 		return ESRCH;
2731 	}
2732 	task_t t = proc_task(p);
2733 	if (t == TASK_NULL) {
2734 		proc_rele(p);
2735 		return ESRCH;
2736 	}
2737 	task_reference(t);
2738 	proc_rele(p);
2739 	kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
2740 	task_deallocate(t);
2741 	return mach_to_bsd_errno(kr);
2742 }
2743 
/* vm.reclaim.drain_pid — write-only trigger to drain one process's reclaim buffer. */
SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
    &sysctl_vm_reclaim_drain_pid, "I",
    "Drain the deferred reclamation buffer for a pid");
2748 
2749 static int
proc_filter_reclaimable(proc_t p,__unused void * arg)2750 proc_filter_reclaimable(proc_t p, __unused void *arg)
2751 {
2752 	task_t task = proc_task(p);
2753 	return vm_deferred_reclamation_task_has_ring(task);
2754 }
2755 
2756 static int
proc_reclaim_drain(proc_t p,__unused void * arg)2757 proc_reclaim_drain(proc_t p, __unused void *arg)
2758 {
2759 	kern_return_t kr;
2760 	task_t task = proc_task(p);
2761 	kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
2762 	return mach_to_bsd_errno(kr);
2763 }
2764 
2765 static int
2766 sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
2767 {
2768 	int error;
2769 	int val;
2770 	if (!req->newptr) {
2771 		return EINVAL;
2772 	}
2773 	error = sysctl_handle_int(oidp, &val, 0, req);
2774 	if (error || val == FALSE) {
2775 		return error;
2776 	}
2777 	proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
2778 	    proc_filter_reclaimable, NULL);
2779 	return 0;
2780 }
2781 
/* vm.reclaim.drain_all — write-only trigger to drain every reclaim buffer system-wide. */
SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
    &sysctl_vm_reclaim_drain_all, "I",
    "Fully reclaim from every deferred reclamation buffer on the system");
2786 
/* Deferred-reclamation counters and tunables (defined in the VM reclaim code). */
extern uint32_t vm_reclaim_buffer_count;
extern uint64_t vm_reclaim_gc_epoch;
extern uint64_t vm_reclaim_gc_reclaim_count;
extern uint64_t vm_reclaim_sampling_period_abs;
extern uint64_t vm_reclaim_sampling_period_ns;
extern bool vm_reclaim_debug;
extern bool vm_reclaim_enabled;
extern uint32_t vm_reclaim_autotrim_pct_normal;
extern uint32_t vm_reclaim_autotrim_pct_pressure;
extern uint32_t vm_reclaim_autotrim_pct_critical;
extern uint32_t vm_reclaim_wma_weight_base;
extern uint32_t vm_reclaim_wma_weight_cur;
extern uint32_t vm_reclaim_wma_denom;
extern uint64_t vm_reclaim_abandonment_threshold;

SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
    "The number of deferred memory buffers currently alive");
/*
 * NOTE(review): the two GC counters below are exported CTLFLAG_RW although
 * they read as monotonic counters — confirm writes are intended (e.g. to
 * reset between test runs).
 */
SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
    "Number of times the global GC thread has run");
SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
    "Number of times the global GC thread has reclaimed from a buffer");
/*
 * NOTE(review): vm_reclaim_debug and vm_reclaim_enabled are declared "bool"
 * above but exposed through SYSCTL_COMPAT_UINT, which accesses sizeof(int)
 * bytes — confirm the underlying storage is int-sized or that this is the
 * established pattern for these OIDs.
 */
SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
    "Debug logs for vm.reclaim");
SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
    "Whether deferred memory reclamation is enabled on this system");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
    "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
    "to engage auto-trim when the system is operating normally");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
    "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
    "to engage auto-trim when the system is under memory pressure");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
    "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
    "to engage auto-trim when the system is under critical memory pressure");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
    "Weight applied to historical minimum buffer size samples");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
    "Weight applied to current sampled minimum buffer size");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
    "Denominator for weighted moving average calculation");
SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
    "The number of sampling periods between accounting updates that may elapse "
    "before the buffer is considered \"abandoned\"");
2842 
2843 static int
2844 sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
2845 {
2846 	uint64_t new_val_ns;
2847 	uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
2848 	int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
2849 	    sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
2850 	if (err || !req->newptr) {
2851 		return err;
2852 	}
2853 	if (new_val_ns != old_val_ns) {
2854 		vm_reclaim_sampling_period_ns = new_val_ns;
2855 		nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
2856 	}
2857 	return 0;
2858 }
2859 
/* vm.reclaim.sampling_period_ns — read/write, 64-bit ("QU"). */
SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
    CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "QU",
    "Interval (nanoseconds) at which to sample the minimum buffer size and "
    "consider trimming excess");
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_DEFERRED_RECLAIM */
2866 
2867 #include <kern/thread.h>
2868 #include <sys/user.h>
2869 
2870 void vm_pageout_io_throttle(void);
2871 
2872 void
vm_pageout_io_throttle(void)2873 vm_pageout_io_throttle(void)
2874 {
2875 	struct uthread *uthread = current_uthread();
2876 
2877 	/*
2878 	 * thread is marked as a low priority I/O type
2879 	 * and the I/O we issued while in this cleaning operation
2880 	 * collided with normal I/O operations... we'll
2881 	 * delay in order to mitigate the impact of this
2882 	 * task on the normal operation of the system
2883 	 */
2884 
2885 	if (uthread->uu_lowpri_window) {
2886 		throttle_lowpri_io(1);
2887 	}
2888 }
2889 
2890 int
vm_pressure_monitor(__unused struct proc * p,struct vm_pressure_monitor_args * uap,int * retval)2891 vm_pressure_monitor(
2892 	__unused struct proc *p,
2893 	struct vm_pressure_monitor_args *uap,
2894 	int *retval)
2895 {
2896 	kern_return_t   kr;
2897 	uint32_t        pages_reclaimed;
2898 	uint32_t        pages_wanted;
2899 
2900 	kr = mach_vm_pressure_monitor(
2901 		(boolean_t) uap->wait_for_pressure,
2902 		uap->nsecs_monitored,
2903 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2904 		&pages_wanted);
2905 
2906 	switch (kr) {
2907 	case KERN_SUCCESS:
2908 		break;
2909 	case KERN_ABORTED:
2910 		return EINTR;
2911 	default:
2912 		return EINVAL;
2913 	}
2914 
2915 	if (uap->pages_reclaimed) {
2916 		if (copyout((void *)&pages_reclaimed,
2917 		    uap->pages_reclaimed,
2918 		    sizeof(pages_reclaimed)) != 0) {
2919 			return EFAULT;
2920 		}
2921 	}
2922 
2923 	*retval = (int) pages_wanted;
2924 	return 0;
2925 }
2926 
2927 int
kas_info(struct proc * p,struct kas_info_args * uap,int * retval __unused)2928 kas_info(struct proc *p,
2929     struct kas_info_args *uap,
2930     int *retval __unused)
2931 {
2932 #ifndef CONFIG_KAS_INFO
2933 	(void)p;
2934 	(void)uap;
2935 	return ENOTSUP;
2936 #else /* CONFIG_KAS_INFO */
2937 	int                     selector = uap->selector;
2938 	user_addr_t     valuep = uap->value;
2939 	user_addr_t     sizep = uap->size;
2940 	user_size_t size, rsize;
2941 	int                     error;
2942 
2943 	if (!kauth_cred_issuser(kauth_cred_get())) {
2944 		return EPERM;
2945 	}
2946 
2947 #if CONFIG_MACF
2948 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
2949 	if (error) {
2950 		return error;
2951 	}
2952 #endif
2953 
2954 	if (IS_64BIT_PROCESS(p)) {
2955 		user64_size_t size64;
2956 		error = copyin(sizep, &size64, sizeof(size64));
2957 		size = (user_size_t)size64;
2958 	} else {
2959 		user32_size_t size32;
2960 		error = copyin(sizep, &size32, sizeof(size32));
2961 		size = (user_size_t)size32;
2962 	}
2963 	if (error) {
2964 		return error;
2965 	}
2966 
2967 	switch (selector) {
2968 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2969 	{
2970 		uint64_t slide = vm_kernel_slide;
2971 
2972 		if (sizeof(slide) != size) {
2973 			return EINVAL;
2974 		}
2975 
2976 		error = copyout(&slide, valuep, sizeof(slide));
2977 		if (error) {
2978 			return error;
2979 		}
2980 		rsize = size;
2981 	}
2982 	break;
2983 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2984 	{
2985 		uint32_t i;
2986 		kernel_mach_header_t *mh = &_mh_execute_header;
2987 		struct load_command *cmd;
2988 		cmd = (struct load_command*) &mh[1];
2989 		uint64_t *bases;
2990 		rsize = mh->ncmds * sizeof(uint64_t);
2991 
2992 		/*
2993 		 * Return the size if no data was passed
2994 		 */
2995 		if (valuep == 0) {
2996 			break;
2997 		}
2998 
2999 		if (rsize > size) {
3000 			return EINVAL;
3001 		}
3002 
3003 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
3004 
3005 		for (i = 0; i < mh->ncmds; i++) {
3006 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
3007 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
3008 				bases[i] = (uint64_t)sg->vmaddr;
3009 			}
3010 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
3011 		}
3012 
3013 		error = copyout(bases, valuep, rsize);
3014 
3015 		kfree_data(bases, rsize);
3016 
3017 		if (error) {
3018 			return error;
3019 		}
3020 	}
3021 	break;
3022 	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
3023 	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
3024 	{
3025 #if CONFIG_SPTM
3026 		const uint64_t slide =
3027 		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
3028 #else
3029 		const uint64_t slide = 0;
3030 #endif
3031 
3032 		if (sizeof(slide) != size) {
3033 			return EINVAL;
3034 		}
3035 
3036 		error = copyout(&slide, valuep, sizeof(slide));
3037 		if (error) {
3038 			return error;
3039 		}
3040 		rsize = size;
3041 	}
3042 	break;
3043 	default:
3044 		return EINVAL;
3045 	}
3046 
3047 	if (IS_64BIT_PROCESS(p)) {
3048 		user64_size_t size64 = (user64_size_t)rsize;
3049 		error = copyout(&size64, sizep, sizeof(size64));
3050 	} else {
3051 		user32_size_t size32 = (user32_size_t)rsize;
3052 		error = copyout(&size32, sizep, sizeof(size32));
3053 	}
3054 
3055 	return error;
3056 #endif /* CONFIG_KAS_INFO */
3057 }
3058 
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
#pragma clang diagnostic ignored "-Wunused-function"

/*
 * Compile-time-only helper: verifies that vm_min/max_kernel_address are
 * exactly unsigned-long sized so the (unsigned long *) casts in the two
 * SYSCTL_ULONG declarations below are safe.  Never called at runtime
 * (hence the -Wunused-function suppression above).
 */
static void
asserts()
{
	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
}

SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
#pragma clang diagnostic pop
3073 
/* Miscellaneous global page counters. */
extern uint32_t vm_page_pages;
SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");

extern uint32_t vm_page_busy_absent_skipped;
SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");

/* Pages flagged as tainted while wired through UPLs/IOPLs. */
extern uint32_t vm_page_upl_tainted;
SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");

extern uint32_t vm_page_iopl_tainted;
SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
3085 
#if __arm64__ && (DEVELOPMENT || DEBUG)
/* Gate for the footprint_suspend sysctl below: when 0, suspends are refused (resumes still allowed). */
extern int vm_footprint_suspend_allowed;
SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");

extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
3091 static int
3092 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
3093 {
3094 #pragma unused(oidp, arg1, arg2)
3095 	int error = 0;
3096 	int new_value;
3097 
3098 	if (req->newptr == USER_ADDR_NULL) {
3099 		return 0;
3100 	}
3101 	error = SYSCTL_IN(req, &new_value, sizeof(int));
3102 	if (error) {
3103 		return error;
3104 	}
3105 	if (!vm_footprint_suspend_allowed) {
3106 		if (new_value != 0) {
3107 			/* suspends are not allowed... */
3108 			return 0;
3109 		}
3110 		/* ... but let resumes proceed */
3111 	}
3112 	DTRACE_VM2(footprint_suspend,
3113 	    vm_map_t, current_map(),
3114 	    int, new_value);
3115 
3116 	pmap_footprint_suspend(current_map(), new_value);
3117 
3118 	return 0;
3119 }
/* vm.footprint_suspend — write-only, any user (CTLFLAG_ANYBODY), dev/debug arm64 only. */
SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_vm_footprint_suspend, "I", "");
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
3124 
/* Corpse footprint collection statistics (gathered when a task corpse is created). */
extern uint64_t vm_map_corpse_footprint_count;
extern uint64_t vm_map_corpse_footprint_size_avg;
extern uint64_t vm_map_corpse_footprint_size_max;
extern uint64_t vm_map_corpse_footprint_full;
extern uint64_t vm_map_corpse_footprint_no_buf;
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");

#if CODE_SIGNING_MONITOR
/* How often code-signing validation was (not) deferred to the code-signing monitor. */
extern uint64_t vm_cs_defer_to_csm;
extern uint64_t vm_cs_defer_to_csm_not;
SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
#endif /* CODE_SIGNING_MONITOR */

/* Shared-region pager activity: pages copied, slid, slide failures, pagers reclaimed. */
extern uint64_t shared_region_pager_copied;
extern uint64_t shared_region_pager_slid;
extern uint64_t shared_region_pager_slid_error;
extern uint64_t shared_region_pager_reclaimed;
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
extern int shared_region_destroy_delay;
SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");

#if MACH_ASSERT
/* Slack allowed before pmap ledger imbalances panic (MACH_ASSERT builds). */
extern int pmap_ledgers_panic_leeway;
SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
#endif /* MACH_ASSERT */
3170 
3171 
/*
 * vm_map_lookup_and_lock_object() copy-strategy statistics: counts, byte
 * totals, maxima, restarts and errors for the three copy paths
 * (copy_slowly, copy_strategically, shadow).
 */
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
3211 
/* Policy knob: protect privileged processes' mappings from untrusted sources. */
extern int vm_protect_privileged_from_untrusted;
SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
/* Copy-on-read counters, with per-map breakdowns for kernel/platform maps. */
extern uint64_t vm_copied_on_read;
extern uint64_t vm_copied_on_read_kernel_map;
extern uint64_t vm_copied_on_read_platform_map;
SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_kernel_map,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_kernel_map, "");
SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_platform_map,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_platform_map, "");

/* Shared-region object counts (current and lifetime peak). */
extern int vm_shared_region_count;
extern int vm_shared_region_peak;
SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
#if DEVELOPMENT || DEBUG
extern unsigned int shared_region_pagers_resident_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
extern unsigned int shared_region_pagers_resident_peak;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
extern int shared_region_pager_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
#if __has_feature(ptrauth_calls)
/* Pointer-authentication shared-region stats (keyed regions, re-slides). */
extern int shared_region_key_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
extern int vm_shared_region_reslide_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
#endif /* __has_feature(ptrauth_calls) */
#endif /* DEVELOPMENT || DEBUG */

#if MACH_ASSERT
/* debug4k: diagnostics for 4K-page-emulation maps (MACH_ASSERT builds only). */
extern int debug4k_filter;
SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
extern int debug4k_panic_on_terminate;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
extern int debug4k_panic_on_exception;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
extern int debug4k_panic_on_misaligned_sharing;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
#endif /* MACH_ASSERT */
3261 
/* Read-only counters exported as vm.map_set_*_limit_count / vm.map_enter_RLIMIT_*_count. */
extern uint64_t vm_map_set_size_limit_count;
extern uint64_t vm_map_set_data_limit_count;
extern uint64_t vm_map_enter_RLIMIT_AS_count;
extern uint64_t vm_map_enter_RLIMIT_DATA_count;
SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
3270 
/* Read-only counters exported as vm.fault_resilient_media_*. */
extern uint64_t vm_fault_resilient_media_initiate;
extern uint64_t vm_fault_resilient_media_retry;
extern uint64_t vm_fault_resilient_media_proceed;
extern uint64_t vm_fault_resilient_media_release;
extern uint64_t vm_fault_resilient_media_abort1;
extern uint64_t vm_fault_resilient_media_abort2;
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
#if MACH_ASSERT
/* Error-injection controls: *_rate knobs are writable, hit counts are read-only. */
extern int vm_fault_resilient_media_inject_error1_rate;
extern int vm_fault_resilient_media_inject_error1;
extern int vm_fault_resilient_media_inject_error2_rate;
extern int vm_fault_resilient_media_inject_error2;
extern int vm_fault_resilient_media_inject_error3_rate;
extern int vm_fault_resilient_media_inject_error3;
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
#endif /* MACH_ASSERT */
3297 
/* Read-only counter exported as vm.pmap_query_page_info_retries. */
extern uint64_t pmap_query_page_info_retries;
SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
3300 
3301 /*
3302  * A sysctl which causes all existing shared regions to become stale. They
3303  * will no longer be used by anything new and will be torn down as soon as
3304  * the last existing user exits. A write of non-zero value causes that to happen.
3305  * This should only be used by launchd, so we check that this is initproc.
3306  */
3307 static int
shared_region_pivot(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3308 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3309 {
3310 	unsigned int value = 0;
3311 	int changed = 0;
3312 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3313 	if (error || !changed) {
3314 		return error;
3315 	}
3316 	if (current_proc() != initproc) {
3317 		return EPERM;
3318 	}
3319 
3320 	vm_shared_region_pivot();
3321 
3322 	return 0;
3323 }
3324 
3325 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
3326     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
3327     0, 0, shared_region_pivot, "I", "");
3328 
/* Read-only counters exported as vm.object_shadow_forced / vm.object_shadow_skipped. */
extern uint64_t vm_object_shadow_forced;
extern uint64_t vm_object_shadow_skipped;
SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_object_shadow_forced, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_object_shadow_skipped, "");

/* UPL write throttling counter; see description string below. */
extern uint64_t vm_object_upl_throttle_cnt;
SYSCTL_QUAD(_vm, OID_AUTO, object_upl_throttle_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_object_upl_throttle_cnt,
    "The number of times in which a UPL write was throttled due to pageout starvation");
3340 
3341 #if HAS_MTE
3342 #pragma mark MTE
3343 
3344 SYSCTL_NODE(_vm, OID_AUTO, mte, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "mte");
3345 
3346 /* sysctls for vm.mte.* counters. */
3347 
3348 SYSCTL_UINT(_vm_mte, OID_AUTO, tagged, CTLFLAG_RD,
3349     &vm_page_tagged_count, 0, "tagged pages in use");
3350 
3351 SYSCTL_QUAD(_vm_mte, OID_AUTO, refill_thread_wakeups, CTLFLAG_RD,
3352     &vm_mte_refill_thread_wakeups,
3353     "the number of times the refill thread was woken up");
3354 
3355 /* sysctls for vm.mte.free.* counters. */
3356 
3357 SYSCTL_NODE(_vm_mte, OID_AUTO, free, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "free counts");
3358 
3359 SYSCTL_UINT(_vm_mte_free, OID_AUTO, total, CTLFLAG_RD,
3360     &vm_page_free_count, 0,
3361     "total free pages (same as vm.page_free_count)");
3362 SYSCTL_UINT(_vm_mte_free, OID_AUTO, taggable, CTLFLAG_RD,
3363     &vm_page_free_taggable_count, 0,
3364     "free taggable pages in the MTE free queue");
3365 SYSCTL_UINT(_vm_mte_free, OID_AUTO, claimable, CTLFLAG_RD,
3366     &mte_claimable_queue.vmpfq_count, 0,
3367     "free tag storage pages on the MTE claimable queue");
3368 
3369 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_untagged, vm_cpu_free_count,
3370     "free untagged pages in CPU lists");
3371 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_claimed, vm_cpu_free_claimed_count,
3372     "free claimed pages in CPU lists");
3373 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_tagged, vm_cpu_free_tagged_count,
3374     "free tagged pages in CPU lists");
3375 
3376 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_0, CTLFLAG_RD,
3377     &mte_free_queues[MTE_FREE_UNTAGGABLE_0].vmpfq_count, 0,
3378     "disabled/pinned/deactivating/claimed (with 16 free pages or less) tag storage pages")
3379 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_1, CTLFLAG_RD,
3380     &mte_free_queues[MTE_FREE_UNTAGGABLE_1].vmpfq_count, 0,
3381     "claimed (with 17 free pages or more) or disabled (with 16 pages or less) tag storage pages")
3382 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_2, CTLFLAG_RD,
3383     &mte_free_queues[MTE_FREE_UNTAGGABLE_2].vmpfq_count, 0,
3384     "disabled (with 17 pages or more) tag storage pages")
3385 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_0, CTLFLAG_RD,
3386     &mte_free_queues[MTE_FREE_ACTIVE_0].vmpfq_count, 0,
3387     "active tag storages with free covered pages (bucket 0)");
3388 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_1, CTLFLAG_RD,
3389     &mte_free_queues[MTE_FREE_ACTIVE_1].vmpfq_count, 0,
3390     "active tag storages with free covered pages (bucket 1)");
3391 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_2, CTLFLAG_RD,
3392     &mte_free_queues[MTE_FREE_ACTIVE_2].vmpfq_count, 0,
3393     "active tag storages with free covered pages (bucket 2)");
3394 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_3, CTLFLAG_RD,
3395     &mte_free_queues[MTE_FREE_ACTIVE_3].vmpfq_count, 0,
3396     "active tag storages with free covered pages (bucket 3)");
3397 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_activating, CTLFLAG_RD,
3398     &mte_free_queues[MTE_FREE_UNTAGGABLE_ACTIVATING].vmpfq_count, 0,
3399     "activating/reclaiming tag storages with free covered pages");
3400 
/* sysctls for vm.mte.cell.* counters (per tag-storage-state list counts). */

SYSCTL_NODE(_vm_mte, OID_AUTO, cell, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "mte cell");

/* NOTE(review): "disabled" and "inactive" below share the same description
 * string ("free inactive tag storage pages") — possibly a copy/paste slip;
 * confirm against mte_info_lists semantics before changing either. */
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled, CTLFLAG_RD,
    &mte_info_lists[MTE_LIST_DISABLED_IDX].count, 0,
    "free inactive tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled_recursive, CTLFLAG_RD,
    &vm_page_recursive_tag_storage_count, 0,
    "recursive tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled_unmanaged, CTLFLAG_RD,
    &vm_page_unmanaged_tag_storage_count, 0,
    "unmanaged tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, retired, CTLFLAG_RD,
    &vm_page_retired_tag_storage_count, 0,
    "retired tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, pinned, CTLFLAG_RD,
    &mte_info_lists[MTE_LIST_PINNED_IDX].count, 0,
    "unreclaimable tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, deactivating, CTLFLAG_RD,
    &mte_info_lists[MTE_LIST_DEACTIVATING_IDX].count, 0,
    "deactivating tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, claimed, CTLFLAG_RD,
    &mte_info_lists[MTE_LIST_CLAIMED_IDX].count, 0,
    "claimed tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, inactive, CTLFLAG_RD,
    &mte_info_lists[MTE_LIST_INACTIVE_IDX].count, 0,
    "free inactive tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, reclaiming, CTLFLAG_RD,
    &mte_info_lists[MTE_LIST_RECLAIMING_IDX].count, 0,
    "reclaiming tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, activating, CTLFLAG_RD,
    &mte_info_lists[MTE_LIST_ACTIVATING_IDX].count, 0,
    "activating tag storage pages");
SYSCTL_UINT(_vm_mte_cell, OID_AUTO, active_0, CTLFLAG_RD,
    &mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count, 0,
    "active tag storage pages with no used page tagged");
3437     "active tag storage pages with no used page tagged");
3438 static int
3439 tag_storage_active SYSCTL_HANDLER_ARGS
3440 {
3441 #pragma unused(arg1, arg2, oidp)
3442 	uint32_t value = mteinfo_tag_storage_active(false);
3443 
3444 	return SYSCTL_OUT(req, &value, sizeof(value));
3445 }
3446 SYSCTL_PROC(_vm_mte_cell, OID_AUTO, active,
3447     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3448     0, 0, &tag_storage_active, "I",
3449     "active tag storage pages");
3450 
/* sysctls for vm.mte.tag_storage.* counters. */

SYSCTL_NODE(_vm_mte, OID_AUTO, tag_storage, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "mte tag storage");

SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, reserved, CTLFLAG_RD,
    &vm_page_tag_storage_reserved, 0,
    "free tag storage pages reserve");
SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, wired, CTLFLAG_RD,
    &vm_page_wired_tag_storage_count, 0,
    "wired tag storage pages");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, activations, CTLFLAG_RD,
    &vm_page_tag_storage_activation_count,
    "tag storage activations (inactive/claimed -> active)");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, deactivations, CTLFLAG_RD,
    &vm_page_tag_storage_deactivation_count,
    "tag storage deactivations (active -> inactive)");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaims, CTLFLAG_RD,
    &vm_page_tag_storage_reclaim_success_count,
    "successful tag storage reclamations");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaims_from_cpu, CTLFLAG_RD,
    &vm_page_tag_storage_reclaim_from_cpu_count,
    "successful tag storage reclamations from the cpu free lists");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_failures, CTLFLAG_RD,
    &vm_page_tag_storage_reclaim_failure_count,
    "failed tag storage reclamations");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_wired_failures, CTLFLAG_RD,
    &vm_page_tag_storage_reclaim_wired_failure_count,
    "failed tag storage reclamations due to tag storage being wired");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, wire_relocations, CTLFLAG_RD,
    &vm_page_tag_storage_wire_relocation_count,
    "tag storage relocations due to wiring");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_compressor_failures, CTLFLAG_RD,
    &vm_page_tag_storage_reclaim_compressor_failure_count,
    "failed tag storage reclamations due to tag storage used in compressor pool");
SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, compressor_relocations, CTLFLAG_RD,
    &vm_page_tag_storage_compressor_relocation_count,
    "tag storage relocations due to compressor pool");
SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, free_unmanaged, CTLFLAG_RD,
    &vm_page_free_unmanaged_tag_storage_count, 0,
    "number of free unmanaged tag storage pages");

SYSCTL_SCALABLE_COUNTER(_vm_mte_tag_storage, cpu_allocated_claimed,
    vm_cpu_claimed_count, "claimed tag storage pages allocated");
3494 
3495 static int
3496 tag_storage_fragmentation SYSCTL_HANDLER_ARGS
3497 {
3498 #pragma unused(arg1, arg2, oidp)
3499 	uint32_t value = mteinfo_tag_storage_fragmentation(false);
3500 
3501 	return SYSCTL_OUT(req, &value, sizeof(value));
3502 }
3503 SYSCTL_PROC(_vm_mte_tag_storage, OID_AUTO, fragmentation,
3504     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3505     0, 0, &tag_storage_fragmentation, "I",
3506     "the achievable the fragmentation of the tag storage space (in parts per thousand)");
3507 
3508 static int
3509 tag_storage_fragmentation_actual SYSCTL_HANDLER_ARGS
3510 {
3511 #pragma unused(arg1, arg2, oidp)
3512 	uint32_t value = mteinfo_tag_storage_fragmentation(true);
3513 
3514 	return SYSCTL_OUT(req, &value, sizeof(value));
3515 }
3516 SYSCTL_PROC(_vm_mte_tag_storage, OID_AUTO, fragmentation_actual,
3517     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3518     0, 0, &tag_storage_fragmentation_actual, "I",
3519     "the actual the fragmentation of the tag storage space (in parts per thousand)");
3520 
/* sysctls for vm.mte.compress_* (MTE + compressor interaction). */

extern unsigned int vm_object_no_compressor_pager_for_mte_count;
SYSCTL_INT(_vm_mte, OID_AUTO, no_compressor_pager_for_mte, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_no_compressor_pager_for_mte_count, 0, "");

/* sysctls for MTE compression stats */

SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_compressed, compressor_tagged_pages_compressed, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_decompressed, compressor_tagged_pages_decompressed, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_freed, compressor_tagged_pages_freed, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_corrupted, compressor_tagged_pages_corrupted, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_overhead_bytes, compressor_tags_overhead_bytes, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages, compressor_tagged_pages, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_ts_pages_used, compressor_tag_storage_pages_in_pool,
    "the number of tag storage pages used in the compressor");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_non_ts_pages_used, compressor_non_tag_storage_pages_in_pool,
    "the number of non-tag storage pages used in the compressor");
#if DEVELOPMENT || DEBUG
/* Per-outcome breakdown of tag compression attempts (DEVELOPMENT/DEBUG only). */
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_all_zero, compressor_tags_all_zero, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_same_value, compressor_tags_same_value, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_below_align, compressor_tags_below_align, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_above_align, compressor_tags_above_align, "");
SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_incompressible, compressor_tags_incompressible, "");
#endif /* DEVELOPMENT || DEBUG */

#endif /* HAS_MTE */
3547 
/* vm.vmtc_total: see description string; incremented by the text-corruption handler. */
SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vmtc_total, 0, "total text page corruptions detected");
3551 
#if DEBUG || DEVELOPMENT
/*
 * A sysctl that can be used to corrupt a text page with an illegal instruction.
 * Used for testing text page self healing.
 */
extern kern_return_t vm_corrupt_text_addr(uintptr_t);
static int
corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	uint64_t addr = 0;
	int err;

	err = sysctl_handle_quad(oidp, &addr, 0, req);
	if (err != 0 || req->newptr == 0) {
		/* plain read, or the write failed: nothing to corrupt */
		return err;
	}

	return (vm_corrupt_text_addr((uintptr_t)addr) == KERN_SUCCESS) ? 0 : EINVAL;
}

SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
    CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, corrupt_text_addr, "-", "");
#endif /* DEBUG || DEVELOPMENT */
3578 
3579 #if CONFIG_MAP_RANGES
3580 /*
3581  * vm.malloc_ranges
3582  *
3583  * space-separated list of <left:right> hexadecimal addresses.
3584  */
3585 static int
3586 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
3587 {
3588 	vm_map_t map = current_map();
3589 	struct mach_vm_range r1, r2;
3590 	char str[20 * 4];
3591 	int len;
3592 	mach_vm_offset_t right_hole_max;
3593 
3594 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
3595 		return ENOENT;
3596 	}
3597 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
3598 		return ENOENT;
3599 	}
3600 
3601 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
3602 	right_hole_max = MACH_VM_JUMBO_ADDRESS;
3603 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
3604 	right_hole_max = get_map_max(map);
3605 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
3606 
3607 	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
3608 	    r1.max_address, r2.min_address,
3609 	    r2.max_address, right_hole_max);
3610 
3611 	return SYSCTL_OUT(req, str, len);
3612 }
3613 
3614 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
3615     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3616     0, 0, &vm_map_malloc_ranges, "A", "");
3617 
3618 #if DEBUG || DEVELOPMENT
3619 static int
3620 vm_map_user_range_default SYSCTL_HANDLER_ARGS
3621 {
3622 #pragma unused(arg1, arg2, oidp)
3623 	struct mach_vm_range range;
3624 
3625 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
3626 	    != KERN_SUCCESS) {
3627 		return EINVAL;
3628 	}
3629 
3630 	return SYSCTL_OUT(req, &range, sizeof(range));
3631 }
3632 
3633 static int
3634 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3635 {
3636 #pragma unused(arg1, arg2, oidp)
3637 	struct mach_vm_range range;
3638 
3639 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3640 	    != KERN_SUCCESS) {
3641 		return EINVAL;
3642 	}
3643 
3644 	return SYSCTL_OUT(req, &range, sizeof(range));
3645 }
3646 
3647 static int
3648 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3649 {
3650 #pragma unused(arg1, arg2, oidp)
3651 	struct mach_vm_range range;
3652 
3653 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3654 	    != KERN_SUCCESS) {
3655 		return EINVAL;
3656 	}
3657 
3658 	return SYSCTL_OUT(req, &range, sizeof(range));
3659 }
3660 
/*
 * A sysctl that can be used to return ranges for the current VM map.
 * Used for testing VM ranges.
 */
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");

#endif /* DEBUG || DEVELOPMENT */
#endif /* CONFIG_MAP_RANGES */
3674 
#if DEBUG || DEVELOPMENT
#endif /* DEBUG || DEVELOPMENT */

/* Counters/knobs for vm_map range overflow detection. */
extern uint64_t vm_map_range_overflows_count;
SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
/* NOTE(review): the sysctl name "map_range_oveflows_log" is misspelled
 * (missing 'r'), but it is the published name; renaming it would break any
 * existing consumers, so it is intentionally left as-is. */
extern boolean_t vm_map_range_overflows_log;
SYSCTL_INT(_vm, OID_AUTO, map_range_oveflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3682 
/* Compressor segment-fill contention statistics (vm.c_seg_filled_*). */
extern uint64_t c_seg_filled_no_contention;
extern uint64_t c_seg_filled_contention;
extern clock_sec_t c_seg_filled_contention_sec_max;
extern clock_nsec_t c_seg_filled_contention_nsec_max;
SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
#if (XNU_TARGET_OS_OSX && __arm64__)
/* Major-compaction reporting knobs and high-water marks (macOS arm64 only). */
extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
extern int c_process_major_yield_after; /* yield after moving ? segments */
extern uint64_t c_process_major_reports;
extern clock_sec_t c_process_major_max_sec;
extern clock_nsec_t c_process_major_max_nsec;
extern uint32_t c_process_major_peak_segcount;
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3705 
#if DEVELOPMENT || DEBUG
/* Writable by anybody: debugging knob for VM object liveness checks. */
extern int panic_object_not_alive;
SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
#endif /* DEVELOPMENT || DEBUG */

#if FBDP_DEBUG_OBJECT_NO_PAGER
extern int fbdp_no_panic;
SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
#endif /* FBDP_DEBUG_OBJECT_NO_PAGER */   /* was mislabeled "MACH_ASSERT" */
3715 
/* Read-only counter exported as vm.cluster_direct_write_wired. */
extern uint64_t cluster_direct_write_wired;
SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");

/* Read-only counters exported as vm.object_pageout_*. */
extern uint64_t vm_object_pageout_not_on_queue;
extern uint64_t vm_object_pageout_not_pageable;
extern uint64_t vm_object_pageout_pageable;
extern uint64_t vm_object_pageout_active_local;
SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_on_queue, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_on_queue, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_pageable, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_pageable, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_active_local, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_active_local, "");
3727 
3728 
#if DEVELOPMENT || DEBUG

/*
 * Returns the stream-format magic word for serialized compressor-segment
 * info.  On HAS_MTE kernels a distinct magic is returned when tag data is
 * interleaved, so readers can tell the two formats apart.
 */
static uint32_t
sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
{
#if HAS_MTE
	if (with_data == VM_C_SERIALIZE_DATA_TAGS) {
		return VM_C_SEGMENT_INFO_MAGIC_WITH_TAGS;
	}
#else
#pragma unused(with_data)
#endif /* HAS_MTE */
	return VM_C_SEGMENT_INFO_MAGIC;
}
3743 
/* The largest possible single segment + its slots is
 * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
#define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)

extern uint32_t c_segments_available;

/* 4-byte header prefixed to the serialized stream; identifies the format version. */
struct sysctl_buf_header {
	uint32_t magic;
} __attribute__((packed));
3753 
3754 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3755  * instead of doing everything here, the function calls a function vm_compressor.c. */
3756 static int
sysctl_compressor_segments_stream(struct sysctl_req * req,vm_c_serialize_add_data_t with_data)3757 sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
3758 {
3759 	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3760 	if (!buf) {
3761 		return ENOMEM;
3762 	}
3763 	size_t offset = 0;
3764 	int error = 0;
3765 	int segno = 0;
3766 	/* 4 byte header to identify the version of the formatting of the data.
3767 	 * This should be incremented if c_segment_info or c_slot_info are changed */
3768 	((struct sysctl_buf_header*)buf)->magic = sysctl_compressor_seg_magic(with_data);
3769 	offset += sizeof(uint32_t);
3770 
3771 	while (segno < c_segments_available) {
3772 		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3773 		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
3774 		if (kr == KERN_NO_SPACE) {
3775 			/* failed to add another segment, push the current buffer out and try again */
3776 			if (offset == 0) {
3777 				error = EINVAL; /* no space to write but I didn't write anything, shouldn't really happen */
3778 				goto out;
3779 			}
3780 			/* write out chunk */
3781 			error = SYSCTL_OUT(req, buf, offset);
3782 			if (error) {
3783 				goto out;
3784 			}
3785 			offset = 0;
3786 			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3787 			/* don't increment segno, need to try again saving the current one */
3788 		} else if (kr != KERN_SUCCESS) {
3789 			error = EINVAL;
3790 			goto out;
3791 		} else {
3792 			offset += left_sz;
3793 			++segno;
3794 			assert(offset <= SYSCTL_SEG_BUF_SIZE);
3795 		}
3796 	}
3797 
3798 	if (offset > 0) { /* write last chunk */
3799 		error = SYSCTL_OUT(req, buf, offset);
3800 	}
3801 
3802 out:
3803 	kfree_data(buf, SYSCTL_SEG_BUF_SIZE)
3804 	return error;
3805 }
3806 
/* vm.compressor_segments: stream segment/slot metadata without page data.
 * Single-statement wrapper around sysctl_compressor_segments_stream(). */
static int
sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
}
SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
3813 
#if HAS_MTE
/* vm.compressor_segments_data: same stream as above, with MTE tag data included. */
static int
sysctl_compressor_segments_data(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_TAGS);
}
SYSCTL_PROC(_vm, OID_AUTO, compressor_segments_data, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments_data, "S", "");
#endif /* HAS_MTE */
3822 
3823 extern uint32_t vm_compressor_fragmentation_level(void);
3824 
3825 static int
sysctl_compressor_fragmentation_level(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3826 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3827 {
3828 	uint32_t value = vm_compressor_fragmentation_level();
3829 	return SYSCTL_OUT(req, &value, sizeof(value));
3830 }
3831 
3832 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3833 
3834 extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);
3835 
3836 static int
sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3837 sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3838 {
3839 	uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
3840 	return SYSCTL_OUT(req, &value, sizeof(value));
3841 }
3842 
3843 SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");
3844 
3845 
3846 
3847 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3848 
3849 
3850 /* This sysctl iterates over all the entries of the vm_map of the a given process and write some info about the vm_object pointed by the entries.
3851  * This can be used for mapping where are all the pages of a process located in the compressor.
3852  */
3853 static int
sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
{
	/*
	 * Dump the vm_map entries (and their compressor pager slot maps) of the
	 * task identified by the pid in name[0].  Output is streamed to the
	 * caller through SYSCTL_OUT in SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE
	 * chunks: a 4-byte format-version magic, a vm_map_info_hdr with the
	 * entry count, then one record per map entry.
	 * Returns 0 on success or a BSD errno.
	 */
	int error = 0;
	char *buf = NULL;
	proc_t p = PROC_NULL;
	task_t task = TASK_NULL;
	vm_map_t map = VM_MAP_NULL;
	__block size_t offset = 0;  /* current fill level of buf; shared with the blocks below */

	/* go from pid to proc to task to vm_map. see sysctl_procargsx() for another example of this procession */
	int *name = arg1;
	int namelen = arg2;
	if (namelen < 1) {
		return EINVAL;
	}
	int pid = name[0];
	p = proc_find(pid);  /* this increments a reference to the proc */
	if (p == PROC_NULL) {
		return EINVAL;
	}
	task = proc_task(p);
	if (task == TASK_NULL) {
		proc_rele(p);
		return EINVAL;
	}
	/*
	 * Convert the proc reference to a task reference.  The task reference
	 * must be taken while the proc reference is still held: the proc
	 * reference is what pins the task, so referencing the task only after
	 * proc_rele() would race with task teardown.
	 */
	task_reference(task);
	proc_rele(p);  /* decrement ref of proc */
	p = PROC_NULL;
	/* task reference to map reference */
	map = get_task_map_reference(task);
	task_deallocate(task);

	if (map == VM_MAP_NULL) {
		return EINVAL;  /* nothing allocated yet */
	}

	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
	if (!buf) {
		error = ENOMEM;
		goto out;
	}

	/* 4 byte header to identify the version of the formatting of the data.
	 * This should be incremented if c_segment_info or c_slot_info are changed */
	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
	offset += sizeof(uint32_t);

	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
		/* write the header, happens only once at the beginning so we should have enough space */
		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
		out_hdr->vmi_nentries = nentries;
		offset += sizeof(struct vm_map_info_hdr);
		return KERN_SUCCESS;
	};

	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
		while (true) { /* try up to 2 times, first try writing to the current buffer, otherwise to a new buffer */
			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
			if (kr == KERN_NO_SPACE) {
				/* failed to write anything, flush the current buffer and try again */
				if (offset == 0) {
					return KERN_FAILURE; /* no space to write but I didn't write anything yet, shouldn't really happen */
				}
				/* write out chunk */
				int out_error = SYSCTL_OUT(req, buf, offset);
				if (out_error) {
					return KERN_FAILURE;
				}
				offset = 0;
				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
				continue; /* need to retry the entry dump again with the cleaned buffer */
			} else if (kr != KERN_SUCCESS) {
				return kr;
			}
			/* on success, left_sz holds the number of bytes consumed */
			offset += left_sz;
			break;
		}
		return KERN_SUCCESS;
	};

	/* this foreach first calls to the first callback with the number of entries, then calls the second for every entry
	 * when the buffer is exhausted, it is flushed to the sysctl and restarted */
	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);

	if (kr != KERN_SUCCESS) {
		/* report the failure instead of silently returning a truncated dump */
		error = mach_to_bsd_errno(kr);
		goto out;
	}

	if (offset > 0) { /* last chunk */
		error = SYSCTL_OUT(req, buf, offset);
	}

out:
	if (buf != NULL) {
		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
	}
	if (map != VM_MAP_NULL) {
		vm_map_deallocate(map);
	}
	return error;
}
3957 
3958 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
3959 static int
3960 systctl_vm_reset_tag SYSCTL_HANDLER_ARGS
3961 {
3962 #pragma unused(oidp, arg1, arg2)
3963 	int error;
3964 	int tag;
3965 	kern_return_t kr;
3966 
3967 	/* Need to be root */
3968 	if (!kauth_cred_issuser(kauth_cred_get())) {
3969 		return EPERM;
3970 	}
3971 
3972 	error = SYSCTL_IN(req, &tag, sizeof(tag));
3973 	if (error) {
3974 		return error;
3975 	}
3976 
3977 	if (tag > VM_MAX_TAG_VALUE) {
3978 		return EINVAL;
3979 	}
3980 
3981 	kr = vm_tag_reset_peak((vm_tag_t)tag);
3982 
3983 	return mach_to_bsd_errno(kr);
3984 }
3985 
/* vm.reset_tag: write a tag number to reset that tag's peak-usage counter (root only). */
SYSCTL_PROC(_vm, OID_AUTO, reset_tag,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
    0, 0, &systctl_vm_reset_tag, "I", "");
3989 
3990 static int
3991 systctl_vm_reset_all_tags SYSCTL_HANDLER_ARGS
3992 {
3993 #pragma unused(oidp, arg1, arg2)
3994 	/* Only reset the values if the sysctl is a write */
3995 	if (!req->newptr) {
3996 		return EINVAL;
3997 	}
3998 
3999 	/* Need to be root */
4000 	if (!kauth_cred_issuser(kauth_cred_get())) {
4001 		return EPERM;
4002 	}
4003 
4004 	vm_tag_reset_all_peaks();
4005 
4006 	return 0;
4007 }
4008 
/* vm.reset_all_tags: any write resets the peak-usage counters of all tags (root only). */
SYSCTL_PROC(_vm, OID_AUTO, reset_all_tags,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
    0, 0, &systctl_vm_reset_all_tags, "I", "");
4012 
4013 #endif /* DEVELOPMENT || DEBUG */
4014 
/* vm.compressor.*: read-only visibility into VM compressor configuration and page counts. */
SYSCTL_NODE(_vm, OID_AUTO, compressor, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor");

SYSCTL_INT(_vm_compressor, OID_AUTO, mode, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_mode, 0, "");
SYSCTL_INT(_vm_compressor, OID_AUTO, is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_is_active, 0, "");
SYSCTL_INT(_vm_compressor, OID_AUTO, is_available, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_available, 0, "");
SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &c_segment_pages_compressed, 0, "The amount of uncompressed data stored in the compressor (in pages)");
#if CONFIG_FREEZE
/* In-core counters only exist when the freezer is configured. */
SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_incore, CTLFLAG_RD | CTLFLAG_LOCKED,
    &c_segment_pages_compressed_incore, 0, "The amount of uncompressed data stored in the in-core compressor (in pages)");
SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_incore_late_swapout, CTLFLAG_RD | CTLFLAG_LOCKED,
    &c_segment_pages_compressed_incore_late_swapout, 0, "The amount of uncompressed data stored in the in-core compressor and queued for swapout (in pages)");
#endif
SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_limit, CTLFLAG_RD | CTLFLAG_LOCKED,
    &c_segment_pages_compressed_limit, 0, "The limit on the amount of uncompressed data the compressor will store (in pages)");
4030 
/* vm.compressor.segment.*: per-state compressor segment counts (one counter per segment queue/state). */
SYSCTL_NODE(_vm_compressor, OID_AUTO, segment, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Segment Counts");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, total, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_count, 0, "Number of allocated segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, aging, CTLFLAG_RD | CTLFLAG_LOCKED, &c_age_count, 0, "Number of aging segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_early, CTLFLAG_RD | CTLFLAG_LOCKED, &c_early_swappedin_count, 0, "Number of (early) swapped-in segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_regular, CTLFLAG_RD | CTLFLAG_LOCKED, &c_regular_swappedin_count, 0, "Number of (regular) swapped-in segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_late, CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swappedin_count, 0, "Number of (late) swapped-in segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_early, CTLFLAG_RD | CTLFLAG_LOCKED, &c_early_swapout_count, 0, "Number of (early) ready-to-swap segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_regular, CTLFLAG_RD | CTLFLAG_LOCKED, &c_regular_swapout_count, 0, "Number of (regular) ready-to-swap segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_late, CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swapout_count, 0, "Number of (late) ready-to-swap segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapio, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swapio_count, 0, "Number of swapping-out segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedout, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swappedout_count, 0, "Number of (non-sparse) swapped-out segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedout_sparse, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swappedout_sparse_count, 0, "Number of (sparse) swapped-out segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, majorcompact, CTLFLAG_RD | CTLFLAG_LOCKED, &c_major_count, 0, "Number of recently-compacted segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, minorcompact, CTLFLAG_RD | CTLFLAG_LOCKED, &c_minor_count, 0, "Number of segments queued for deferred minor compaction");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, filling, CTLFLAG_RD | CTLFLAG_LOCKED, &c_filling_count, 0, "Number of filling segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, empty, CTLFLAG_RD | CTLFLAG_LOCKED, &c_empty_count, 0, "Number of empty segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, bad, CTLFLAG_RD | CTLFLAG_LOCKED, &c_bad_count, 0, "Number of bad segments");
SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, limit, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segments_limit, 0, "Limit on the number of allocated segments");
4049 
/* vm.compressor.svp.*: single-value-page (svp) compression statistics. */
SYSCTL_NODE(_vm_compressor, OID_AUTO, svp, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Single-Value");
SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, in_hash, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_in_hash, 0, "");
SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, hash_succeeded, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_hash_succeeded, 0, "");
SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, hash_failed, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_hash_failed, 0, "");
SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, zval_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_zero_compressions, 0, "");
SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, zval_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_zero_decompressions, 0, "");
SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, nzval_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_nonzero_compressions, 0, "");
SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, nzval_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_nonzero_decompressions, 0, "");
4058 
/* vm.compressor.compactor.*: major-compaction statistics sourced from vm_pageout_vminfo. */
SYSCTL_NODE(_vm_compressor, OID_AUTO, compactor, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Compactor");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_completed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_major_compactions_completed, "Major compactions completed");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_considered, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_major_compactions_considered, "Major compactions considered");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_bailed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_major_compactions_bailed, "Major compactions bailed (due to contention)");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_bytes_moved, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_major_compaction_bytes_moved, "Bytes moved between segments during major compactions");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_slots_moved, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_major_compaction_slots_moved, "Slots moved between segments during major compactions");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_bytes_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_major_compaction_bytes_freed, "Bytes freed as a result of major compaction");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_segments_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_major_compaction_segments_freed, "Segments freed as a result of major compaction");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, swapouts_queued, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_swapouts_queued, "The number of segments queued for swapout after a major compaction");
SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, swapout_bytes_wasted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_compactor_swapout_bytes_wasted, "The number of unused bytes in segments queued for swapout");
4078