xref: /xnu-12377.81.4/bsd/vm/vm_unix.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Mach Operating System
30  * Copyright (c) 1987 Carnegie-Mellon University
31  * All rights reserved.  The CMU software License Agreement specifies
32  * the terms and conditions for use and redistribution.
33  */
34 /*
35  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
36  * support for mandatory and extensible security protections.  This notice
37  * is included in support of clause 2.2 (b) of the Apple Public License,
38  * Version 2.0.
39  */
40 #include <vm/vm_options.h>
41 
42 #include <kern/ecc.h>
43 #include <kern/task.h>
44 #include <kern/thread.h>
45 #include <kern/debug.h>
46 #include <kern/extmod_statistics.h>
47 #include <mach/mach_traps.h>
48 #include <mach/port.h>
49 #include <mach/sdt.h>
50 #include <mach/task.h>
51 #include <mach/task_access.h>
52 #include <mach/task_special_ports.h>
53 #include <mach/time_value.h>
54 #include <mach/vm_map.h>
55 #include <mach/vm_param.h>
56 #include <mach/vm_prot.h>
57 #include <machine/machine_routines.h>
58 
59 #include <sys/file_internal.h>
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/dir.h>
63 #include <sys/namei.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/vm.h>
67 #include <sys/file.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/mount.h>
70 #include <sys/xattr.h>
71 #include <sys/trace.h>
72 #include <sys/kernel.h>
73 #include <sys/ubc_internal.h>
74 #include <sys/user.h>
75 #include <sys/syslog.h>
76 #include <sys/stat.h>
77 #include <sys/sysproto.h>
78 #include <sys/mman.h>
79 #include <sys/sysctl.h>
80 #include <sys/cprotect.h>
81 #include <sys/kpi_socket.h>
82 #include <sys/kas_info.h>
83 #include <sys/socket.h>
84 #include <sys/socketvar.h>
85 #include <sys/random.h>
86 #include <sys/code_signing.h>
87 #if NECP
88 #include <net/necp.h>
89 #endif /* NECP */
90 #if SKYWALK
91 #include <skywalk/os_channel.h>
92 #endif /* SKYWALK */
93 
94 #include <security/audit/audit.h>
95 #include <security/mac.h>
96 #include <bsm/audit_kevents.h>
97 
98 #include <kern/kalloc.h>
99 #include <kern/host_statistics.h>
100 
101 #include <vm/vm_map_internal.h>
102 #include <vm/vm_kern_xnu.h>
103 #include <vm/vm_pageout_xnu.h>
104 
105 #include <mach/shared_region.h>
106 #include <vm/vm_shared_region_internal.h>
107 
108 #include <vm/vm_dyld_pager_internal.h>
109 #include <vm/vm_protos_internal.h>
110 #include <vm/vm_compressor_info.h>         /* for c_segment_info */
111 #include <vm/vm_compressor_internal.h>
112 #include <vm/vm_compressor_xnu.h>          /* for vm_compressor_serialize_segment_debug_info() */
113 #include <vm/vm_object_xnu.h>              /* for vm_chead_select_t */
114 #include <vm/vm_memory_entry_xnu.h>
115 #include <vm/vm_iokit.h>
116 #include <vm/vm_reclaim_xnu.h>
117 #if HAS_MTE
118 #include <arm64/mte_xnu.h>
119 #include <vm/vm_compressor_xnu.h>
120 #include <vm/vm_mteinfo_internal.h>
121 #include <sys/ubc.h>                        /* for mach_to_bsd_errno() */
122 #endif /* HAS_MTE */
123 
124 #include <sys/kern_memorystatus.h>
125 #include <sys/kern_memorystatus_freeze.h>
126 #include <sys/proc_internal.h>
127 
128 #include <mach-o/fixup-chains.h>
129 
130 #if CONFIG_MACF
131 #include <security/mac_framework.h>
132 #endif
133 
134 #include <kern/bits.h>
135 
136 #if CONFIG_CSR
137 #include <sys/csr.h>
138 #endif /* CONFIG_CSR */
139 #include <sys/trust_caches.h>
140 #include <libkern/amfi/amfi.h>
141 #include <IOKit/IOBSD.h>
142 
143 #if VM_MAP_DEBUG_APPLE_PROTECT
144 SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
145 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
146 
147 #if DEVELOPMENT || DEBUG
148 
149 extern int vm_object_cache_evict_all(void);
150 static int
151 sysctl_vm_object_cache_evict SYSCTL_HANDLER_ARGS
152 {
153 #pragma unused(arg1, arg2, req)
154 	(void) vm_object_cache_evict_all();
155 	return 0;
156 }
157 
158 SYSCTL_PROC(_vm, OID_AUTO, object_cache_evict, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
159     0, 0, &sysctl_vm_object_cache_evict, "I", "");
160 
161 static int
162 sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
163 {
164 #pragma unused(arg1, arg2)
165 	vm_offset_t     kaddr;
166 	kern_return_t   kr;
167 	int     error = 0;
168 	int     size = 0;
169 
170 	error = sysctl_handle_int(oidp, &size, 0, req);
171 	if (error || !req->newptr) {
172 		return error;
173 	}
174 
175 	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
176 	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);
177 
178 	if (kr == KERN_SUCCESS) {
179 		kmem_free(kernel_map, kaddr, size);
180 	}
181 
182 	return error;
183 }
184 
185 SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
186     0, 0, &sysctl_kmem_alloc_contig, "I", "");
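/*
 * Editor's illustrative sketch (not part of the original source): on
 * DEVELOPMENT or DEBUG kernels, the write-only vm.kmem_alloc_contig knob
 * registered above can be poked from user space with sysctlbyname(3) to
 * exercise a contiguous kernel allocation of the given size; the handler
 * frees the memory immediately on success.
 */
#if 0   /* user-space example, not compiled into the kernel */
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int size = 4 * 4096;    /* request four pages of contiguous memory */

	if (sysctlbyname("vm.kmem_alloc_contig", NULL, NULL, &size, sizeof(size)) != 0) {
		perror("vm.kmem_alloc_contig");
		return 1;
	}
	return 0;
}
#endif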
187 
188 extern int vm_region_footprint;
189 SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");
190 
191 static int
192 sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
193 {
194 #pragma unused(arg1, arg2, oidp)
195 	kmem_gobj_stats stats = kmem_get_gobj_stats();
196 
197 	return SYSCTL_OUT(req, &stats, sizeof(stats));
198 }
199 
200 SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
201     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
202     0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");
203 
204 #endif /* DEVELOPMENT || DEBUG */
205 
206 static int
207 sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
208 {
209 #pragma unused(arg1, arg2, oidp)
210 	int     error = 0;
211 	int     value;
212 
213 	value = task_self_region_footprint();
214 	error = SYSCTL_OUT(req, &value, sizeof(int));
215 	if (error) {
216 		return error;
217 	}
218 
219 	if (!req->newptr) {
220 		return 0;
221 	}
222 
223 	error = SYSCTL_IN(req, &value, sizeof(int));
224 	if (error) {
225 		return error;
226 	}
227 	task_self_region_footprint_set(value);
228 	return 0;
229 }
230 SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");
231 
232 static int
233 sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
234 {
235 #pragma unused(arg1, arg2, oidp)
236 	int     error = 0;
237 	int     value;
238 
239 	value = (1 << thread_self_region_page_shift());
240 	error = SYSCTL_OUT(req, &value, sizeof(int));
241 	if (error) {
242 		return error;
243 	}
244 
245 	if (!req->newptr) {
246 		return 0;
247 	}
248 
249 	error = SYSCTL_IN(req, &value, sizeof(int));
250 	if (error) {
251 		return error;
252 	}
253 
254 	if (value != 0 && value != 4096 && value != 16384) {
255 		return EINVAL;
256 	}
257 
258 #if !__ARM_MIXED_PAGE_SIZE__
259 	if (value != vm_map_page_size(current_map())) {
260 		return EINVAL;
261 	}
262 #endif /* !__ARM_MIXED_PAGE_SIZE__ */
263 
264 	thread_self_region_page_shift_set(bit_first(value));
265 	return 0;
266 }
267 SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
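/*
 * Editor's illustrative sketch (not part of the original source): reading
 * the calling thread's region page size and overriding it to 4K via the
 * vm.self_region_page_size knob registered above.  The handler accepts only
 * 0 (no override), 4096 or 16384, and on builds without
 * __ARM_MIXED_PAGE_SIZE__ rejects anything that differs from the map's
 * native page size.
 */
#if 0   /* user-space example, not compiled into the kernel */
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int cur = 0, desired = 4096;
	size_t len = sizeof(cur);

	if (sysctlbyname("vm.self_region_page_size", &cur, &len, &desired, sizeof(desired)) != 0) {
		perror("vm.self_region_page_size");
		return 1;
	}
	printf("previous region page size: %d\n", cur);
	return 0;
}
#endif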
268 
269 static int
270 sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
271 {
272 #pragma unused(arg1, arg2, oidp)
273 	int     error = 0;
274 	int     value;
275 	kern_return_t kr;
276 
277 	value = task_self_region_info_flags();
278 	error = SYSCTL_OUT(req, &value, sizeof(int));
279 	if (error) {
280 		return error;
281 	}
282 
283 	if (!req->newptr) {
284 		return 0;
285 	}
286 
287 	error = SYSCTL_IN(req, &value, sizeof(int));
288 	if (error) {
289 		return error;
290 	}
291 
292 	kr = task_self_region_info_flags_set(value);
293 	if (kr != KERN_SUCCESS) {
294 		return EINVAL;
295 	}
296 
297 	return 0;
298 }
299 SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");
300 
301 
302 #if DEVELOPMENT || DEBUG
303 extern int panic_on_unsigned_execute;
304 SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");
305 
306 extern int vm_log_xnu_user_debug;
307 SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
308 #endif /* DEVELOPMENT || DEBUG */
309 
310 extern int vm_log_map_delete_permanent_prot_none;
311 SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");
312 
313 extern int cs_executable_create_upl;
314 extern int cs_executable_wire;
315 SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
316 SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");
317 
318 extern int apple_protect_pager_count;
319 extern int apple_protect_pager_count_mapped;
320 extern unsigned int apple_protect_pager_cache_limit;
321 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
322 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
323 SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");
324 
325 #if DEVELOPMENT || DEBUG
326 extern int radar_20146450;
327 SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");
328 
329 extern int macho_printf;
330 SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");
331 
332 extern int apple_protect_pager_data_request_debug;
333 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");
334 
335 extern unsigned int vm_object_copy_delayed_paging_wait_disable;
336 EXPERIMENT_FACTOR_LEGACY_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");
337 
338 __enum_closed_decl(vm_submap_test_op, uint32_t, {
339 	vsto_make_submap = 1,  /* make submap from entries in current_map()
340 	                        * at start..end, offset ignored */
341 	vsto_remap_submap = 2, /* map in current_map() at start..end,
342 	                        * from parent address submap_base_address
343 	                        * and submap address offset */
344 	vsto_end
345 });
346 
347 static int
348 sysctl_vm_submap_test_ctl SYSCTL_HANDLER_ARGS
349 {
350 	int error;
351 	struct {
352 		vm_submap_test_op op;
353 		mach_vm_address_t submap_base_address;
354 		mach_vm_address_t start;
355 		mach_vm_address_t end;
356 		mach_vm_address_t offset;
357 	} args;
358 	if (req->newlen != sizeof(args)) {
359 		return EINVAL;
360 	}
361 	error = SYSCTL_IN(req, &args, sizeof(args));
362 	if (error) {
363 		return error;
364 	}
365 
366 	switch (args.op) {
367 	case vsto_make_submap:
368 		vm_map_testing_make_sealed_submap(current_map(), args.start, args.end);
369 		break;
370 	case vsto_remap_submap:
371 		vm_map_testing_remap_submap(current_map(),
372 		    args.submap_base_address, args.start, args.end, args.offset);
373 		break;
374 	default:
375 		return EINVAL;
376 	}
377 
378 	return 0;
379 }
380 SYSCTL_PROC(_vm, OID_AUTO, submap_test_ctl, CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, &sysctl_vm_submap_test_ctl, "-", "");
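/*
 * Editor's illustrative sketch (not part of the original source): driving
 * the write-only vm.submap_test_ctl knob above from a test (DEVELOPMENT or
 * DEBUG kernels only).  The handler insists that newlen match the kernel's
 * struct exactly, so this user-side mirror uses the same field types:
 * a uint32_t op plus four 64-bit addresses, with the compiler inserting
 * 4 bytes of padding after `op` on LP64, for 40 bytes total.
 */
#if 0   /* user-space test example, not compiled into the kernel */
#include <sys/sysctl.h>
#include <stdint.h>

static int
make_submap(uint64_t start, uint64_t end)
{
	struct {
		uint32_t op;                    /* vsto_make_submap == 1 */
		uint64_t submap_base_address;
		uint64_t start;
		uint64_t end;
		uint64_t offset;                /* ignored for vsto_make_submap */
	} args = { .op = 1, .start = start, .end = end };

	return sysctlbyname("vm.submap_test_ctl", NULL, NULL, &args, sizeof(args));
}
#endif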
381 
382 #if __arm64__
383 /* These are meant to support the page table accounting unit test. */
384 extern unsigned int arm_hardware_page_size;
385 extern unsigned int arm_pt_desc_size;
386 extern unsigned int arm_pt_root_size;
387 extern unsigned int inuse_user_tteroot_count;
388 extern unsigned int inuse_kernel_tteroot_count;
389 extern unsigned int inuse_user_ttepages_count;
390 extern unsigned int inuse_kernel_ttepages_count;
391 extern unsigned int inuse_user_ptepages_count;
392 extern unsigned int inuse_kernel_ptepages_count;
393 SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
394 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
395 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
396 SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
397 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
398 SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
399 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
400 SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
401 SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
402 #if !CONFIG_SPTM
403 extern unsigned int free_page_size_tt_count;
404 extern unsigned int free_tt_count;
405 SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
406 SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
407 #endif
408 #if DEVELOPMENT || DEBUG
409 extern unsigned long pmap_asid_flushes;
410 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
411 extern unsigned long pmap_asid_hits;
412 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
413 extern unsigned long pmap_asid_misses;
414 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
415 extern unsigned long pmap_speculation_restrictions;
416 SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
417 #endif
418 #endif /* __arm64__ */
419 #endif /* DEVELOPMENT || DEBUG */
420 
421 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
422 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
423 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
424 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
425 SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
426 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
427 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
428 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
429 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
430 #if VM_SCAN_FOR_SHADOW_CHAIN
431 static int vm_shadow_max_enabled = 0;    /* Disabled by default */
432 extern int proc_shadow_max(void);
433 static int
434 vm_shadow_max SYSCTL_HANDLER_ARGS
435 {
436 #pragma unused(arg1, arg2, oidp)
437 	int value = 0;
438 
439 	if (vm_shadow_max_enabled) {
440 		value = proc_shadow_max();
441 	}
442 
443 	return SYSCTL_OUT(req, &value, sizeof(value));
444 }
445 SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
446     0, 0, &vm_shadow_max, "I", "");
447 
448 SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");
449 
450 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
451 
452 SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");
453 
454 #if PAGE_SLEEP_WITH_INHERITOR
455 #if DEVELOPMENT || DEBUG
456 extern uint32_t page_worker_table_size;
457 SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
458 SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
459 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
460 SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
461 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
462 #endif /* DEVELOPMENT || DEBUG */
463 #endif /* PAGE_SLEEP_WITH_INHERITOR */
464 
465 #if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
466 extern uint32_t vm_cheads;
467 extern vm_chead_select_t vm_chead_select;
468 extern boolean_t vm_chead_rehint;
469 #if DEVELOPMENT || DEBUG
470 SYSCTL_UINT(_vm, OID_AUTO, compressor_heads, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cheads, 0, "");
471 SYSCTL_UINT(_vm, OID_AUTO, compressor_head_select, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_select, 0, "");
472 SYSCTL_INT(_vm, OID_AUTO, compressor_head_rehint, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_rehint, 0, "");
473 #endif /* DEVELOPMENT || DEBUG */
474 EXPERIMENT_FACTOR_UINT(compressor_heads, &vm_cheads, 1, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT, "");
475 EXPERIMENT_FACTOR_UINT(compressor_head_select, &vm_chead_select, CSEL_MIN, CSEL_MAX, "");
476 EXPERIMENT_FACTOR_INT(compressor_head_rehint, &vm_chead_rehint, 0, 1, "");
477 #endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */
478 
479 /*
480  * Sysctl's related to data/stack execution.  See osfmk/vm/vm_map.c
481  */
482 
483 #if DEVELOPMENT || DEBUG
484 extern int allow_stack_exec, allow_data_exec;
485 
486 SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
487 SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");
488 
489 #endif /* DEVELOPMENT || DEBUG */
490 
491 static const char *prot_values[] = {
492 	"none",
493 	"read-only",
494 	"write-only",
495 	"read-write",
496 	"execute-only",
497 	"read-execute",
498 	"write-execute",
499 	"read-write-execute"
500 };
501 
502 void
503 log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
504 {
505 	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
506 	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
507 }
508 
509 /*
510  * shared_region_unnest_logging: level of logging of unnesting events
511  * 0	- no logging
512  * 1	- throttled logging of unexpected unnesting events (default)
513  * 2	- unthrottled logging of unexpected unnesting events
514  * 3+	- unthrottled logging of all unnesting events
515  */
516 int shared_region_unnest_logging = 1;
517 
518 SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
519     &shared_region_unnest_logging, 0, "");
520 
521 int vm_shared_region_unnest_log_interval = 10;
522 int shared_region_unnest_log_count_threshold = 5;
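/*
 * Editor's illustrative sketch (not part of the original source): bumping
 * vm.shared_region_unnest_logging to level 2 so that every unexpected
 * unnesting event is reported without the rate throttle that
 * log_unnest_badness() below applies at the default level 1.
 */
#if 0   /* user-space example, not compiled into the kernel */
#include <sys/sysctl.h>

static int
log_all_unexpected_unnests(void)
{
	int level = 2;  /* 0 off, 1 throttled (default), 2 unthrottled, 3+ all events */

	return sysctlbyname("vm.shared_region_unnest_logging", NULL, NULL,
	           &level, sizeof(level));
}
#endif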
523 
524 
525 #if XNU_TARGET_OS_OSX
526 
527 #if defined (__x86_64__)
528 static int scdir_enforce = 1;
529 #else /* defined (__x86_64__) */
530 static int scdir_enforce = 0;   /* AOT caches live elsewhere */
531 #endif /* defined (__x86_64__) */
532 
533 static char *scdir_path[] = {
534 	"/System/Library/dyld/",
535 	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
536 	"/System/Cryptexes/OS/System/Library/dyld",
537 	NULL
538 };
539 
540 #else /* XNU_TARGET_OS_OSX */
541 
542 static int scdir_enforce = 0;
543 static char *scdir_path[] = {
544 	"/System/Library/Caches/com.apple.dyld/",
545 	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
546 	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
547 	NULL
548 };
549 
550 #endif /* XNU_TARGET_OS_OSX */
551 
552 static char *driverkit_scdir_path[] = {
553 	"/System/DriverKit/System/Library/dyld/",
554 #if XNU_TARGET_OS_OSX
555 	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
556 #else
557 	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
558 #endif /* XNU_TARGET_OS_OSX */
559 	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
560 	NULL
561 };
562 
563 #ifndef SECURE_KERNEL
564 static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
565 {
566 #if CONFIG_CSR
567 	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
568 		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
569 		return EPERM;
570 	}
571 #endif /* CONFIG_CSR */
572 	return sysctl_handle_int(oidp, arg1, arg2, req);
573 }
574 
575 SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
576 #endif
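/*
 * Editor's illustrative sketch (not part of the original source):
 * touching vm.enforce_shared_cache_dir.  On a system where SIP denies
 * CSR_ALLOW_UNRESTRICTED_FS, the handler above rejects the request with
 * EPERM before sysctl_handle_int() ever runs.
 */
#if 0   /* user-space example, not compiled into the kernel */
#include <sys/sysctl.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	int off = 0;

	if (sysctlbyname("vm.enforce_shared_cache_dir", NULL, NULL, &off, sizeof(off)) != 0) {
		/* expected to fail with EPERM when SIP is enabled */
		printf("set failed: errno=%d\n", errno);
	}
	return 0;
}
#endif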
577 
578 /* These log rate throttling state variables aren't thread safe, but
579  * are sufficient unto the task.
580  */
581 static int64_t last_unnest_log_time = 0;
582 static int shared_region_unnest_log_count = 0;
583 
584 void
585 log_unnest_badness(
586 	vm_map_t        m,
587 	vm_map_offset_t s,
588 	vm_map_offset_t e,
589 	boolean_t       is_nested_map,
590 	vm_map_offset_t lowest_unnestable_addr)
591 {
592 	struct timeval  tv;
593 
594 	if (shared_region_unnest_logging == 0) {
595 		return;
596 	}
597 
598 	if (shared_region_unnest_logging <= 2 &&
599 	    is_nested_map &&
600 	    s >= lowest_unnestable_addr) {
601 		/*
602 		 * Unnesting of writable map entries is fine.
603 		 */
604 		return;
605 	}
606 
607 	if (shared_region_unnest_logging <= 1) {
608 		microtime(&tv);
609 		if ((tv.tv_sec - last_unnest_log_time) <
610 		    vm_shared_region_unnest_log_interval) {
611 			if (shared_region_unnest_log_count++ >
612 			    shared_region_unnest_log_count_threshold) {
613 				return;
614 			}
615 		} else {
616 			last_unnest_log_time = tv.tv_sec;
617 			shared_region_unnest_log_count = 0;
618 		}
619 	}
620 
621 	DTRACE_VM4(log_unnest_badness,
622 	    vm_map_t, m,
623 	    vm_map_offset_t, s,
624 	    vm_map_offset_t, e,
625 	    vm_map_offset_t, lowest_unnestable_addr);
626 	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
627 }
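/*
 * Editor's illustrative sketch (not part of the original source): the
 * throttle in log_unnest_badness() above is a generic "at most N messages
 * per interval" pattern.  The same idea in isolation, with hypothetical
 * names, and deliberately racy just like the state variables above:
 */
#if 0   /* stand-alone illustration of the rate-limit pattern */
#include <stdbool.h>
#include <stdint.h>

static int64_t last_log_time;   /* seconds */
static int     log_count;

static bool
should_log(int64_t now, int interval, int threshold)
{
	if ((now - last_log_time) < interval) {
		/* still inside the window: allow only `threshold` more messages */
		return log_count++ <= threshold;
	}
	/* new window: remember its start and reset the counter */
	last_log_time = now;
	log_count = 0;
	return true;
}
#endif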
628 
629 uint64_t
630 vm_purge_filebacked_pagers(void)
631 {
632 	uint64_t pages_purged;
633 
634 	pages_purged = 0;
635 	pages_purged += apple_protect_pager_purge_all();
636 	pages_purged += shared_region_pager_purge_all();
637 	pages_purged += dyld_pager_purge_all();
638 #if DEVELOPMENT || DEBUG
639 	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
640 #endif /* DEVELOPMENT || DEBUG */
641 	return pages_purged;
642 }
643 
644 int
645 useracc(
646 	user_addr_ut    addr_u,
647 	user_size_ut    len_u,
648 	int             prot)
649 {
650 	vm_map_t        map;
651 	vm_prot_t       vm_prot = VM_PROT_WRITE;
652 
653 	map = current_map();
654 
655 	if (prot == B_READ) {
656 		vm_prot = VM_PROT_READ;
657 	}
658 
659 	return vm_map_check_protection(map, addr_u,
660 	           vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
661 	           VM_SANITIZE_CALLER_USERACC);
662 }
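/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical kernel caller using useracc() to pre-flight a user buffer.
 * prot == B_READ requests a VM_PROT_READ check; any other value (B_WRITE
 * here) is treated as a write check.
 */
#if 0   /* in-kernel usage sketch, assuming this file's headers */
static int
can_copy_out_to(user_addr_ut uaddr, user_size_ut ulen)
{
	/* non-zero means every page in [uaddr, uaddr + ulen) is writable */
	return useracc(uaddr, ulen, B_WRITE);
}
#endif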
663 
664 #if XNU_PLATFORM_MacOSX
665 static __attribute__((always_inline, warn_unused_result))
666 kern_return_t
667 vslock_sanitize(
668 	vm_map_t                map,
669 	user_addr_ut            addr_u,
670 	user_size_ut            len_u,
671 	vm_sanitize_caller_t    vm_sanitize_caller,
672 	vm_map_offset_t        *start,
673 	vm_map_offset_t        *end,
674 	vm_map_size_t          *size)
675 {
676 	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
677 	           map,
678 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
679 	           size);
680 }
681 #endif /* XNU_PLATFORM_MacOSX */
682 
683 int
684 vslock(user_addr_ut addr, user_size_ut len)
685 {
686 	kern_return_t kret;
687 
688 #if XNU_PLATFORM_MacOSX
689 	/*
690 	 * Preserve previous behavior on macOS for overflows due to bin
691 	 * compatibility i.e. return success for overflows without doing
692 	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
693 	 * for overflow errors which gets converted to KERN_SUCCESS by
694 	 * vm_sanitize_get_kr.
695 	 */
696 	vm_map_offset_t start, end;
697 	vm_map_size_t   size;
698 
699 	kret = vslock_sanitize(current_map(),
700 	    addr,
701 	    len,
702 	    VM_SANITIZE_CALLER_VSLOCK,
703 	    &start,
704 	    &end,
705 	    &size);
706 	if (__improbable(kret != KERN_SUCCESS)) {
707 		switch (vm_sanitize_get_kr(kret)) {
708 		case KERN_SUCCESS:
709 			return 0;
710 		case KERN_INVALID_ADDRESS:
711 		case KERN_NO_SPACE:
712 			return ENOMEM;
713 		case KERN_PROTECTION_FAILURE:
714 			return EACCES;
715 		default:
716 			return EINVAL;
717 		}
718 	}
719 #endif /* XNU_PLATFORM_MacOSX */
720 
721 	kret = vm_map_wire_kernel(current_map(), addr,
722 	    vm_sanitize_compute_ut_end(addr, len),
723 	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
724 	    VM_KERN_MEMORY_BSD,
725 	    FALSE);
726 
727 	switch (kret) {
728 	case KERN_SUCCESS:
729 		return 0;
730 	case KERN_INVALID_ADDRESS:
731 	case KERN_NO_SPACE:
732 		return ENOMEM;
733 	case KERN_PROTECTION_FAILURE:
734 		return EACCES;
735 	default:
736 		return EINVAL;
737 	}
738 }
739 
740 int
741 vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
742 {
743 #if FIXME  /* [ */
744 	pmap_t          pmap;
745 	vm_page_t       pg;
746 	vm_map_offset_t vaddr;
747 	ppnum_t         paddr;
748 #endif  /* FIXME ] */
749 	kern_return_t   kret;
750 	vm_map_t        map;
751 
752 	map = current_map();
753 
754 #if FIXME  /* [ */
755 	if (dirtied) {
756 		pmap = get_task_pmap(current_task());
757 		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
758 		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
759 		    vaddr += PAGE_SIZE) {
760 			paddr = pmap_find_phys(pmap, vaddr);
761 			pg = PHYS_TO_VM_PAGE(paddr);
762 			vm_page_set_modified(pg);
763 		}
764 	}
765 #endif  /* FIXME ] */
766 #ifdef  lint
767 	dirtied++;
768 #endif  /* lint */
769 
770 #if XNU_PLATFORM_MacOSX
771 	/*
772 	 * Preserve previous behavior on macOS for overflows due to bin
773 	 * compatibility i.e. return success for overflows without doing
774 	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
775 	 * for overflow errors which gets converted to KERN_SUCCESS by
776 	 * vm_sanitize_get_kr.
777 	 */
778 	vm_map_offset_t start, end;
779 	vm_map_size_t   size;
780 
781 	kret = vslock_sanitize(map,
782 	    addr,
783 	    len,
784 	    VM_SANITIZE_CALLER_VSUNLOCK,
785 	    &start,
786 	    &end,
787 	    &size);
788 	if (__improbable(kret != KERN_SUCCESS)) {
789 		switch (vm_sanitize_get_kr(kret)) {
790 		case KERN_SUCCESS:
791 			return 0;
792 		case KERN_INVALID_ADDRESS:
793 		case KERN_NO_SPACE:
794 			return ENOMEM;
795 		case KERN_PROTECTION_FAILURE:
796 			return EACCES;
797 		default:
798 			return EINVAL;
799 		}
800 	}
801 #endif /* XNU_PLATFORM_MacOSX */
802 
803 	kret = vm_map_unwire(map, addr,
804 	    vm_sanitize_compute_ut_end(addr, len), false);
805 	switch (kret) {
806 	case KERN_SUCCESS:
807 		return 0;
808 	case KERN_INVALID_ADDRESS:
809 	case KERN_NO_SPACE:
810 		return ENOMEM;
811 	case KERN_PROTECTION_FAILURE:
812 		return EACCES;
813 	default:
814 		return EINVAL;
815 	}
816 }
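/*
 * Editor's illustrative sketch (not part of the original source): the
 * classic vslock()/vsunlock() bracket, wiring a user buffer so it cannot
 * be paged out while the kernel works on it, then unwiring it.  The
 * helper name is hypothetical.
 */
#if 0   /* in-kernel usage sketch, assuming this file's headers */
static int
with_wired_user_buffer(user_addr_ut uaddr, user_size_ut ulen)
{
	int error = vslock(uaddr, ulen);

	if (error) {
		return error;   /* ENOMEM, EACCES or EINVAL, as mapped above */
	}
	/* ... operate on the wired range, e.g. via copyin()/copyout() ... */
	return vsunlock(uaddr, ulen, 1 /* dirtied */);
}
#endif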
817 
818 int
819 subyte(
820 	user_addr_t addr,
821 	int byte)
822 {
823 	char character;
824 
825 	character = (char)byte;
826 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
827 }
828 
829 int
830 suibyte(
831 	user_addr_t addr,
832 	int byte)
833 {
834 	char character;
835 
836 	character = (char)byte;
837 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
838 }
839 
840 int
841 fubyte(user_addr_t addr)
842 {
843 	unsigned char byte;
844 
845 	if (copyin(addr, (void *) &byte, sizeof(char))) {
846 		return -1;
847 	}
848 	return byte;
849 }
850 
851 int
852 fuibyte(user_addr_t addr)
853 {
854 	unsigned char byte;
855 
856 	if (copyin(addr, (void *) &(byte), sizeof(char))) {
857 		return -1;
858 	}
859 	return byte;
860 }
861 
862 int
863 suword(
864 	user_addr_t addr,
865 	long word)
866 {
867 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
868 }
869 
870 long
871 fuword(user_addr_t addr)
872 {
873 	long word = 0;
874 
875 	if (copyin(addr, (void *) &word, sizeof(int))) {
876 		return -1;
877 	}
878 	return word;
879 }
880 
881 /* suiword and fuiword are the same as suword and fuword, respectively */
882 
883 int
884 suiword(
885 	user_addr_t addr,
886 	long word)
887 {
888 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
889 }
890 
891 long
892 fuiword(user_addr_t addr)
893 {
894 	long word = 0;
895 
896 	if (copyin(addr, (void *) &word, sizeof(int))) {
897 		return -1;
898 	}
899 	return word;
900 }
901 
902 /*
903  * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
904  * fetching and setting of process-sized size_t and pointer values.
905  */
906 int
907 sulong(user_addr_t addr, int64_t word)
908 {
909 	if (IS_64BIT_PROCESS(current_proc())) {
910 		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
911 	} else {
912 		return suiword(addr, (long)word);
913 	}
914 }
915 
916 int64_t
917 fulong(user_addr_t addr)
918 {
919 	int64_t longword;
920 
921 	if (IS_64BIT_PROCESS(current_proc())) {
922 		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
923 			return -1;
924 		}
925 		return longword;
926 	} else {
927 		return (int64_t)fuiword(addr);
928 	}
929 }
930 
931 int
932 suulong(user_addr_t addr, uint64_t uword)
933 {
934 	if (IS_64BIT_PROCESS(current_proc())) {
935 		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
936 	} else {
937 		return suiword(addr, (uint32_t)uword);
938 	}
939 }
940 
941 uint64_t
942 fuulong(user_addr_t addr)
943 {
944 	uint64_t ulongword;
945 
946 	if (IS_64BIT_PROCESS(current_proc())) {
947 		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
948 			return -1ULL;
949 		}
950 		return ulongword;
951 	} else {
952 		return (uint64_t)fuiword(addr);
953 	}
954 }
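/*
 * Editor's illustrative sketch (not part of the original source): fulong()
 * returns -1 both on a fault and when the user word is genuinely -1, so a
 * caller that must distinguish the two cases can use copyin() directly.
 * The helper name is hypothetical, and the sketch assumes a 64-bit
 * process (fulong() would go through fuiword() otherwise).
 */
#if 0   /* in-kernel usage sketch */
static int
fetch_user_long_checked(user_addr_t uaddr, int64_t *out)
{
	int error = copyin(uaddr, out, sizeof(*out));

	if (error) {
		return error;   /* a real fault, unlike fulong()'s ambiguous -1 */
	}
	return 0;
}
#endif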
955 
956 int
957 swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
958 {
959 	return ENOTSUP;
960 }
961 
962 #if defined(SECURE_KERNEL)
963 static int kern_secure_kernel = 1;
964 #else
965 static int kern_secure_kernel = 0;
966 #endif
967 
968 SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
969 SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
970     &shared_region_trace_level, 0, "");
971 SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
972     &shared_region_version, 0, "");
973 SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
974     &shared_region_persistence, 0, "");
975 
976 /*
977  * shared_region_check_np:
978  *
979  * This system call is intended for dyld.
980  *
981  * dyld calls this when any process starts to see if the process's shared
982  * region is already set up and ready to use.
983  * This call returns the base address of the first mapping in the
984  * process's shared region.
985  * dyld will then check what's mapped at that address.
986  *
987  * If the shared region is empty, dyld will then attempt to map the shared
988  * cache file in the shared region via the shared_region_map_and_slide_2_np()
989  * system call.
990  *
991  * If something's already mapped in the shared region, dyld will check if it
992  * matches the shared cache it would like to use for that process.
993  * If it matches, everything's ready and the process can proceed and use the
994  * shared region.
995  * If it doesn't match, dyld will unmap the shared region and map the shared
996  * cache into the process's address space via mmap().
997  *
998  * A NULL pointer argument can be used by dyld to indicate it has unmapped
999  * the shared region. We will remove the shared_region reference from the task.
1000  *
1001  * ERROR VALUES
1002  * EINVAL	no shared region
1003  * ENOMEM	shared region is empty
1004  * EFAULT	bad address for "start_address"
1005  */
1006 int
1007 shared_region_check_np(
1008 	__unused struct proc                    *p,
1009 	struct shared_region_check_np_args      *uap,
1010 	__unused int                            *retvalp)
1011 {
1012 	vm_shared_region_t      shared_region;
1013 	mach_vm_offset_t        start_address = 0;
1014 	int                     error = 0;
1015 	kern_return_t           kr = KERN_FAILURE;
1016 	task_t                  task = current_task();
1017 
1018 	SHARED_REGION_TRACE_DEBUG(
1019 		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
1020 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1021 		proc_getpid(p), p->p_comm,
1022 		(uint64_t)uap->start_address));
1023 
1024 	/*
1025 	 * Special value of start_address used to indicate that map_with_linking() should
1026 	 * no longer be allowed in this process
1027 	 */
1028 	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
1029 		p->p_disallow_map_with_linking = TRUE;
1030 		return 0;
1031 	}
1032 
1033 	/* retrieve the current task's shared region */
1034 	shared_region = vm_shared_region_get(task);
1035 	if (shared_region != NULL) {
1036 		/*
1037 		 * A NULL argument is used by dyld to indicate the task
1038 		 * has unmapped its shared region.
1039 		 */
1040 		if (uap->start_address == 0) {
1041 			/* unmap it first */
1042 			vm_shared_region_remove(task, shared_region);
1043 			vm_shared_region_set(task, NULL);
1044 		} else {
1045 			/* retrieve address of its first mapping... */
1046 			kr = vm_shared_region_start_address(shared_region, &start_address);
1047 			if (kr != KERN_SUCCESS) {
1048 				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1049 				    "check_np(0x%llx) "
1050 				    "vm_shared_region_start_address() returned 0x%x\n",
1051 				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1052 				    proc_getpid(p), p->p_comm,
1053 				    (uint64_t)uap->start_address, kr));
1054 				error = ENOMEM;
1055 			}
1056 			if (error == 0) {
1057 				/* Insert the shared region submap and various bits of debug info into the task. */
1058 				kr = vm_shared_region_update_task(task, shared_region, start_address);
1059 				if (kr != KERN_SUCCESS) {
1060 					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1061 					    "check_np(0x%llx) "
1062 					    "vm_shared_region_update_task() returned 0x%x\n",
1063 					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1064 					    proc_getpid(p), p->p_comm,
1065 					    (uint64_t)uap->start_address, kr));
1066 
1067 					error = ENOMEM;
1068 				}
1069 			}
1070 #if __has_feature(ptrauth_calls)
1071 			/*
1072 			 * Remap any section of the shared library that
1073 			 * has authenticated pointers into private memory.
1074 			 */
1075 			if ((error == 0) && (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS)) {
1076 				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1077 				    "check_np(0x%llx) "
1078 				    "vm_shared_region_auth_remap() failed\n",
1079 				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1080 				    proc_getpid(p), p->p_comm,
1081 				    (uint64_t)uap->start_address));
1082 				error = ENOMEM;
1083 			}
1084 #endif /* __has_feature(ptrauth_calls) */
1085 			/* Give the start address to the caller */
1086 			if (error == 0) {
1087 				error = copyout(&start_address,
1088 				    (user_addr_t) uap->start_address,
1089 				    sizeof(start_address));
1090 				if (error != 0) {
1091 					SHARED_REGION_TRACE_ERROR(
1092 						("shared_region: %p [%d(%s)] "
1093 						"check_np(0x%llx) "
1094 						"copyout(0x%llx) error %d\n",
1095 						(void *)VM_KERNEL_ADDRPERM(current_thread()),
1096 						proc_getpid(p), p->p_comm,
1097 						(uint64_t)uap->start_address, (uint64_t)start_address,
1098 						error));
1099 				}
1100 			}
1101 		}
1102 		vm_shared_region_deallocate(shared_region);
1103 	} else {
1104 		/* no shared region ! */
1105 		error = EINVAL;
1106 	}
1107 
1108 	SHARED_REGION_TRACE_DEBUG(
1109 		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
1110 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1111 		proc_getpid(p), p->p_comm,
1112 		(uint64_t)uap->start_address, (uint64_t)start_address, error));
1113 
1114 	return error;
1115 }
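/*
 * Editor's illustrative sketch (not part of the original source): the dyld
 * side of the protocol described above, via the private Libsyscall stub.
 * The stub name and signature here follow the dyld sources; this is SPI
 * and is shown purely for illustration.
 */
#if 0   /* user-space (dyld) sketch */
#include <stdint.h>

extern int __shared_region_check_np(uint64_t *start_address);

static int
shared_cache_is_mapped(uint64_t *base)
{
	/* 0 means a shared region exists; *base is its first mapping's address */
	return __shared_region_check_np(base) == 0;
}
#endif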
1116 
1117 
1118 static int
1119 shared_region_copyin(
1120 	struct proc  *p,
1121 	user_addr_t  user_addr,
1122 	unsigned int count,
1123 	unsigned int element_size,
1124 	void         *kernel_data)
1125 {
1126 	int             error = 0;
1127 	vm_size_t       size = count * element_size;
1128 
1129 	error = copyin(user_addr, kernel_data, size);
1130 	if (error) {
1131 		SHARED_REGION_TRACE_ERROR(
1132 			("shared_region: %p [%d(%s)] map(): "
1133 			"copyin(0x%llx, %ld) failed (error=%d)\n",
1134 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1135 			proc_getpid(p), p->p_comm,
1136 			(uint64_t)user_addr, (long)size, error));
1137 	}
1138 	return error;
1139 }
1140 
1141 /*
1142  * A reasonable upper limit to prevent overflow of allocation/copyin.
1143  */
1144 #define _SR_FILE_MAPPINGS_MAX_FILES 256
1145 
1146 /* forward declaration */
1147 __attribute__((noinline))
1148 static void shared_region_map_and_slide_cleanup(
1149 	struct proc              *p,
1150 	uint32_t                 files_count,
1151 	struct _sr_file_mappings *sr_file_mappings,
1152 	struct vm_shared_region  *shared_region);
1153 
1154 /*
1155  * Setup part of _shared_region_map_and_slide().
1156  * It had to be broken out of _shared_region_map_and_slide() to
1157  * prevent compiler inlining from blowing out the stack.
1158  */
1159 __attribute__((noinline))
1160 static int
1161 shared_region_map_and_slide_setup(
1162 	struct proc                         *p,
1163 	uint32_t                            files_count,
1164 	struct shared_file_np               *files,
1165 	uint32_t                            mappings_count,
1166 	struct shared_file_mapping_slide_np *mappings,
1167 	struct _sr_file_mappings            **sr_file_mappings,
1168 	struct vm_shared_region             **shared_region_ptr,
1169 	struct vnode                        *rdir_vp)
1170 {
1171 	int                             error = 0;
1172 	struct _sr_file_mappings        *srfmp;
1173 	uint32_t                        mappings_next;
1174 	struct vnode_attr               va;
1175 	off_t                           fs;
1176 #if CONFIG_MACF
1177 	vm_prot_t                       maxprot = VM_PROT_ALL;
1178 #endif
1179 	uint32_t                        i;
1180 	struct vm_shared_region         *shared_region = NULL;
1181 	boolean_t                       is_driverkit = task_is_driver(current_task());
1182 
1183 	SHARED_REGION_TRACE_DEBUG(
1184 		("shared_region: %p [%d(%s)] -> map_and_slide_setup\n",
1185 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1186 		proc_getpid(p), p->p_comm));
1187 
1188 	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
1189 		error = E2BIG;
1190 		goto done;
1191 	}
1192 	if (files_count == 0) {
1193 		error = EINVAL;
1194 		goto done;
1195 	}
1196 	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
1197 	    Z_WAITOK | Z_ZERO);
1198 	if (*sr_file_mappings == NULL) {
1199 		error = ENOMEM;
1200 		goto done;
1201 	}
1202 	mappings_next = 0;
1203 	for (i = 0; i < files_count; i++) {
1204 		srfmp = &(*sr_file_mappings)[i];
1205 		srfmp->fd = files[i].sf_fd;
1206 		srfmp->mappings_count = files[i].sf_mappings_count;
1207 		srfmp->mappings = &mappings[mappings_next];
1208 		mappings_next += srfmp->mappings_count;
1209 		if (mappings_next > mappings_count) {
1210 			error = EINVAL;
1211 			goto done;
1212 		}
1213 		srfmp->slide = files[i].sf_slide;
1214 	}
1215 
1216 	/* get the process's shared region (setup in vm_map_exec()) */
1217 	shared_region = vm_shared_region_get(current_task());
1218 	*shared_region_ptr = shared_region;
1219 	if (shared_region == NULL) {
1220 		SHARED_REGION_TRACE_ERROR(
1221 			("shared_region: %p [%d(%s)] map(): "
1222 			"no shared region\n",
1223 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1224 			proc_getpid(p), p->p_comm));
1225 		error = EINVAL;
1226 		goto done;
1227 	}
1228 
1229 	/*
1230 	 * Check that the shared region matches the current root
1231 	 * directory of this process.  If not, deny the mapping to
1232 	 * avoid tainting the shared region with something that
1233 	 * doesn't quite belong in it.
1234 	 */
1235 	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
1236 	if (sr_vnode != NULL ?  rdir_vp != sr_vnode : rdir_vp != rootvnode) {
1237 		SHARED_REGION_TRACE_ERROR(
1238 			("shared_region: map(%p) root_dir mismatch\n",
1239 			(void *)VM_KERNEL_ADDRPERM(current_thread())));
1240 		error = EPERM;
1241 		goto done;
1242 	}
1243 
1244 
1245 	for (srfmp = &(*sr_file_mappings)[0];
1246 	    srfmp < &(*sr_file_mappings)[files_count];
1247 	    srfmp++) {
1248 		if (srfmp->mappings_count == 0) {
1249 			/* no mappings here... */
1250 			continue;
1251 		}
1252 
1253 		/*
1254 		 * A file descriptor of -1 is used to indicate that the data
1255 		 * to be put in the shared region for this mapping comes directly
1256 		 * from the process's address space. Ensure we have proper alignment.
1257 		 */
1258 		if (srfmp->fd == -1) {
1259 			/* only allow one mapping per fd */
1260 			if (srfmp->mappings_count > 1) {
1261 				SHARED_REGION_TRACE_ERROR(
1262 					("shared_region: %p [%d(%s)] map data >1 mapping\n",
1263 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1264 					proc_getpid(p), p->p_comm));
1265 				error = EINVAL;
1266 				goto done;
1267 			}
1268 
1269 			/*
1270 			 * The destination address and size must be page aligned.
1271 			 */
1272 			struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
1273 			mach_vm_address_t dest_addr = mapping->sms_address;
1274 			mach_vm_size_t    map_size = mapping->sms_size;
1275 			if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
1276 				SHARED_REGION_TRACE_ERROR(
1277 					("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
1278 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1279 					proc_getpid(p), p->p_comm, dest_addr));
1280 				error = EINVAL;
1281 				goto done;
1282 			}
1283 			if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
1284 				SHARED_REGION_TRACE_ERROR(
1285 					("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
1286 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1287 					proc_getpid(p), p->p_comm, map_size));
1288 				error = EINVAL;
1289 				goto done;
1290 			}
1291 			continue;
1292 		}
1293 
1294 		/* get file structure from file descriptor */
1295 		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
1296 		if (error) {
1297 			SHARED_REGION_TRACE_ERROR(
1298 				("shared_region: %p [%d(%s)] map: "
1299 				"fd=%d lookup failed (error=%d)\n",
1300 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1301 				proc_getpid(p), p->p_comm, srfmp->fd, error));
1302 			goto done;
1303 		}
1304 
1305 		/* we need at least read permission on the file */
1306 		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
1307 			SHARED_REGION_TRACE_ERROR(
1308 				("shared_region: %p [%d(%s)] map: "
1309 				"fd=%d not readable\n",
1310 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1311 				proc_getpid(p), p->p_comm, srfmp->fd));
1312 			error = EPERM;
1313 			goto done;
1314 		}
1315 
1316 		/* get vnode from file structure */
1317 		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
1318 		if (error) {
1319 			SHARED_REGION_TRACE_ERROR(
1320 				("shared_region: %p [%d(%s)] map: "
1321 				"fd=%d getwithref failed (error=%d)\n",
1322 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1323 				proc_getpid(p), p->p_comm, srfmp->fd, error));
1324 			goto done;
1325 		}
1326 		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);
1327 
1328 		/* make sure the vnode is a regular file */
1329 		if (srfmp->vp->v_type != VREG) {
1330 			SHARED_REGION_TRACE_ERROR(
1331 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1332 				"not a file (type=%d)\n",
1333 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1334 				proc_getpid(p), p->p_comm,
1335 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1336 				srfmp->vp->v_name, srfmp->vp->v_type));
1337 			error = EINVAL;
1338 			goto done;
1339 		}
1340 
1341 #if CONFIG_MACF
1342 		/* pass in 0 for the offset argument because AMFI does not need the offset
1343 		 *       of the shared cache */
1344 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
1345 		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
1346 		if (error) {
1347 			goto done;
1348 		}
1349 #endif /* MAC */
1350 
1351 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1352 		/*
1353 		 * Check if the shared cache is in the trust cache;
1354 		 * if so, we can skip the root ownership check.
1355 		 */
1356 #if DEVELOPMENT || DEBUG
1357 		/*
1358 		 * Skip both root ownership and trust cache check if
1359 		 * enforcement is disabled.
1360 		 */
1361 		if (!cs_system_enforcement()) {
1362 			goto after_root_check;
1363 		}
1364 #endif /* DEVELOPMENT || DEBUG */
1365 		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
1366 		if (blob == NULL) {
1367 			SHARED_REGION_TRACE_ERROR(
1368 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1369 				"missing CS blob\n",
1370 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1371 				proc_getpid(p), p->p_comm,
1372 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1373 				srfmp->vp->v_name));
1374 			goto root_check;
1375 		}
1376 		const uint8_t *cdhash = csblob_get_cdhash(blob);
1377 		if (cdhash == NULL) {
1378 			SHARED_REGION_TRACE_ERROR(
1379 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1380 				"missing cdhash\n",
1381 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1382 				proc_getpid(p), p->p_comm,
1383 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1384 				srfmp->vp->v_name));
1385 			goto root_check;
1386 		}
1387 
1388 		bool in_trust_cache = false;
1389 		TrustCacheQueryToken_t qt;
1390 		if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
1391 			TCType_t tc_type = kTCTypeInvalid;
1392 			TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
1393 			in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
1394 			    (tc_type == kTCTypeCryptex1BootOS ||
1395 			    tc_type == kTCTypeStatic ||
1396 			    tc_type == kTCTypeEngineering));
1397 		}
1398 		if (!in_trust_cache) {
1399 			SHARED_REGION_TRACE_ERROR(
1400 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1401 				"not in trust cache\n",
1402 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1403 				proc_getpid(p), p->p_comm,
1404 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1405 				srfmp->vp->v_name));
1406 			goto root_check;
1407 		}
1408 		goto after_root_check;
1409 root_check:
1410 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1411 
1412 		/* The shared cache file must be owned by root */
1413 		VATTR_INIT(&va);
1414 		VATTR_WANTED(&va, va_uid);
1415 		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1416 		if (error) {
1417 			SHARED_REGION_TRACE_ERROR(
1418 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1419 				"vnode_getattr(%p) failed (error=%d)\n",
1420 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1421 				proc_getpid(p), p->p_comm,
1422 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1423 				srfmp->vp->v_name,
1424 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1425 				error));
1426 			goto done;
1427 		}
1428 		if (va.va_uid != 0) {
1429 			SHARED_REGION_TRACE_ERROR(
1430 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1431 				"owned by uid=%d instead of 0\n",
1432 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1433 				proc_getpid(p), p->p_comm,
1434 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1435 				srfmp->vp->v_name, va.va_uid));
1436 			error = EPERM;
1437 			goto done;
1438 		}
1439 
1440 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1441 after_root_check:
1442 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1443 
1444 #if CONFIG_CSR
1445 		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
1446 			VATTR_INIT(&va);
1447 			VATTR_WANTED(&va, va_flags);
1448 			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1449 			if (error) {
1450 				SHARED_REGION_TRACE_ERROR(
1451 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1452 					"vnode_getattr(%p) failed (error=%d)\n",
1453 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1454 					proc_getpid(p), p->p_comm,
1455 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1456 					srfmp->vp->v_name,
1457 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1458 					error));
1459 				goto done;
1460 			}
1461 
1462 			if (!(va.va_flags & SF_RESTRICTED)) {
1463 				/*
1464 				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
1465 				 * the shared cache file is NOT SIP-protected, so reject the
1466 				 * mapping request
1467 				 */
1468 				SHARED_REGION_TRACE_ERROR(
1469 					("shared_region: %p [%d(%s)] map(%p:'%s'), "
1470 					"vnode is not SIP-protected\n",
1471 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1472 					proc_getpid(p), p->p_comm,
1473 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1474 					srfmp->vp->v_name));
1475 				error = EPERM;
1476 				goto done;
1477 			}
1478 		}
1479 #else /* CONFIG_CSR */
1480 
1481 		/*
1482 		 * Devices without SIP/ROSP need to make sure that the shared cache
1483 		 * is either on the root volume or in the preboot cryptex volume.
1484 		 */
1485 		assert(rdir_vp != NULL);
1486 		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
1487 			vnode_t preboot_vp = NULL;
1488 #if XNU_TARGET_OS_OSX
1489 #define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
1490 #else
1491 #define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
1492 #endif
1493 			error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
1494 			if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
1495 				SHARED_REGION_TRACE_ERROR(
1496 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1497 					"not on process' root volume nor preboot volume\n",
1498 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1499 					proc_getpid(p), p->p_comm,
1500 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1501 					srfmp->vp->v_name));
1502 				error = EPERM;
1503 				if (preboot_vp) {
1504 					(void)vnode_put(preboot_vp);
1505 				}
1506 				goto done;
1507 			} else if (preboot_vp) {
1508 				(void)vnode_put(preboot_vp);
1509 			}
1510 		}
1511 #endif /* CONFIG_CSR */
1512 
1513 		if (scdir_enforce) {
1514 			char **expected_scdir_path;
1515 			struct vnode *scdir_vp = NULL;
1516 			for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1517 			    *expected_scdir_path != NULL;
1518 			    expected_scdir_path++) {
1519 				/* get vnode for expected_scdir_path */
1520 				error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
1521 				if (error) {
1522 					SHARED_REGION_TRACE_ERROR(
1523 						("shared_region: %p [%d(%s)]: "
1524 						"vnode_lookup(%s) failed (error=%d)\n",
1525 						(void *)VM_KERNEL_ADDRPERM(current_thread()),
1526 						proc_getpid(p), p->p_comm,
1527 						*expected_scdir_path, error));
1528 					continue;
1529 				}
1530 
1531 				/* check if parent is scdir_vp */
1532 				assert(scdir_vp != NULL);
1533 				if (vnode_parent(srfmp->vp) == scdir_vp) {
1534 					(void)vnode_put(scdir_vp);
1535 					scdir_vp = NULL;
1536 					goto scdir_ok;
1537 				}
1538 				(void)vnode_put(scdir_vp);
1539 				scdir_vp = NULL;
1540 			}
1541 			/* nothing matches */
1542 			SHARED_REGION_TRACE_ERROR(
1543 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1544 				"shared cache file not in expected directory\n",
1545 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1546 				proc_getpid(p), p->p_comm,
1547 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1548 				srfmp->vp->v_name));
1549 			error = EPERM;
1550 			goto done;
1551 		}
1552 scdir_ok:
1553 
1554 		/* get vnode size */
1555 		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
1556 		if (error) {
1557 			SHARED_REGION_TRACE_ERROR(
1558 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1559 				"vnode_size(%p) failed (error=%d)\n",
1560 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1561 				proc_getpid(p), p->p_comm,
1562 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1563 				srfmp->vp->v_name,
1564 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
1565 			goto done;
1566 		}
1567 		srfmp->file_size = fs;
1568 
1569 		/* get the file's memory object handle */
1570 		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
1571 		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
1572 			SHARED_REGION_TRACE_ERROR(
1573 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
1574 				"no memory object\n",
1575 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
1576 				proc_getpid(p), p->p_comm,
1577 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1578 				srfmp->vp->v_name));
1579 			error = EINVAL;
1580 			goto done;
1581 		}
1582 
1583 		/* check that the mappings are properly covered by code signatures */
1584 		if (!cs_system_enforcement()) {
1585 			/* code signing is not enforced: no need to check */
1586 		} else {
1587 			for (i = 0; i < srfmp->mappings_count; i++) {
1588 				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
1589 					/* zero-filled mapping: not backed by the file */
1590 					continue;
1591 				}
1592 				if (ubc_cs_is_range_codesigned(srfmp->vp,
1593 				    srfmp->mappings[i].sms_file_offset,
1594 				    srfmp->mappings[i].sms_size)) {
1595 					/* this mapping is fully covered by code signatures */
1596 					continue;
1597 				}
1598 				SHARED_REGION_TRACE_ERROR(
1599 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
1600 					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
1601 					"is not code-signed\n",
1602 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
1603 					proc_getpid(p), p->p_comm,
1604 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1605 					srfmp->vp->v_name,
1606 					i, srfmp->mappings_count,
1607 					srfmp->mappings[i].sms_address,
1608 					srfmp->mappings[i].sms_size,
1609 					srfmp->mappings[i].sms_file_offset,
1610 					srfmp->mappings[i].sms_max_prot,
1611 					srfmp->mappings[i].sms_init_prot));
1612 				error = EINVAL;
1613 				goto done;
1614 			}
1615 		}
1616 	}
1617 done:
1618 	if (error != 0) {
1619 		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
1620 		*sr_file_mappings = NULL;
1621 		*shared_region_ptr = NULL;
1622 	}
1623 	SHARED_REGION_TRACE_DEBUG(
1624 		("shared_region: %p [%d(%s)] map_and_slide_setup <- %d\n",
1625 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1626 		proc_getpid(p), p->p_comm, error));
1627 	return error;
1628 }
1629 
1630 /*
1631  * shared_region_map_np()
1632  *
1633  * This system call is intended for dyld.
1634  *
1635  * dyld uses this to map a shared cache file into a shared region.
1636  * This is usually done only the first time a shared cache is needed.
1637  * Subsequent processes will just use the populated shared region without
1638  * requiring any further setup.
1639  */
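/*
 * Illustrative sketch (editor's annotation, not part of this file):
 * roughly how a caller such as dyld might populate the arguments that
 * flow into _shared_region_map_and_slide() below. The wrapper name, the
 * fd, and the sizes are hypothetical; the struct fields are the ones
 * this file actually reads (sf_mappings_count, sf_slide, sms_*).
 */
#if 0
	struct shared_file_np file = {
		.sf_fd             = cache_fd,  /* fd for the shared cache file */
		.sf_mappings_count = 2,
		.sf_slide          = max_slide, /* upper bound; kernel picks the slide */
	};
	struct shared_file_mapping_slide_np maps[2] = {
		{
			.sms_address     = sr_base,             /* where TEXT should land */
			.sms_size        = text_size,
			.sms_file_offset = 0,
			.sms_init_prot   = VM_PROT_READ | VM_PROT_EXECUTE,
			.sms_max_prot    = VM_PROT_READ | VM_PROT_EXECUTE,
		},
		{
			.sms_address     = sr_base + text_size, /* LINKEDIT */
			.sms_size        = linkedit_size,
			.sms_file_offset = text_size,
			.sms_init_prot   = VM_PROT_READ,
			.sms_max_prot    = VM_PROT_READ,
		},
	};
	/* hypothetical userspace wrapper for the trap handled below */
	shared_region_map_and_slide_2_np(1, &file, 2, maps);
#endif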
1640 static int
1641 _shared_region_map_and_slide(
1642 	struct proc                         *p,
1643 	uint32_t                            files_count,
1644 	struct shared_file_np               *files,
1645 	uint32_t                            mappings_count,
1646 	struct shared_file_mapping_slide_np *mappings)
1647 {
1648 	int                             error = 0;
1649 	kern_return_t                   kr = KERN_SUCCESS;
1650 	struct _sr_file_mappings        *sr_file_mappings = NULL;
1651 	struct vnode                    *rdir_vp = NULL;
1652 	struct vm_shared_region         *shared_region = NULL;
1653 
1654 	/*
1655 	 * Get a reference to the current proc's root dir.
1656 	 * Need this to prevent racing with chroot.
1657 	 */
1658 	proc_fdlock(p);
1659 	rdir_vp = p->p_fd.fd_rdir;
1660 	if (rdir_vp == NULL) {
1661 		rdir_vp = rootvnode;
1662 	}
1663 	assert(rdir_vp != NULL);
1664 	vnode_get(rdir_vp);
1665 	proc_fdunlock(p);
1666 
1667 	/*
1668 	 * Turn files, mappings into sr_file_mappings and other setup.
1669 	 */
1670 	error = shared_region_map_and_slide_setup(p, files_count,
1671 	    files, mappings_count, mappings,
1672 	    &sr_file_mappings, &shared_region, rdir_vp);
1673 	if (error != 0) {
1674 		vnode_put(rdir_vp);
1675 		return error;
1676 	}
1677 
1678 	/* map the file(s) into that shared region's submap */
1679 	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
1680 	if (kr != KERN_SUCCESS) {
1681 		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
1682 		    "vm_shared_region_map_file() failed kr=0x%x\n",
1683 		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
1684 		    proc_getpid(p), p->p_comm, kr));
1685 	}
1686 
1687 	/* convert kern_return_t to errno */
1688 	switch (kr) {
1689 	case KERN_SUCCESS:
1690 		error = 0;
1691 		break;
1692 	case KERN_INVALID_ADDRESS:
1693 		error = EFAULT;
1694 		break;
1695 	case KERN_PROTECTION_FAILURE:
1696 		error = EPERM;
1697 		break;
1698 	case KERN_NO_SPACE:
1699 		error = ENOMEM;
1700 		break;
1701 	case KERN_FAILURE:
1702 	case KERN_INVALID_ARGUMENT:
1703 	default:
1704 		error = EINVAL;
1705 		break;
1706 	}
1707 
1708 	/*
1709 	 * Mark that this process is now using split libraries.
1710 	 */
1711 	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
1712 		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
1713 	}
1714 
1715 	vnode_put(rdir_vp);
1716 	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);
1717 
1718 	SHARED_REGION_TRACE_DEBUG(
1719 		("shared_region: %p [%d(%s)] <- map\n",
1720 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1721 		proc_getpid(p), p->p_comm));
1722 
1723 	return error;
1724 }
1725 
1726 /*
1727  * Clean up part of _shared_region_map_and_slide()
1728  * It had to be broken out of _shared_region_map_and_slide() to
1729  * prevent compiler inlining from blowing out the stack.
1730  */
1731 __attribute__((noinline))
1732 static void
1733 shared_region_map_and_slide_cleanup(
1734 	struct proc              *p,
1735 	uint32_t                 files_count,
1736 	struct _sr_file_mappings *sr_file_mappings,
1737 	struct vm_shared_region  *shared_region)
1738 {
1739 	struct _sr_file_mappings *srfmp;
1740 	struct vnode_attr        va;
1741 
1742 	if (sr_file_mappings != NULL) {
1743 		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1744 			if (srfmp->vp != NULL) {
1745 				vnode_lock_spin(srfmp->vp);
1746 				srfmp->vp->v_flag |= VSHARED_DYLD;
1747 				vnode_unlock(srfmp->vp);
1748 
1749 				/* update the vnode's access time */
1750 				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1751 					VATTR_INIT(&va);
1752 					nanotime(&va.va_access_time);
1753 					VATTR_SET_ACTIVE(&va, va_access_time);
1754 					vnode_setattr(srfmp->vp, &va, vfs_context_current());
1755 				}
1756 
1757 #if NAMEDSTREAMS
1758 				/*
1759 				 * If the shared cache is compressed, it may
1760 				 * have a namedstream vnode instantiated
1761 				 * for it. That namedstream vnode will also
1762 				 * have to be marked with VSHARED_DYLD.
1763 				 */
1764 				if (vnode_hasnamedstreams(srfmp->vp)) {
1765 					vnode_t svp;
1766 					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1767 					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
1768 						vnode_lock_spin(svp);
1769 						svp->v_flag |= VSHARED_DYLD;
1770 						vnode_unlock(svp);
1771 						vnode_put(svp);
1772 					}
1773 				}
1774 #endif /* NAMEDSTREAMS */
1775 				/*
1776 				 * release the vnode...
1777 				 * ubc_map() still holds it for us in the non-error case
1778 				 */
1779 				(void) vnode_put(srfmp->vp);
1780 				srfmp->vp = NULL;
1781 			}
1782 			if (srfmp->fp != NULL) {
1783 				/* release the file descriptor */
1784 				fp_drop(p, srfmp->fd, srfmp->fp, 0);
1785 				srfmp->fp = NULL;
1786 			}
1787 		}
1788 		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1789 	}
1790 
1791 	if (shared_region != NULL) {
1792 		vm_shared_region_deallocate(shared_region);
1793 	}
1794 }
1795 
1796 /*
1797  * For each file mapped, we may have mappings for:
1798  *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1799  * so let's round up to 8 mappings per file.
1800  */
1801 #define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to pass in */
1802 
1803 /*
1804  * This is the new interface for setting up shared region mappings.
1805  *
1806  * The slide used for shared regions setup using this interface is done differently
1807  * from the old interface. The slide value passed in the shared_files_np represents
1808  * a max value. The kernel will choose a random value based on that, then use it
1809  * for all shared regions.
1810  */
1811 #if defined (__x86_64__)
1812 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1813 #else
1814 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1815 #endif
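/*
 * Worked example of the slide computation in
 * shared_region_map_and_slide_2_np() below, assuming 16K pages
 * (SIXTEENK_PAGE_MASK == 0x3FFF, so SLIDE_AMOUNT_MASK == ~0x3FFF):
 * with max_slide == 0x1000000 and random_val == 0x12D4567,
 * random_val % max_slide == 0x2D4567, and
 * 0x2D4567 & ~0x3FFF == 0x2D4000. The chosen slide is therefore
 * 16K-aligned and strictly below the caller-supplied maximum.
 */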
1816 
1817 static inline __result_use_check kern_return_t
1818 shared_region_map_and_slide_2_np_sanitize(
1819 	struct proc                         *p,
1820 	user_addr_t                         mappings_userspace_addr,
1821 	unsigned int                        count,
1822 	shared_file_mapping_slide_np_t      *mappings)
1823 {
1824 	kern_return_t kr;
1825 	vm_map_t map = current_map();
1826 	mach_vm_address_t addr, end;
1827 	mach_vm_offset_t offset, offset_end;
1828 	mach_vm_size_t size, offset_size;
1829 	user_addr_t slide_start, slide_end, slide_size;
1830 	vm_prot_t cur;
1831 	vm_prot_t max;
1832 
1833 	user_addr_t user_addr = mappings_userspace_addr;
1834 
1835 	for (size_t i = 0; i < count; i++) {
1836 		shared_file_mapping_slide_np_ut mapping_u;
1837 		/*
1838 		 * First we bring each mapping struct into our kernel stack to
1839 		 * avoid TOCTOU.
1840 		 */
1841 		kr = shared_region_copyin(
1842 			p,
1843 			user_addr,
1844 			1, // copy 1 element at a time
1845 			sizeof(shared_file_mapping_slide_np_ut),
1846 			&mapping_u);
1847 		if (__improbable(kr != KERN_SUCCESS)) {
1848 			return kr;
1849 		}
1850 
1851 		/*
1852 		 * Then, we sanitize the data on the kernel stack.
1853 		 */
1854 		kr = vm_sanitize_addr_size(
1855 			mapping_u.sms_address_u,
1856 			mapping_u.sms_size_u,
1857 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1858 			map,
1859 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1860 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
1861 			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
1862 			&addr,
1863 			&end,
1864 			&size);
1865 		if (__improbable(kr != KERN_SUCCESS)) {
1866 			return kr;
1867 		}
1868 
1869 		kr = vm_sanitize_addr_size(
1870 			mapping_u.sms_file_offset_u,
1871 			mapping_u.sms_size_u,
1872 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1873 			PAGE_MASK,
1874 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1875 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1876 			&offset,
1877 			&offset_end,
1878 			&offset_size);
1879 		if (__improbable(kr != KERN_SUCCESS)) {
1880 			return kr;
1881 		}
1882 		if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
1883 			return KERN_INVALID_ARGUMENT;
1884 		}
1885 
1886 		/*
1887 		 * Unsafe access is immediately followed by wrap to
1888 		 * convert from addr to size.
1889 		 */
1890 		mach_vm_size_ut sms_slide_size_u =
1891 		    vm_sanitize_wrap_size(
1892 			VM_SANITIZE_UNSAFE_UNWRAP(
1893 				mapping_u.sms_slide_size_u));
1894 
1895 		kr = vm_sanitize_addr_size(
1896 			mapping_u.sms_slide_start_u,
1897 			sms_slide_size_u,
1898 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1899 			map,
1900 			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1901 			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1902 			&slide_start,
1903 			&slide_end,
1904 			&slide_size);
1905 		if (__improbable(kr != KERN_SUCCESS)) {
1906 			return kr;
1907 		}
1908 
1909 		kr = vm_sanitize_cur_and_max_prots(
1910 			mapping_u.sms_init_prot_u,
1911 			mapping_u.sms_max_prot_u,
1912 			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1913 			map,
1914 			VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
1915 			&cur,
1916 			&max);
1917 		if (__improbable(kr != KERN_SUCCESS)) {
1918 			return kr;
1919 		}
1920 
1921 		/*
1922 		 * Finally, we move the data from the kernel stack to our
1923 		 * caller-allocated kernel heap buffer.
1924 		 */
1925 		mappings[i].sms_address = addr;
1926 		mappings[i].sms_size = size;
1927 		mappings[i].sms_file_offset = offset;
1928 		mappings[i].sms_slide_size = slide_size;
1929 		mappings[i].sms_slide_start = slide_start;
1930 		mappings[i].sms_max_prot = max;
1931 		mappings[i].sms_init_prot = cur;
1932 
1933 		if (__improbable(os_add_overflow(
1934 			    user_addr,
1935 			    sizeof(shared_file_mapping_slide_np_ut),
1936 			    &user_addr))) {
1937 			return KERN_INVALID_ARGUMENT;
1938 		}
1939 	}
1940 
1941 	return KERN_SUCCESS;
1942 }
1943 
1944 int
1945 shared_region_map_and_slide_2_np(
1946 	struct proc                                  *p,
1947 	struct shared_region_map_and_slide_2_np_args *uap,
1948 	__unused int                                 *retvalp)
1949 {
1950 	unsigned int                  files_count;
1951 	struct shared_file_np         *shared_files = NULL;
1952 	unsigned int                  mappings_count;
1953 	struct shared_file_mapping_slide_np *mappings = NULL;
1954 	kern_return_t                 kr = KERN_SUCCESS;
1955 
1956 	files_count = uap->files_count;
1957 	mappings_count = uap->mappings_count;
1958 
1959 	SHARED_REGION_TRACE_DEBUG(
1960 		("shared_region: %p [%d(%s)] -> map_and_slide(0x%llx)\n",
1961 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
1962 		proc_getpid(p), p->p_comm,
1963 		(uint64_t)uap->mappings_u));
1964 
1965 	if (files_count == 0) {
1966 		SHARED_REGION_TRACE_INFO(
1967 			("shared_region: %p [%d(%s)] map(): "
1968 			"no files\n",
1969 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1970 			proc_getpid(p), p->p_comm));
1971 		kr = 0; /* no files to map: we're done! */
1972 		goto done;
1973 	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1974 		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1975 		if (shared_files == NULL) {
1976 			kr = KERN_RESOURCE_SHORTAGE;
1977 			goto done;
1978 		}
1979 	} else {
1980 		SHARED_REGION_TRACE_ERROR(
1981 			("shared_region: %p [%d(%s)] map(): "
1982 			"too many files (%d) max %d\n",
1983 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1984 			proc_getpid(p), p->p_comm,
1985 			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1986 		kr = KERN_FAILURE;
1987 		goto done;
1988 	}
1989 
1990 	if (mappings_count == 0) {
1991 		SHARED_REGION_TRACE_INFO(
1992 			("shared_region: %p [%d(%s)] map(): "
1993 			"no mappings\n",
1994 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
1995 			proc_getpid(p), p->p_comm));
1996 		kr = 0; /* no mappings: we're done! */
1997 		goto done;
1998 	} else if (mappings_count <= SFM_MAX) {
1999 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
2000 		if (mappings == NULL) {
2001 			kr = KERN_RESOURCE_SHORTAGE;
2002 			goto done;
2003 		}
2004 	} else {
2005 		SHARED_REGION_TRACE_ERROR(
2006 			("shared_region: %p [%d(%s)] map(): "
2007 			"too many mappings (%d) max %d\n",
2008 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2009 			proc_getpid(p), p->p_comm,
2010 			mappings_count, SFM_MAX));
2011 		kr = KERN_FAILURE;
2012 		goto done;
2013 	}
2014 
2015 	/*
2016 	 * struct shared_file_np does not have fields that are subject to
2017 	 * sanitization, so it is copied from userspace as is.
2018 	 */
2019 	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
2020 	if (kr != KERN_SUCCESS) {
2021 		SHARED_REGION_TRACE_ERROR(
2022 			("shared_region: %p [%d(%s)] copyin() returned 0x%x\n",
2023 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2024 			proc_getpid(p), p->p_comm, kr));
2025 		goto done;
2026 	}
2027 
2028 	kr = shared_region_map_and_slide_2_np_sanitize(
2029 		p,
2030 		uap->mappings_u,
2031 		mappings_count,
2032 		mappings);
2033 	if (__improbable(kr != KERN_SUCCESS)) {
2034 		SHARED_REGION_TRACE_ERROR(
2035 			("shared_region: %p [%d(%s)] sanitize() returned 0x%x\n",
2036 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2037 			proc_getpid(p), p->p_comm, kr));
2038 		kr = vm_sanitize_get_kr(kr);
2039 		goto done;
2040 	}
2041 
2042 	uint32_t max_slide = shared_files[0].sf_slide;
2043 	uint32_t random_val;
2044 	uint32_t slide_amount;
2045 
2046 	if (max_slide != 0) {
2047 		read_random(&random_val, sizeof random_val);
2048 		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
2049 	} else {
2050 		slide_amount = 0;
2051 	}
2052 #if DEVELOPMENT || DEBUG
2053 	extern bool bootarg_disable_aslr;
2054 	if (bootarg_disable_aslr) {
2055 		slide_amount = 0;
2056 	}
2057 #endif /* DEVELOPMENT || DEBUG */
2058 
2059 	/*
2060 	 * Fix up the mappings to reflect the desired slide.
2061 	 */
2062 	unsigned int f;
2063 	unsigned int m = 0;
2064 	unsigned int i;
2065 	for (f = 0; f < files_count; ++f) {
2066 		shared_files[f].sf_slide = slide_amount;
2067 		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
2068 			if (m >= mappings_count) {
2069 				SHARED_REGION_TRACE_ERROR(
2070 					("shared_region: %p [%d(%s)] map(): "
2071 					"mapping count argument was too small\n",
2072 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
2073 					proc_getpid(p), p->p_comm));
2074 				kr = KERN_FAILURE;
2075 				goto done;
2076 			}
2077 			if (__improbable(
2078 				    os_add_overflow(
2079 					    mappings[m].sms_address,
2080 					    slide_amount,
2081 					    &mappings[m].sms_address))) {
2082 				kr = KERN_INVALID_ARGUMENT;
2083 				goto done;
2084 			}
2085 			if (mappings[m].sms_slide_size != 0) {
2086 				mach_vm_address_t discard;
2087 				/* Slide and check that new start/size pairs do not overflow. */
2088 				if (__improbable(
2089 					    os_add_overflow(
2090 						    mappings[m].sms_slide_start,
2091 						    slide_amount,
2092 						    &mappings[m].sms_slide_start) ||
2093 					    os_add_overflow(
2094 						    mappings[m].sms_slide_start,
2095 						    mappings[m].sms_slide_size,
2096 						    &discard))) {
2097 					kr = KERN_INVALID_ARGUMENT;
2098 					goto done;
2099 				}
2100 			}
2101 		}
2102 	}
2103 
2104 	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
2105 done:
2106 	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
2107 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
2108 
2109 	SHARED_REGION_TRACE_DEBUG(
2110 		("shared_region: %p [%d(%s)] map_and_slide(0x%llx) <- 0x%x\n",
2111 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
2112 		proc_getpid(p), p->p_comm,
2113 		(uint64_t)uap->mappings_u, kr));
2114 
2115 	return kr;
2116 }
2117 
2118 
2119 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_total_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_total_success, "");
2120 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_total_fail, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_total_fail, "");
2121 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_overflow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_overflow, "");
2122 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_offset, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_offset, "");
2123 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_addr, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_addr, "");
2124 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_prot, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_prot, "");
2125 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_file, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_file, "");
2126 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_shadows, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_shadows, "");
2127 SYSCTL_QUAD(_vm, OID_AUTO, vmwls_bad_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_with_linking_stats.vmwls_bad_cow, "");
2128 
2129 /*
2130  * A syscall for dyld to use to map data pages that need load time relocation fixups.
2131  * The fixups are performed by a custom pager during page-in, so the pages still appear
2132  * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2133  * on demand later, all w/o using the compressor.
2134  *
2135  * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages while
2136  * running, they are COW'd as normal.
2137  */
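/*
 * Illustrative layout of the link_info blob validated below (editor's
 * annotation; the regions are located purely by the header offsets, so
 * the ordering shown is just one possibility):
 *
 *     +------------------------------+ offset 0
 *     | struct mwl_info_hdr          |  version, page size, pointer fmt
 *     +------------------------------+ mwli_binds_offset
 *     | bind targets                 |  mwli_binds_count entries, each
 *     |                              |  4 bytes (DYLD_CHAINED_PTR_32)
 *     |                              |  or 8 bytes otherwise
 *     +------------------------------+ mwli_chains_offset
 *     | struct                       |
 *     | dyld_chained_starts_in_image |  mwli_chains_size bytes of
 *     | + per-segment starts info    |  fixup chain metadata
 *     +------------------------------+ link_info_size
 */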
2138 int
2139 map_with_linking_np(
2140 	struct proc                     *p,
2141 	struct map_with_linking_np_args *uap,
2142 	__unused int                    *retvalp)
2143 {
2144 	uint32_t                        region_count;
2145 	uint32_t                        r;
2146 	struct mwl_region               *regions = NULL;
2147 	struct mwl_region               *rp;
2148 	uint32_t                        link_info_size;
2149 	void                            *link_info = NULL;      /* starts with a struct mwl_info_hdr */
2150 	struct mwl_info_hdr             *info_hdr = NULL;
2151 	uint64_t                        binds_size;
2152 	int                             fd;
2153 	struct fileproc                 *fp = NULL;
2154 	struct vnode                    *vp = NULL;
2155 	size_t                          file_size;
2156 	off_t                           fs;
2157 	struct vnode_attr               va;
2158 	memory_object_control_t         file_control = NULL;
2159 	int                             error;
2160 	kern_return_t                   kr = KERN_SUCCESS;
2161 
2162 	/*
2163 	 * Check if dyld has told us it finished with this call.
2164 	 */
2165 	if (p->p_disallow_map_with_linking) {
2166 		printf("%s: [%d(%s)]: map_with_linking() was disabled\n",
2167 		    __func__, proc_getpid(p), p->p_comm);
2168 		kr = KERN_FAILURE;
2169 		goto done;
2170 	}
2171 
2172 	/*
2173 	 * First we do some sanity checking on what dyld has passed us.
2174 	 */
2175 	region_count = uap->region_count;
2176 	link_info_size = uap->link_info_size;
2177 	if (region_count == 0) {
2178 		printf("%s: [%d(%s)]: region_count == 0\n",
2179 		    __func__, proc_getpid(p), p->p_comm);
2180 		kr = KERN_FAILURE;
2181 		goto done;
2182 	}
2183 	if (region_count > MWL_MAX_REGION_COUNT) {
2184 		printf("%s: [%d(%s)]: region_count too big %d\n",
2185 		    __func__, proc_getpid(p), p->p_comm, region_count);
2186 		kr = KERN_FAILURE;
2187 		goto done;
2188 	}
2189 
2190 	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2191 		printf("%s: [%d(%s)]: link_info_size too small\n",
2192 		    __func__, proc_getpid(p), p->p_comm);
2193 		kr = KERN_FAILURE;
2194 		goto done;
2195 	}
2196 	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2197 		printf("%s: [%d(%s)]: link_info_size too big %d\n",
2198 		    __func__, proc_getpid(p), p->p_comm, link_info_size);
2199 		kr = KERN_FAILURE;
2200 		goto done;
2201 	}
2202 
2203 	/*
2204 	 * Allocate and copyin the regions and link info
2205 	 */
2206 	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2207 	if (regions == NULL) {
2208 		printf("%s: [%d(%s)]: failed to allocate regions\n",
2209 		    __func__, proc_getpid(p), p->p_comm);
2210 		kr = KERN_RESOURCE_SHORTAGE;
2211 		goto done;
2212 	}
2213 	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2214 	if (kr != KERN_SUCCESS) {
2215 		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2216 		    __func__, proc_getpid(p), p->p_comm, kr);
2217 		goto done;
2218 	}
2219 
2220 	link_info = kalloc_data(link_info_size, Z_WAITOK);
2221 	if (link_info == NULL) {
2222 		printf("%s: [%d(%s)]: failed to allocate link_info\n",
2223 		    __func__, proc_getpid(p), p->p_comm);
2224 		kr = KERN_RESOURCE_SHORTAGE;
2225 		goto done;
2226 	}
2227 	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2228 	if (kr != KERN_SUCCESS) {
2229 		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2230 		    __func__, proc_getpid(p), p->p_comm, kr);
2231 		goto done;
2232 	}
2233 
2234 	/*
2235 	 * Do some verification of the data structures.
2236 	 */
2237 	info_hdr = (struct mwl_info_hdr *)link_info;
2238 	if (info_hdr->mwli_version != MWL_INFO_VERS) {
2239 		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2240 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2241 		kr = KERN_FAILURE;
2242 		goto done;
2243 	}
2244 
2245 	if (info_hdr->mwli_binds_offset > link_info_size) {
2246 		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2247 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2248 		kr = KERN_FAILURE;
2249 		goto done;
2250 	}
2251 
2252 	/* some older devices have s/w page size > h/w page size; no need to support them */
2253 	if (info_hdr->mwli_page_size != PAGE_SIZE) {
2254 		/* no printf, since this is expected on some devices */
2255 		kr = KERN_INVALID_ARGUMENT;
2256 		goto done;
2257 	}
2258 
2259 	binds_size = (uint64_t)info_hdr->mwli_binds_count *
2260 	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2261 	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2262 		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2263 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2264 		kr = KERN_FAILURE;
2265 		goto done;
2266 	}
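	/*
	 * Example of the bound check above: with DYLD_CHAINED_PTR_32 each
	 * bind target is 4 bytes, so mwli_binds_count == 1000 gives
	 * binds_size == 4000, which must fit between mwli_binds_offset and
	 * link_info_size. The subtraction cannot underflow because
	 * mwli_binds_offset was already checked against link_info_size.
	 */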
2267 
2268 	if (info_hdr->mwli_chains_offset > link_info_size) {
2269 		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2270 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2271 		kr = KERN_FAILURE;
2272 		goto done;
2273 	}
2274 
2275 
2276 	/*
2277 	 * Ensure the chained starts info fits in the link info and make
2278 	 * sure the segment info offsets are within bounds.
2279 	 */
2280 	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2281 		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2282 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2283 		kr = KERN_FAILURE;
2284 		goto done;
2285 	}
2286 	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2287 		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2288 		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2289 		kr = KERN_FAILURE;
2290 		goto done;
2291 	}
2292 
2293 	/* Note that more verification of offsets is done in the pager itself */
2294 
2295 	/*
2296 	 * Ensure we've only been given one FD and verify valid protections.
2297 	 */
2298 	fd = regions[0].mwlr_fd;
2299 	for (r = 0; r < region_count; ++r) {
2300 		if (regions[r].mwlr_fd != fd) {
2301 			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2302 			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2303 			kr = KERN_FAILURE;
2304 			goto done;
2305 		}
2306 
2307 		/*
2308 		 * Only allow data mappings and not zero fill. Permit TPRO
2309 		 * mappings only when VM_PROT_READ | VM_PROT_WRITE.
2310 		 */
2311 		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2312 			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2313 			    __func__, proc_getpid(p), p->p_comm);
2314 			kr = KERN_FAILURE;
2315 			goto done;
2316 		}
2317 		if (regions[r].mwlr_protections & VM_PROT_ZF) {
2318 			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2319 			    __func__, proc_getpid(p), p->p_comm, r);
2320 			kr = KERN_FAILURE;
2321 			goto done;
2322 		}
2323 		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2324 		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2325 			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2326 			    __func__, proc_getpid(p), p->p_comm, r);
2327 			kr = KERN_FAILURE;
2328 			goto done;
2329 		}
2330 	}
2331 
2332 
2333 	/* get file structure from file descriptor */
2334 	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2335 	if (error) {
2336 		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2337 		    __func__, proc_getpid(p), p->p_comm, error);
2338 		kr = KERN_FAILURE;
2339 		goto done;
2340 	}
2341 
2342 	/* We need at least read permission on the file */
2343 	if (!(fp->fp_glob->fg_flag & FREAD)) {
2344 		printf("%s: [%d(%s)]: not readable\n",
2345 		    __func__, proc_getpid(p), p->p_comm);
2346 		kr = KERN_FAILURE;
2347 		goto done;
2348 	}
2349 
2350 	/* Get the vnode from file structure */
2351 	vp = (struct vnode *)fp_get_data(fp);
2352 	error = vnode_getwithref(vp);
2353 	if (error) {
2354 		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2355 		    __func__, proc_getpid(p), p->p_comm, error);
2356 		kr = KERN_FAILURE;
2357 		vp = NULL; /* just to be sure */
2358 		goto done;
2359 	}
2360 
2361 	/* Make sure the vnode is a regular file */
2362 	if (vp->v_type != VREG) {
2363 		printf("%s: [%d(%s)]: vnode not VREG\n",
2364 		    __func__, proc_getpid(p), p->p_comm);
2365 		kr = KERN_FAILURE;
2366 		goto done;
2367 	}
2368 
2369 	/* get vnode size */
2370 	error = vnode_size(vp, &fs, vfs_context_current());
2371 	if (error) {
2372 		goto done;
2373 	}
2374 	file_size = fs;
2375 
2376 	/* get the file's memory object handle */
2377 	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2378 	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2379 		printf("%s: [%d(%s)]: no memory object\n",
2380 		    __func__, proc_getpid(p), p->p_comm);
2381 		kr = KERN_FAILURE;
2382 		goto done;
2383 	}
2384 
2385 	for (r = 0; r < region_count; ++r) {
2386 		rp = &regions[r];
2387 
2388 #if CONFIG_MACF
2389 		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2390 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2391 		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2392 		if (error) {
2393 			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2394 			    __func__, proc_getpid(p), p->p_comm, r, error);
2395 			kr = KERN_FAILURE;
2396 			goto done;
2397 		}
2398 #endif /* CONFIG_MACF */
2399 
2400 		/* check that the mappings are properly covered by code signatures */
2401 		if (cs_system_enforcement()) {
2402 			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2403 				printf("%s: [%d(%s)]: region %d, not code signed\n",
2404 				    __func__, proc_getpid(p), p->p_comm, r);
2405 				kr = KERN_FAILURE;
2406 				goto done;
2407 			}
2408 		}
2409 	}
2410 
2411 	/* update the vnode's access time */
2412 	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2413 		VATTR_INIT(&va);
2414 		nanotime(&va.va_access_time);
2415 		VATTR_SET_ACTIVE(&va, va_access_time);
2416 		vnode_setattr(vp, &va, vfs_context_current());
2417 	}
2418 
2419 	/* get the VM to do the work */
2420 	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2421 
2422 done:
2423 	if (fp != NULL) {
2424 		/* release the file descriptor */
2425 		fp_drop(p, fd, fp, 0);
2426 	}
2427 	if (vp != NULL) {
2428 		(void)vnode_put(vp);
2429 	}
2430 	if (regions != NULL) {
2431 		kfree_data(regions, region_count * sizeof(regions[0]));
2432 	}
2433 	/* if things worked, link_info was consumed by the pager and set to NULL */
2434 	if (link_info != NULL) {
2435 		kfree_data(link_info, link_info_size);
2436 	}
2437 
2438 	switch (kr) {
2439 	case KERN_SUCCESS:
2440 		return 0;
2441 	case KERN_RESOURCE_SHORTAGE:
2442 		return ENOMEM;
2443 	default:
2444 		return EINVAL;
2445 	}
2446 }
2447 
2448 #if DEBUG || DEVELOPMENT
2449 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2450     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2451 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2452     CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2453 #endif /* DEBUG || DEVELOPMENT */
2454 
2455 /* sysctl overflow room */
2456 
2457 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2458     (int *) &page_size, 0, "vm page size");
2459 
2460 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2461  *       allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2462  *       reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2463 extern unsigned int     vm_page_free_target;
2464 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2465     &vm_page_free_target, 0, "Pageout daemon free target");
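/*
 * Illustrative userspace sketch (editor's annotation, not kernel code):
 * combining vm.vm_page_free_target with vm.page_free_count (declared
 * later in this file) to estimate how many pages are free beyond the
 * pageout daemon's target. Error handling is omitted for brevity.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned int free_target = 0, free_count = 0;
	size_t len = sizeof(free_target);

	sysctlbyname("vm.vm_page_free_target", &free_target, &len, NULL, 0);
	len = sizeof(free_count);
	sysctlbyname("vm.page_free_count", &free_count, &len, NULL, 0);
	if (free_count > free_target) {
		printf("pages free beyond target: %u\n", free_count - free_target);
	}
	return 0;
}
#endif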
2466 
2467 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2468     &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2469 
2470 static int
2471 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2472 {
2473 #pragma unused(oidp, arg1, arg2)
2474 	unsigned int page_free_wanted;
2475 
2476 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2477 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2478 }
2479 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2480     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2481     0, 0, vm_ctl_page_free_wanted, "I", "");
2482 
2483 extern unsigned int     vm_page_purgeable_count;
2484 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2485     &vm_page_purgeable_count, 0, "Purgeable page count");
2486 
2487 extern unsigned int     vm_page_purgeable_wired_count;
2488 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2489     &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2490 
2491 extern unsigned int vm_page_kern_lpage_count;
2492 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2493     &vm_page_kern_lpage_count, 0, "kernel used large pages");
2494 
2495 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2496 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2497 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_kern);
2498 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_kern, vm_page_grab_count_kern, "Total pages grabbed (kernel)");
2499 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_iopl);
2500 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_iopl, vm_page_grab_count_iopl, "Total pages grabbed (iopl)");
2501 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_upl);
2502 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_upl, vm_page_grab_count_upl, "Total pages grabbed (upl)");
2503 
2504 
2505 #if DEVELOPMENT || DEBUG
2506 SCALABLE_COUNTER_DECLARE(vm_page_deactivate_behind_count);
2507 SYSCTL_SCALABLE_COUNTER(_vm, pages_deactivated_behind, vm_page_deactivate_behind_count,
2508     "Number of pages deactivated behind");
2509 #endif
2510 
2511 #if DEVELOPMENT || DEBUG
2512 #if __ARM_MIXED_PAGE_SIZE__
2513 static int vm_mixed_pagesize_supported = 1;
2514 #else
2515 static int vm_mixed_pagesize_supported = 0;
2516 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2517 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2518     &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2519 
2520 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2521     &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2522 
2523 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2524     &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2525 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2526     &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2527 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2528     &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2529 
2530 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2531     &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2532 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2533     &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2534 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2535     &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated) */
2536 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2537     &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2538 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2539     &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2540 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2541     &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");
2542 #endif /* DEVELOPMENT || DEBUG */
2543 
2544 extern int madvise_free_debug;
2545 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2546     &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2547 extern int madvise_free_debug_sometimes;
2548 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2549     &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
2550 
2551 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2552     &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2553 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2554     &vm_page_stats_reusable.reusable_pages_success, "");
2555 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2556     &vm_page_stats_reusable.reusable_pages_failure, "");
2557 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2558     &vm_page_stats_reusable.reusable_pages_shared, "");
2559 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2560     &vm_page_stats_reusable.all_reusable_calls, "");
2561 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2562     &vm_page_stats_reusable.partial_reusable_calls, "");
2563 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2564     &vm_page_stats_reusable.reuse_pages_success, "");
2565 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2566     &vm_page_stats_reusable.reuse_pages_failure, "");
2567 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2568     &vm_page_stats_reusable.all_reuse_calls, "");
2569 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2570     &vm_page_stats_reusable.partial_reuse_calls, "");
2571 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2572     &vm_page_stats_reusable.can_reuse_success, "");
2573 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2574     &vm_page_stats_reusable.can_reuse_failure, "");
2575 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2576     &vm_page_stats_reusable.reusable_reclaimed, "");
2577 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2578     &vm_page_stats_reusable.reusable_nonwritable, "");
2579 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2580     &vm_page_stats_reusable.reusable_shared, "");
2581 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2582     &vm_page_stats_reusable.free_shared, "");
2583 
2584 
2585 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2586 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2587 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2588 
2589 extern unsigned int vm_page_cleaned_count;
2590 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2591 
2592 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2593 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2594 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2595 
2596 /* pageout counts */
2597 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2598 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2599 
2600 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2601 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2602 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2603 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2604 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2605 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2606 
2607 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2608 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2609 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2610 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2611 extern unsigned int vm_page_realtime_count;
2612 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2613 extern int vm_pageout_protect_realtime;
2614 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2615 
2616 /* counts of pages prefaulted when entering a memory object */
2617 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2618 extern int64_t vm_prefault_nb_no_page, vm_prefault_nb_wrong_page;
2619 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2620 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2621 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_no_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_no_page, "");
2622 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_wrong_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_wrong_page, "");
2623 
2624 #if defined (__x86_64__)
2625 extern unsigned int vm_clump_promote_threshold;
2626 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2627 #if DEVELOPMENT || DEBUG
2628 extern unsigned long vm_clump_stats[];
2629 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2630 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2631 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2632 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2633 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2634 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2635 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2636 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2637 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2638 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2639 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2640 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2641 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2642 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2643 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2644 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2645 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2646 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2647 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2648 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2649 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2650 #endif  /* if DEVELOPMENT || DEBUG */
2651 #endif  /* #if defined (__x86_64__) */
2652 
2653 #if CONFIG_SECLUDED_MEMORY
2654 
2655 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2656 extern unsigned int vm_page_secluded_target;
2657 extern unsigned int vm_page_secluded_count;
2658 extern unsigned int vm_page_secluded_count_free;
2659 extern unsigned int vm_page_secluded_count_inuse;
2660 extern unsigned int vm_page_secluded_count_over_target;
2661 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2662 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2663 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2664 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2665 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2666 
2667 extern struct vm_page_secluded_data vm_page_secluded;
2668 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2669 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2670 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2671 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2672 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2673 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2674 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2675 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2676 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2677 
2678 #endif /* CONFIG_SECLUDED_MEMORY */
2679 
2680 #if CONFIG_DEFERRED_RECLAIM
2681 #pragma mark Deferred Reclaim
2682 SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
2683 #if DEVELOPMENT || DEBUG
2684 /*
2685  * VM reclaim testing
2686  */
2687 extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);
2688 
2689 static int
2690 sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
2691 {
2692 	int error = EINVAL, pid = 0;
2693 	/*
2694 	 * Only act on write
2695 	 */
2696 	error = sysctl_handle_int(oidp, &pid, 0, req);
2697 	if (error || !req->newptr) {
2698 		return error;
2699 	}
2700 	if (pid <= 0) {
2701 		return EINVAL;
2702 	}
2703 	proc_t p = proc_find(pid);
2704 	if (p == PROC_NULL) {
2705 		return ESRCH;
2706 	}
2707 	task_t t = proc_task(p);
2708 	if (t == TASK_NULL) {
2709 		proc_rele(p);
2710 		return ESRCH;
2711 	}
2712 	task_reference(t);
2713 	proc_rele(p);
2714 
2715 	bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
2716 	if (success) {
2717 		error = 0;
2718 	}
2719 	task_deallocate(t);
2720 
2721 	return error;
2722 }
2723 
2724 SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
2725     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2726     &sysctl_vm_reclaim_wait_for_pid, "I",
2727     "Block until the given pid has been drained by kernel GC");
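/*
 * Usage sketch (editor's annotation): this OID only exists on
 * DEVELOPMENT/DEBUG kernels and is write-only and masked
 * (CTLFLAG_WR | CTLFLAG_MASKED), so it is exercised with an explicit
 * write, e.g.:
 *
 *     sysctl vm.reclaim.wait_for_pid=1234
 *
 * which blocks the caller until pid 1234's deferred-reclaim buffer has
 * been reclaimed by the kernel GC thread.
 */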
2728 
2729 static int
2730 sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
2731 {
2732 	int error = EINVAL;
2733 	kern_return_t kr;
2734 	pid_t pid = 0;
2735 	error = sysctl_handle_int(oidp, &pid, 0, req);
2736 	/* Only reclaim on write */
2737 	if (error || !req->newptr) {
2738 		return error;
2739 	}
2740 	if (pid <= 0) {
2741 		return EINVAL;
2742 	}
2743 	proc_t p = proc_find(pid);
2744 	if (p == PROC_NULL) {
2745 		return ESRCH;
2746 	}
2747 	task_t t = proc_task(p);
2748 	if (t == TASK_NULL) {
2749 		proc_rele(p);
2750 		return ESRCH;
2751 	}
2752 	task_reference(t);
2753 	proc_rele(p);
2754 	kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
2755 	task_deallocate(t);
2756 	return mach_to_bsd_errno(kr);
2757 }
2758 
2759 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
2760     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2761     &sysctl_vm_reclaim_drain_pid, "I",
2762     "Drain the deferred reclamation buffer for a pid");
2763 
2764 static int
2765 proc_filter_reclaimable(proc_t p, __unused void *arg)
2766 {
2767 	task_t task = proc_task(p);
2768 	return vm_deferred_reclamation_task_has_ring(task);
2769 }
2770 
2771 static int
2772 proc_reclaim_drain(proc_t p, __unused void *arg)
2773 {
2774 	kern_return_t kr;
2775 	task_t task = proc_task(p);
2776 	kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
2777 	return mach_to_bsd_errno(kr);
2778 }
2779 
2780 static int
2781 sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
2782 {
2783 	int error;
2784 	int val;
2785 	if (!req->newptr) {
2786 		return EINVAL;
2787 	}
2788 	error = sysctl_handle_int(oidp, &val, 0, req);
2789 	if (error || val == FALSE) {
2790 		return error;
2791 	}
2792 	proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
2793 	    proc_filter_reclaimable, NULL);
2794 	return 0;
2795 }
2796 
2797 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
2798     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2799     &sysctl_vm_reclaim_drain_all, "I",
2800     "Fully reclaim from every deferred reclamation buffer on the system");
2801 
2802 extern uint32_t vm_reclaim_buffer_count;
2803 extern uint64_t vm_reclaim_gc_epoch;
2804 extern uint64_t vm_reclaim_gc_reclaim_count;
2805 extern uint64_t vm_reclaim_sampling_period_abs;
2806 extern uint64_t vm_reclaim_sampling_period_ns;
2807 extern bool vm_reclaim_debug;
2808 extern bool vm_reclaim_enabled;
2809 extern uint32_t vm_reclaim_autotrim_pct_normal;
2810 extern uint32_t vm_reclaim_autotrim_pct_pressure;
2811 extern uint32_t vm_reclaim_autotrim_pct_critical;
2812 extern uint32_t vm_reclaim_wma_weight_base;
2813 extern uint32_t vm_reclaim_wma_weight_cur;
2814 extern uint32_t vm_reclaim_wma_denom;
2815 extern uint64_t vm_reclaim_abandonment_threshold;
2816 
2817 SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
2818     CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
2819     "The number of deferred memory buffers currently alive");
2820 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
2821     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
2822     "Number of times the global GC thread has run");
2823 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
2824     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
2825     "Number of times the global GC thread has reclaimed from a buffer");
2826 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
2827     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
2828     "Debug logs for vm.reclaim");
2829 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
2830     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
2831     "Whether deferred memory reclamation is enabled on this system");
2832 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
2833     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
2834     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2835     "to engage auto-trim when the system is operating normally");
2836 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
2837     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
2838     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2839     "to engage auto-trim when the system is under memory pressure");
2840 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
2841     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
2842     "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2843     "to engage auto-trim when the system is under critical memory pressure");
2844 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
2845     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
2846     "Weight applied to historical minimum buffer size samples");
2847 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
2848     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
2849     "Weight applied to current sampled minimum buffer size");
2850 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
2851     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
2852     "Denominator for weighted moving average calculation");
2853 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
2854     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
2855     "The number of sampling periods between accounting updates that may elapse "
2856     "before the buffer is considered \"abandoned\"");
2857 
2858 static int
2859 sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
2860 {
2861 	uint64_t new_val_ns;
2862 	uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
2863 	int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
2864 	    sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
2865 	if (err || !req->newptr) {
2866 		return err;
2867 	}
2868 	if (new_val_ns != old_val_ns) {
2869 		vm_reclaim_sampling_period_ns = new_val_ns;
2870 		nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
2871 	}
2872 	return 0;
2873 }
2874 
2875 SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
2876     CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "QU",
2877     "Interval (nanoseconds) at which to sample the minimum buffer size and "
2878     "consider trimming excess");
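/*
 * Usage sketch (editor's annotation): writing a new period, e.g.
 *
 *     sysctl vm.reclaim.sampling_period_ns=1000000000
 *
 * takes effect through the handler above, which also refreshes the
 * cached mach-absolute-time equivalent via nanoseconds_to_absolutetime().
 */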
2879 #endif /* DEVELOPMENT || DEBUG */
2880 #endif /* CONFIG_DEFERRED_RECLAIM */
2881 
2882 #include <kern/thread.h>
2883 #include <sys/user.h>
2884 
2885 void vm_pageout_io_throttle(void);
2886 
2887 void
2888 vm_pageout_io_throttle(void)
2889 {
2890 	struct uthread *uthread = current_uthread();
2891 
2892 	/*
2893 	 * If the thread is marked as a low priority I/O type
2894 	 * and the I/O we issued while in this cleaning operation
2895 	 * collided with normal I/O operations, we'll
2896 	 * delay in order to mitigate the impact of this
2897 	 * task on the normal operation of the system.
2898 	 */
2899 
2900 	if (uthread->uu_lowpri_window) {
2901 		throttle_lowpri_io(1);
2902 	}
2903 }
2904 
2905 int
2906 vm_pressure_monitor(
2907 	__unused struct proc *p,
2908 	struct vm_pressure_monitor_args *uap,
2909 	int *retval)
2910 {
2911 	kern_return_t   kr;
2912 	uint32_t        pages_reclaimed;
2913 	uint32_t        pages_wanted;
2914 
2915 	kr = mach_vm_pressure_monitor(
2916 		(boolean_t) uap->wait_for_pressure,
2917 		uap->nsecs_monitored,
2918 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2919 		&pages_wanted);
2920 
2921 	switch (kr) {
2922 	case KERN_SUCCESS:
2923 		break;
2924 	case KERN_ABORTED:
2925 		return EINTR;
2926 	default:
2927 		return EINVAL;
2928 	}
2929 
2930 	if (uap->pages_reclaimed) {
2931 		if (copyout((void *)&pages_reclaimed,
2932 		    uap->pages_reclaimed,
2933 		    sizeof(pages_reclaimed)) != 0) {
2934 			return EFAULT;
2935 		}
2936 	}
2937 
2938 	*retval = (int) pages_wanted;
2939 	return 0;
2940 }
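
/*
 * A sketch of the matching userspace call; the exact libsyscall stub
 * prototype is an assumption here, but the error mapping follows the
 * switch above (KERN_ABORTED -> EINTR, other failures -> EINVAL) and
 * the return value is the pages_wanted stored through *retval:
 *
 *   extern int vm_pressure_monitor(int wait_for_pressure,
 *       int nsecs_monitored, uint32_t *pages_reclaimed);
 *
 *   uint32_t reclaimed = 0;
 *   int wanted = vm_pressure_monitor(1, 0, &reclaimed);  // blocks
 *   if (wanted < 0 && errno == EINTR) {
 *       // monitoring was interrupted; retry if desired
 *   }
 */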
2941 
2942 int
2943 kas_info(struct proc *p,
2944     struct kas_info_args *uap,
2945     int *retval __unused)
2946 {
2947 #ifndef CONFIG_KAS_INFO
2948 	(void)p;
2949 	(void)uap;
2950 	return ENOTSUP;
2951 #else /* CONFIG_KAS_INFO */
2952 	int                     selector = uap->selector;
2953 	user_addr_t     valuep = uap->value;
2954 	user_addr_t     sizep = uap->size;
2955 	user_size_t size, rsize;
2956 	int                     error;
2957 
2958 	if (!kauth_cred_issuser(kauth_cred_get())) {
2959 		return EPERM;
2960 	}
2961 
2962 #if CONFIG_MACF
2963 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
2964 	if (error) {
2965 		return error;
2966 	}
2967 #endif
2968 
2969 	if (IS_64BIT_PROCESS(p)) {
2970 		user64_size_t size64;
2971 		error = copyin(sizep, &size64, sizeof(size64));
2972 		size = (user_size_t)size64;
2973 	} else {
2974 		user32_size_t size32;
2975 		error = copyin(sizep, &size32, sizeof(size32));
2976 		size = (user_size_t)size32;
2977 	}
2978 	if (error) {
2979 		return error;
2980 	}
2981 
2982 	switch (selector) {
2983 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2984 	{
2985 		uint64_t slide = vm_kernel_slide;
2986 
2987 		if (sizeof(slide) != size) {
2988 			return EINVAL;
2989 		}
2990 
2991 		error = copyout(&slide, valuep, sizeof(slide));
2992 		if (error) {
2993 			return error;
2994 		}
2995 		rsize = size;
2996 	}
2997 	break;
2998 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2999 	{
3000 		uint32_t i;
3001 		kernel_mach_header_t *mh = &_mh_execute_header;
3002 		struct load_command *cmd;
3003 		cmd = (struct load_command*) &mh[1];
3004 		uint64_t *bases;
3005 		rsize = mh->ncmds * sizeof(uint64_t);
3006 
3007 		/*
3008 		 * Return the size if no data was passed
3009 		 */
3010 		if (valuep == 0) {
3011 			break;
3012 		}
3013 
3014 		if (rsize > size) {
3015 			return EINVAL;
3016 		}
3017 
3018 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
3019 
3020 		for (i = 0; i < mh->ncmds; i++) {
3021 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
3022 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
3023 				bases[i] = (uint64_t)sg->vmaddr;
3024 			}
3025 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
3026 		}
3027 
3028 		error = copyout(bases, valuep, rsize);
3029 
3030 		kfree_data(bases, rsize);
3031 
3032 		if (error) {
3033 			return error;
3034 		}
3035 	}
3036 	break;
3037 	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
3038 	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
3039 	{
3040 #if CONFIG_SPTM
3041 		const uint64_t slide =
3042 		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
3043 #else
3044 		const uint64_t slide = 0;
3045 #endif
3046 
3047 		if (sizeof(slide) != size) {
3048 			return EINVAL;
3049 		}
3050 
3051 		error = copyout(&slide, valuep, sizeof(slide));
3052 		if (error) {
3053 			return error;
3054 		}
3055 		rsize = size;
3056 	}
3057 	break;
3058 	default:
3059 		return EINVAL;
3060 	}
3061 
3062 	if (IS_64BIT_PROCESS(p)) {
3063 		user64_size_t size64 = (user64_size_t)rsize;
3064 		error = copyout(&size64, sizep, sizeof(size64));
3065 	} else {
3066 		user32_size_t size32 = (user32_size_t)rsize;
3067 		error = copyout(&size32, sizep, sizeof(size32));
3068 	}
3069 
3070 	return error;
3071 #endif /* CONFIG_KAS_INFO */
3072 }
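
/*
 * A userspace sketch of querying the kernel text slide through this
 * syscall (root only, per the kauth_cred_issuser() check above); the
 * prototype shown follows <sys/kas_info.h>:
 *
 *   uint64_t slide = 0;
 *   size_t size = sizeof(slide);
 *   if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR,
 *           &slide, &size) == 0) {
 *       printf("kernel text slide: 0x%llx\n", slide);
 *   }
 *   // EPERM for non-root callers; EINVAL if size != sizeof(uint64_t)
 */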
3073 
3074 #pragma clang diagnostic push
3075 #pragma clang diagnostic ignored "-Wcast-qual"
3076 #pragma clang diagnostic ignored "-Wunused-function"
3077 
3078 static void
3079 asserts()
3080 {
3081 	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
3082 	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
3083 }
3084 
3085 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
3086 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
3087 #pragma clang diagnostic pop
3088 
3089 extern uint32_t vm_page_pages;
3090 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
3091 
3092 extern uint32_t vm_page_busy_absent_skipped;
3093 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
3094 
3095 extern uint32_t vm_page_upl_tainted;
3096 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
3097 
3098 extern uint32_t vm_page_iopl_tainted;
3099 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
3100 
3101 #if __arm64__ && (DEVELOPMENT || DEBUG)
3102 extern int vm_footprint_suspend_allowed;
3103 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
3104 
3105 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
3106 static int
3107 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
3108 {
3109 #pragma unused(oidp, arg1, arg2)
3110 	int error = 0;
3111 	int new_value;
3112 
3113 	if (req->newptr == USER_ADDR_NULL) {
3114 		return 0;
3115 	}
3116 	error = SYSCTL_IN(req, &new_value, sizeof(int));
3117 	if (error) {
3118 		return error;
3119 	}
3120 	if (!vm_footprint_suspend_allowed) {
3121 		if (new_value != 0) {
3122 			/* suspends are not allowed... */
3123 			return 0;
3124 		}
3125 		/* ... but let resumes proceed */
3126 	}
3127 	DTRACE_VM2(footprint_suspend,
3128 	    vm_map_t, current_map(),
3129 	    int, new_value);
3130 
3131 	pmap_footprint_suspend(current_map(), new_value);
3132 
3133 	return 0;
3134 }
3135 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
3136     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3137     0, 0, &sysctl_vm_footprint_suspend, "I", "");
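
/*
 * Sketch of driving this write-only sysctl from a footprint test
 * (arm64 DEVELOPMENT/DEBUG kernels only; resumes always proceed,
 * suspends only when vm.footprint_suspend_allowed is set):
 *
 *   int one = 1, zero = 0;
 *   sysctlbyname("vm.footprint_suspend", NULL, NULL, &one, sizeof(one));
 *   // ... run the workload whose footprint should not be charged ...
 *   sysctlbyname("vm.footprint_suspend", NULL, NULL, &zero, sizeof(zero));
 */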
3138 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
3139 
3140 extern uint64_t vm_map_corpse_footprint_count;
3141 extern uint64_t vm_map_corpse_footprint_size_avg;
3142 extern uint64_t vm_map_corpse_footprint_size_max;
3143 extern uint64_t vm_map_corpse_footprint_full;
3144 extern uint64_t vm_map_corpse_footprint_no_buf;
3145 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
3146     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
3147 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
3148     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
3149 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
3150     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
3151 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
3152     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
3153 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
3154     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
3155 
3156 #if CODE_SIGNING_MONITOR
3157 extern uint64_t vm_cs_defer_to_csm;
3158 extern uint64_t vm_cs_defer_to_csm_not;
3159 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
3160     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
3161 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
3162     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
3163 #endif /* CODE_SIGNING_MONITOR */
3164 
3165 extern uint64_t shared_region_pager_copied;
3166 extern uint64_t shared_region_pager_slid;
3167 extern uint64_t shared_region_pager_slid_error;
3168 extern uint64_t shared_region_pager_reclaimed;
3169 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
3170     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
3171 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
3172     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
3173 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
3174     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
3175 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
3176     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
3177 extern int shared_region_destroy_delay;
3178 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
3179     CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
3180 
3181 #if MACH_ASSERT
3182 extern int pmap_ledgers_panic_leeway;
3183 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
3184 #endif /* MACH_ASSERT */
3185 
3186 
3187 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
3188 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
3189 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
3190 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
3191 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
3192 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
3193 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
3194 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
3195 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
3196 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
3197 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
3198 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
3199 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
3200 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
3201     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
3202 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
3203     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
3204 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
3205     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
3206 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
3207     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
3208 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
3209     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
3210 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
3211     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
3212 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
3213     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
3214 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
3215     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
3216 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
3217     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
3218 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
3219     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
3220 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
3221     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
3222 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
3223     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
3224 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
3225     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
3226 
3227 extern int vm_protect_privileged_from_untrusted;
3228 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
3229     CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
3230 extern uint64_t vm_copied_on_read;
3231 extern uint64_t vm_copied_on_read_kernel_map;
3232 extern uint64_t vm_copied_on_read_platform_map;
3233 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
3234     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
3235 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_kernel_map,
3236     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_kernel_map, "");
3237 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_platform_map,
3238     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_platform_map, "");
3239 
3240 extern int vm_shared_region_count;
3241 extern int vm_shared_region_peak;
3242 SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
3243     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
3244 SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
3245     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
3246 #if DEVELOPMENT || DEBUG
3247 extern unsigned int shared_region_pagers_resident_count;
3248 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
3249     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
3250 extern unsigned int shared_region_pagers_resident_peak;
3251 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
3252     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
3253 extern int shared_region_pager_count;
3254 SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
3255     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
3256 #if __has_feature(ptrauth_calls)
3257 extern int shared_region_key_count;
3258 SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
3259     CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
3260 extern int vm_shared_region_reslide_count;
3261 SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
3262     CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
3263 #endif /* __has_feature(ptrauth_calls) */
3264 #endif /* DEVELOPMENT || DEBUG */
3265 
3266 #if MACH_ASSERT
3267 extern int debug4k_filter;
3268 SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
3269 extern int debug4k_panic_on_terminate;
3270 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
3271 extern int debug4k_panic_on_exception;
3272 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
3273 extern int debug4k_panic_on_misaligned_sharing;
3274 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
3275 #endif /* MACH_ASSERT */
3276 
3277 extern uint64_t vm_map_set_size_limit_count;
3278 extern uint64_t vm_map_set_data_limit_count;
3279 extern uint64_t vm_map_enter_RLIMIT_AS_count;
3280 extern uint64_t vm_map_enter_RLIMIT_DATA_count;
3281 SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
3282 SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
3283 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
3284 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
3285 
3286 extern uint64_t vm_map_copy_extra_adjustments;
3287 SYSCTL_QUAD(_vm, OID_AUTO, map_copy_extra_adjustments, CTLFLAG_RD | CTLFLAG_LOCKED,
3288     &vm_map_copy_extra_adjustments, "");
3289 
3290 extern uint64_t vm_fault_resilient_media_initiate;
3291 extern uint64_t vm_fault_resilient_media_retry;
3292 extern uint64_t vm_fault_resilient_media_proceed;
3293 extern uint64_t vm_fault_resilient_media_release;
3294 extern uint64_t vm_fault_resilient_media_abort1;
3295 extern uint64_t vm_fault_resilient_media_abort2;
3296 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
3297 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
3298 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
3299 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
3300 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
3301 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
3302 #if MACH_ASSERT
3303 extern int vm_fault_resilient_media_inject_error1_rate;
3304 extern int vm_fault_resilient_media_inject_error1;
3305 extern int vm_fault_resilient_media_inject_error2_rate;
3306 extern int vm_fault_resilient_media_inject_error2;
3307 extern int vm_fault_resilient_media_inject_error3_rate;
3308 extern int vm_fault_resilient_media_inject_error3;
3309 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
3310 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
3311 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
3312 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
3313 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
3314 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
3315 #endif /* MACH_ASSERT */
3316 
3317 extern uint64_t pmap_query_page_info_retries;
3318 SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
3319 
3320 /*
3321  * A sysctl which causes all existing shared regions to become stale. They
3322  * will no longer be used by anything new and will be torn down as soon as
3323  * the last existing user exits. A write of non-zero value causes that to happen.
3324  * This should only be used by launchd, so we check that this is initproc.
3325  */
3326 static int
3327 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3328 {
3329 	unsigned int value = 0;
3330 	int changed = 0;
3331 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3332 	if (error || !changed) {
3333 		return error;
3334 	}
3335 	if (current_proc() != initproc) {
3336 		return EPERM;
3337 	}
3338 
3339 	vm_shared_region_pivot();
3340 
3341 	return 0;
3342 }
3343 
3344 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
3345     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
3346     0, 0, shared_region_pivot, "I", "");
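
/*
 * Sketch of triggering a pivot; any caller other than initproc
 * (launchd) gets EPERM per the check above:
 *
 *   int one = 1;
 *   sysctlbyname("vm.shared_region_pivot", NULL, NULL, &one, sizeof(one));
 */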
3347 
3348 extern uint64_t vm_object_shadow_forced;
3349 extern uint64_t vm_object_shadow_skipped;
3350 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
3351     &vm_object_shadow_forced, "");
3352 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
3353     &vm_object_shadow_skipped, "");
3354 
3355 extern uint64_t vm_object_upl_throttle_cnt;
3356 SYSCTL_QUAD(_vm, OID_AUTO, object_upl_throttle_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
3357     &vm_object_upl_throttle_cnt,
3358     "The number of times a UPL write was throttled due to pageout starvation");
3359 
3360 #if HAS_MTE
3361 #pragma mark MTE
3362 
3363 SYSCTL_NODE(_vm, OID_AUTO, mte, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "mte");
3364 
3365 /* sysctls for vm.mte.* counters. */
3366 
3367 SYSCTL_UINT(_vm_mte, OID_AUTO, tagged, CTLFLAG_RD,
3368     &vm_page_tagged_count, 0, "tagged pages in use");
3369 
3370 SYSCTL_QUAD(_vm_mte, OID_AUTO, refill_thread_wakeups, CTLFLAG_RD,
3371     &vm_mte_refill_thread_wakeups,
3372     "the number of times the refill thread was woken up");
3373 
3374 /* sysctls for vm.mte.free.* counters. */
3375 
3376 SYSCTL_NODE(_vm_mte, OID_AUTO, free, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "free counts");
3377 
3378 SYSCTL_UINT(_vm_mte_free, OID_AUTO, total, CTLFLAG_RD,
3379     &vm_page_free_count, 0,
3380     "total free pages (same as vm.page_free_count)");
3381 SYSCTL_UINT(_vm_mte_free, OID_AUTO, taggable, CTLFLAG_RD,
3382     &vm_page_free_taggable_count, 0,
3383     "free taggable pages in the MTE free queue");
3384 SYSCTL_UINT(_vm_mte_free, OID_AUTO, claimable, CTLFLAG_RD,
3385     &mte_claimable_queue.vmpfq_count, 0,
3386     "free tag storage pages on the MTE claimable queue");
3387 
3388 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_untagged, vm_cpu_free_count,
3389     "free untagged pages in CPU lists");
3390 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_claimed, vm_cpu_free_claimed_count,
3391     "free claimed pages in CPU lists");
3392 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_tagged, vm_cpu_free_tagged_count,
3393     "free tagged pages in CPU lists");
3394 
3395 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_0, CTLFLAG_RD,
3396     &mte_free_queues[MTE_FREE_UNTAGGABLE_0].vmpfq_count, 0,
3397     "disabled/pinned/deactivating/claimed (with 16 free pages or less) tag storage pages");
3398 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_1, CTLFLAG_RD,
3399     &mte_free_queues[MTE_FREE_UNTAGGABLE_1].vmpfq_count, 0,
3400     "claimed (with 17 free pages or more) or disabled (with 16 pages or less) tag storage pages");
3401 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_2, CTLFLAG_RD,
3402     &mte_free_queues[MTE_FREE_UNTAGGABLE_2].vmpfq_count, 0,
3403     "disabled (with 17 pages or more) tag storage pages");
3404 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_0, CTLFLAG_RD,
3405     &mte_free_queues[MTE_FREE_ACTIVE_0].vmpfq_count, 0,
3406     "active tag storages with free covered pages (bucket 0)");
3407 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_1, CTLFLAG_RD,
3408     &mte_free_queues[MTE_FREE_ACTIVE_1].vmpfq_count, 0,
3409     "active tag storages with free covered pages (bucket 1)");
3410 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_2, CTLFLAG_RD,
3411     &mte_free_queues[MTE_FREE_ACTIVE_2].vmpfq_count, 0,
3412     "active tag storages with free covered pages (bucket 2)");
3413 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_3, CTLFLAG_RD,
3414     &mte_free_queues[MTE_FREE_ACTIVE_3].vmpfq_count, 0,
3415     "active tag storages with free covered pages (bucket 3)");
3416 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_activating, CTLFLAG_RD,
3417     &mte_free_queues[MTE_FREE_UNTAGGABLE_ACTIVATING].vmpfq_count, 0,
3418     "activating/reclaiming tag storages with free covered pages");
3419 
3420 /* sysctls for vm.mte.cell.* counters. */
3421 
3422 SYSCTL_NODE(_vm_mte, OID_AUTO, cell, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "mte cell");
3423 
3424 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled, CTLFLAG_RD,
3425     &mte_info_lists[MTE_LIST_DISABLED_IDX].count, 0,
3426     "disabled tag storage pages");
3427 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled_recursive, CTLFLAG_RD,
3428     &vm_page_recursive_tag_storage_count, 0,
3429     "recursive tag storage pages");
3430 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled_unmanaged, CTLFLAG_RD,
3431     &vm_page_unmanaged_tag_storage_count, 0,
3432     "unmanaged tag storage pages");
3433 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, retired, CTLFLAG_RD,
3434     &vm_page_retired_tag_storage_count, 0,
3435     "retired tag storage pages");
3436 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, pinned, CTLFLAG_RD,
3437     &mte_info_lists[MTE_LIST_PINNED_IDX].count, 0,
3438     "unreclaimable tag storage pages");
3439 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, deactivating, CTLFLAG_RD,
3440     &mte_info_lists[MTE_LIST_DEACTIVATING_IDX].count, 0,
3441     "deactivating tag storage pages");
3442 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, claimed, CTLFLAG_RD,
3443     &mte_info_lists[MTE_LIST_CLAIMED_IDX].count, 0,
3444     "claimed tag storage pages");
3445 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, inactive, CTLFLAG_RD,
3446     &mte_info_lists[MTE_LIST_INACTIVE_IDX].count, 0,
3447     "free inactive tag storage pages");
3448 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, reclaiming, CTLFLAG_RD,
3449     &mte_info_lists[MTE_LIST_RECLAIMING_IDX].count, 0,
3450     "reclaiming tag storage pages");
3451 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, activating, CTLFLAG_RD,
3452     &mte_info_lists[MTE_LIST_ACTIVATING_IDX].count, 0,
3453     "activating tag storage pages");
3454 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, active_0, CTLFLAG_RD,
3455     &mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count, 0,
3456     "active tag storage pages with no used page tagged");
3457 static int
3458 tag_storage_active SYSCTL_HANDLER_ARGS
3459 {
3460 #pragma unused(arg1, arg2, oidp)
3461 	uint32_t value = mteinfo_tag_storage_active(false);
3462 
3463 	return SYSCTL_OUT(req, &value, sizeof(value));
3464 }
3465 SYSCTL_PROC(_vm_mte_cell, OID_AUTO, active,
3466     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3467     0, 0, &tag_storage_active, "I",
3468     "active tag storage pages");
3469 
3470 /* sysctls for vm.mte.tag_storage.* counters. */
3471 
3472 SYSCTL_NODE(_vm_mte, OID_AUTO, tag_storage, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "mte tag storage");
3473 
3474 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, reserved, CTLFLAG_RD,
3475     &vm_page_tag_storage_reserved, 0,
3476     "free tag storage pages reserve");
3477 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, wired, CTLFLAG_RD,
3478     &vm_page_wired_tag_storage_count, 0,
3479     "wired tag storage pages");
3480 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, activations, CTLFLAG_RD,
3481     &vm_page_tag_storage_activation_count,
3482     "tag storage activations (inactive/claimed -> active)");
3483 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, deactivations, CTLFLAG_RD,
3484     &vm_page_tag_storage_deactivation_count,
3485     "tag storage deactivations (active -> inactive)");
3486 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaims, CTLFLAG_RD,
3487     &vm_page_tag_storage_reclaim_success_count,
3488     "successful tag storage reclamations");
3489 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaims_from_cpu, CTLFLAG_RD,
3490     &vm_page_tag_storage_reclaim_from_cpu_count,
3491     "successful tag storage reclamations from the cpu free lists");
3492 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_failures, CTLFLAG_RD,
3493     &vm_page_tag_storage_reclaim_failure_count,
3494     "failed tag storage reclamations");
3495 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_wired_failures, CTLFLAG_RD,
3496     &vm_page_tag_storage_reclaim_wired_failure_count,
3497     "failed tag storage reclamations due to tag storage being wired");
3498 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, wire_relocations, CTLFLAG_RD,
3499     &vm_page_tag_storage_wire_relocation_count,
3500     "tag storage relocations due to wiring");
3501 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_compressor_failures, CTLFLAG_RD,
3502     &vm_page_tag_storage_reclaim_compressor_failure_count,
3503     "failed tag storage reclamations due to tag storage used in compressor pool");
3504 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, compressor_relocations, CTLFLAG_RD,
3505     &vm_page_tag_storage_compressor_relocation_count,
3506     "tag storage relocations due to compressor pool");
3507 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, free_unmanaged, CTLFLAG_RD,
3508     &vm_page_free_unmanaged_tag_storage_count, 0,
3509     "number of free unmanaged tag storage pages");
3510 
3511 SYSCTL_SCALABLE_COUNTER(_vm_mte_tag_storage, cpu_allocated_claimed,
3512     vm_cpu_claimed_count, "claimed tag storage pages allocated");
3513 
3514 static int
3515 tag_storage_fragmentation SYSCTL_HANDLER_ARGS
3516 {
3517 #pragma unused(arg1, arg2, oidp)
3518 	uint32_t value = mteinfo_tag_storage_fragmentation(false);
3519 
3520 	return SYSCTL_OUT(req, &value, sizeof(value));
3521 }
3522 SYSCTL_PROC(_vm_mte_tag_storage, OID_AUTO, fragmentation,
3523     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3524     0, 0, &tag_storage_fragmentation, "I",
3525     "the achievable fragmentation of the tag storage space (in parts per thousand)");
3526 
3527 static int
3528 tag_storage_fragmentation_actual SYSCTL_HANDLER_ARGS
3529 {
3530 #pragma unused(arg1, arg2, oidp)
3531 	uint32_t value = mteinfo_tag_storage_fragmentation(true);
3532 
3533 	return SYSCTL_OUT(req, &value, sizeof(value));
3534 }
3535 SYSCTL_PROC(_vm_mte_tag_storage, OID_AUTO, fragmentation_actual,
3536     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3537     0, 0, &tag_storage_fragmentation_actual, "I",
3538     "the actual fragmentation of the tag storage space (in parts per thousand)");
3539 
3540 /* sysctls for vm.mte.compressor_* */
3541 
3542 extern unsigned int vm_object_no_compressor_pager_for_mte_count;
3543 SYSCTL_INT(_vm_mte, OID_AUTO, no_compressor_pager_for_mte, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_no_compressor_pager_for_mte_count, 0, "");
3544 
3545 /* sysctls for MTE compression stats */
3546 
3547 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_compressed, compressor_tagged_pages_compressed, "");
3548 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_decompressed, compressor_tagged_pages_decompressed, "");
3549 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_freed, compressor_tagged_pages_freed, "");
3550 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_corrupted, compressor_tagged_pages_corrupted, "");
3551 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_overhead_bytes, compressor_tags_overhead_bytes, "");
3552 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages, compressor_tagged_pages, "");
3553 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_ts_pages_used, compressor_tag_storage_pages_in_pool,
3554     "the number of tag storage pages used in the compressor");
3555 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_non_ts_pages_used, compressor_non_tag_storage_pages_in_pool,
3556     "the number of non-tag storage pages used in the compressor");
3557 #if DEVELOPMENT || DEBUG
3558 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_all_zero, compressor_tags_all_zero, "");
3559 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_same_value, compressor_tags_same_value, "");
3560 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_below_align, compressor_tags_below_align, "");
3561 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_above_align, compressor_tags_above_align, "");
3562 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_incompressible, compressor_tags_incompressible, "");
3563 #endif /* DEVELOPMENT || DEBUG */
3564 
3565 #endif /* HAS_MTE */
3566 
3567 SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
3568     &vmtc_total, 0, "total text page corruptions detected");
3569 
3570 
3571 #if DEBUG || DEVELOPMENT
3572 /*
3573  * A sysctl that can be used to corrupt a text page with an illegal instruction.
3574  * Used for testing text page self healing.
3575  */
3576 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
3577 static int
3578 corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3579 {
3580 	uint64_t value = 0;
3581 	int error = sysctl_handle_quad(oidp, &value, 0, req);
3582 	if (error || !req->newptr) {
3583 		return error;
3584 	}
3585 
3586 	if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
3587 		return 0;
3588 	} else {
3589 		return EINVAL;
3590 	}
3591 }
3592 
3593 SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
3594     CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3595     0, 0, corrupt_text_addr, "-", "");
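
/*
 * Sketch of exercising the self-healing test hook on a
 * DEVELOPMENT/DEBUG kernel; the address below is purely hypothetical:
 *
 *   uint64_t addr = 0x100004000ULL;   // hypothetical text address
 *   if (sysctlbyname("vm.corrupt_text_addr",
 *           NULL, NULL, &addr, sizeof(addr)) != 0) {
 *       // EINVAL: the address could not be corrupted
 *   }
 */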
3596 #endif /* DEBUG || DEVELOPMENT */
3597 
3598 #if CONFIG_MAP_RANGES
3599 /*
3600  * vm.malloc_ranges
3601  *
3602  * space-separated list of <left:right> hexadecimal addresses.
3603  */
3604 static int
3605 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
3606 {
3607 	vm_map_t map = current_map();
3608 	struct mach_vm_range r1, r2;
3609 	char str[20 * 4];
3610 	int len;
3611 	mach_vm_offset_t right_hole_max;
3612 
3613 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
3614 		return ENOENT;
3615 	}
3616 	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
3617 		return ENOENT;
3618 	}
3619 
3620 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
3621 	right_hole_max = MACH_VM_JUMBO_ADDRESS;
3622 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
3623 	right_hole_max = get_map_max(map);
3624 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
3625 
3626 	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
3627 	    r1.max_address, r2.min_address,
3628 	    r2.max_address, right_hole_max);
3629 
3630 	return SYSCTL_OUT(req, str, len);
3631 }
3632 
3633 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
3634     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3635     0, 0, &vm_map_malloc_ranges, "A", "");
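
/*
 * Sketch of consuming vm.malloc_ranges from userspace; the format is
 * the two space-separated "left:right" hex pairs built above, i.e.
 * the address holes around the default and heap ranges:
 *
 *   char str[80];
 *   size_t len = sizeof(str);
 *   if (sysctlbyname("vm.malloc_ranges", str, &len, NULL, 0) == 0) {
 *       unsigned long long l1, r1, l2, r2;
 *       if (sscanf(str, "0x%llx:0x%llx 0x%llx:0x%llx",
 *               &l1, &r1, &l2, &r2) == 4) {
 *           // [l1, r1) and [l2, r2) are usable by the allocator
 *       }
 *   }
 */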
3636 
3637 #if DEBUG || DEVELOPMENT
3638 static int
3639 vm_map_user_range_default SYSCTL_HANDLER_ARGS
3640 {
3641 #pragma unused(arg1, arg2, oidp)
3642 	struct mach_vm_range range;
3643 
3644 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
3645 	    != KERN_SUCCESS) {
3646 		return EINVAL;
3647 	}
3648 
3649 	return SYSCTL_OUT(req, &range, sizeof(range));
3650 }
3651 
3652 static int
3653 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3654 {
3655 #pragma unused(arg1, arg2, oidp)
3656 	struct mach_vm_range range;
3657 
3658 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3659 	    != KERN_SUCCESS) {
3660 		return EINVAL;
3661 	}
3662 
3663 	return SYSCTL_OUT(req, &range, sizeof(range));
3664 }
3665 
3666 static int
3667 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3668 {
3669 #pragma unused(arg1, arg2, oidp)
3670 	struct mach_vm_range range;
3671 
3672 	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3673 	    != KERN_SUCCESS) {
3674 		return EINVAL;
3675 	}
3676 
3677 	return SYSCTL_OUT(req, &range, sizeof(range));
3678 }
3679 
3680 /*
3681  * A sysctl that can be used to return ranges for the current VM map.
3682  * Used for testing VM ranges.
3683  */
3684 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3685     0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
3686 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3687     0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
3688 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3689     0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
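
/*
 * Sketch of reading one of these struct-typed sysctls; this assumes
 * the userspace-visible struct mach_vm_range layout of two 64-bit
 * fields, min_address and max_address:
 *
 *   struct mach_vm_range range;
 *   size_t len = sizeof(range);
 *   if (sysctlbyname("vm.vm_map_user_range_heap",
 *           &range, &len, NULL, 0) == 0) {
 *       printf("heap range: 0x%llx-0x%llx\n",
 *           (unsigned long long)range.min_address,
 *           (unsigned long long)range.max_address);
 *   }
 */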
3690 
3691 #endif /* DEBUG || DEVELOPMENT */
3692 #endif /* CONFIG_MAP_RANGES */
3693 
3694 #if DEBUG || DEVELOPMENT
3695 #endif /* DEBUG || DEVELOPMENT */
3696 
3697 extern uint64_t vm_map_range_overflows_count;
3698 SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
3699 extern boolean_t vm_map_range_overflows_log;
3700 SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3701 
3702 extern uint64_t c_seg_filled_no_contention;
3703 extern uint64_t c_seg_filled_contention;
3704 extern clock_sec_t c_seg_filled_contention_sec_max;
3705 extern clock_nsec_t c_seg_filled_contention_nsec_max;
3706 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
3707 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
3708 SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
3709 SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
3710 #if (XNU_TARGET_OS_OSX && __arm64__)
3711 extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
3712 extern int c_process_major_yield_after; /* yield after moving ? segments */
3713 extern uint64_t c_process_major_reports;
3714 extern clock_sec_t c_process_major_max_sec;
3715 extern clock_nsec_t c_process_major_max_nsec;
3716 extern uint32_t c_process_major_peak_segcount;
3717 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
3718 SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
3719 SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
3720 SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
3721 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
3722 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
3723 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3724 
3725 #if DEVELOPMENT || DEBUG
3726 extern int panic_object_not_alive;
3727 SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
3728 #endif /* DEVELOPMENT || DEBUG */
3729 
3730 #if FBDP_DEBUG_OBJECT_NO_PAGER
3731 extern int fbdp_no_panic;
3732 SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
3733 #endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
3734 
3735 extern uint64_t cluster_direct_write_wired;
3736 SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");
3737 
3738 extern uint64_t vm_object_pageout_not_on_queue;
3739 extern uint64_t vm_object_pageout_not_pageable;
3740 extern uint64_t vm_object_pageout_pageable;
3741 extern uint64_t vm_object_pageout_active_local;
3742 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_on_queue, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_on_queue, "");
3743 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_pageable, "");
3744 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_pageable, "");
3745 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_active_local, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_active_local, "");
3746 
3747 
3748 #if DEVELOPMENT || DEBUG
3749 
3750 static uint32_t
3751 sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
3752 {
3753 #if HAS_MTE
3754 	if (with_data == VM_C_SERIALIZE_DATA_TAGS) {
3755 		return VM_C_SEGMENT_INFO_MAGIC_WITH_TAGS;
3756 	}
3757 #else
3758 #pragma unused(with_data)
3759 #endif /* HAS_MTE */
3760 	return VM_C_SEGMENT_INFO_MAGIC;
3761 }
3762 
3763 /* The largest possible single segment + its slots is
3764  * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
3765 #define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)
3766 
3767 extern uint32_t c_segments_available;
3768 
3769 struct sysctl_buf_header {
3770 	uint32_t magic;
3771 } __attribute__((packed));
3772 
3773 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3774  * Instead of doing everything here, it calls into a helper in vm_compressor.c. */
3775 static int
3776 sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
3777 {
3778 	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3779 	if (!buf) {
3780 		return ENOMEM;
3781 	}
3782 	size_t offset = 0;
3783 	int error = 0;
3784 	int segno = 0;
3785 	/* 4-byte header identifying the version of the data format.
3786 	 * The magic should be incremented if c_segment_info or c_slot_info are changed. */
3787 	((struct sysctl_buf_header*)buf)->magic = sysctl_compressor_seg_magic(with_data);
3788 	offset += sizeof(uint32_t);
3789 
3790 	while (segno < c_segments_available) {
3791 		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3792 		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
3793 		if (kr == KERN_NO_SPACE) {
3794 			/* failed to add another segment, push the current buffer out and try again */
3795 			if (offset == 0) {
3796 				error = EINVAL; /* no space even in an empty buffer; shouldn't really happen */
3797 				goto out;
3798 			}
3799 			/* write out chunk */
3800 			error = SYSCTL_OUT(req, buf, offset);
3801 			if (error) {
3802 				goto out;
3803 			}
3804 			offset = 0;
3805 			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3806 			/* don't increment segno, need to try again saving the current one */
3807 		} else if (kr != KERN_SUCCESS) {
3808 			error = EINVAL;
3809 			goto out;
3810 		} else {
3811 			offset += left_sz;
3812 			++segno;
3813 			assert(offset <= SYSCTL_SEG_BUF_SIZE);
3814 		}
3815 	}
3816 
3817 	if (offset > 0) { /* write last chunk */
3818 		error = SYSCTL_OUT(req, buf, offset);
3819 	}
3820 
3821 out:
3822 	kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
3823 	return error;
3824 }
3825 
3826 static int
3827 sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3828 {
3829 	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
3830 }
3831 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
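
/*
 * Sketch of reading this variable-length blob from userspace. Passing
 * a NULL buffer probes the total size (SYSCTL_OUT only accumulates
 * the length in that case); a robust caller retries if the segment
 * population changes between the two calls:
 *
 *   size_t len = 0;
 *   sysctlbyname("vm.compressor_segments", NULL, &len, NULL, 0);
 *   char *buf = malloc(len);
 *   if (buf && sysctlbyname("vm.compressor_segments",
 *           buf, &len, NULL, 0) == 0) {
 *       uint32_t magic;
 *       memcpy(&magic, buf, sizeof(magic));   // version header above
 *       // ... parse the serialized segment records that follow ...
 *   }
 */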
3832 
3833 #if HAS_MTE
3834 static int
3835 sysctl_compressor_segments_data(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3836 {
3837 	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_TAGS);
3838 }
3839 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments_data, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments_data, "S", "");
3840 #endif /* HAS_MTE */
3841 
3842 extern uint32_t vm_compressor_fragmentation_level(void);
3843 
3844 static int
3845 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3846 {
3847 	uint32_t value = vm_compressor_fragmentation_level();
3848 	return SYSCTL_OUT(req, &value, sizeof(value));
3849 }
3850 
3851 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3852 
3853 extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);
3854 
3855 static int
3856 sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3857 {
3858 	uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
3859 	return SYSCTL_OUT(req, &value, sizeof(value));
3860 }
3861 
3862 SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");
3863 
3864 
3865 
3866 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3867 
3868 
3869 /* This sysctl iterates over all the entries of the vm_map of a given process and writes some info about the vm_object pointed to by each entry.
3870  * This can be used to map where all of a process's pages are located in the compressor.
3871  */
3872 static int
3873 sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3874 {
3875 	int error = 0;
3876 	char *buf = NULL;
3877 	proc_t p = PROC_NULL;
3878 	task_t task = TASK_NULL;
3879 	vm_map_t map = VM_MAP_NULL;
3880 	__block size_t offset = 0;
3881 
3882 	/* Go from pid to proc to task to vm_map; see sysctl_procargsx() for another example of this progression. */
3883 	int *name = arg1;
3884 	int namelen = arg2;
3885 	if (namelen < 1) {
3886 		return EINVAL;
3887 	}
3888 	int pid = name[0];
3889 	p = proc_find(pid);  /* this increments a reference to the proc */
3890 	if (p == PROC_NULL) {
3891 		return EINVAL;
3892 	}
3893 	task = proc_task(p);
3894 	proc_rele(p);  /* decrement ref of proc */
3895 	p = PROC_NULL;
3896 	if (task == TASK_NULL) {
3897 		return EINVAL;
3898 	}
3899 	/* convert proc reference to task reference */
3900 	task_reference(task);
3901 	/* task reference to map reference */
3902 	map = get_task_map_reference(task);
3903 	task_deallocate(task);
3904 
3905 	if (map == VM_MAP_NULL) {
3906 		return EINVAL;  /* nothing allocated yet */
3907 	}
3908 
3909 	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
3910 	if (!buf) {
3911 		error = ENOMEM;
3912 		goto out;
3913 	}
3914 
3915 	/* 4-byte header identifying the version of the data format.
3916 	 * The magic should be incremented if the vm_map entry info structures are changed. */
3917 	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
3918 	offset += sizeof(uint32_t);
3919 
3920 	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
3921 		/* write the header, happens only once at the beginning so we should have enough space */
3922 		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3923 		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
3924 		out_hdr->vmi_nentries = nentries;
3925 		offset += sizeof(struct vm_map_info_hdr);
3926 		return KERN_SUCCESS;
3927 	};
3928 
3929 	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
3930 		while (true) { /* try up to 2 times: first write to the current buffer, otherwise flush and retry with a fresh one */
3931 			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
3932 			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
3933 			if (kr == KERN_NO_SPACE) {
3934 				/* failed to write anything, flush the current buffer and try again */
3935 				if (offset == 0) {
3936 					return KERN_FAILURE; /* no space even in an empty buffer; shouldn't really happen */
3937 				}
3938 				/* write out chunk */
3939 				int out_error = SYSCTL_OUT(req, buf, offset);
3940 				if (out_error) {
3941 					return KERN_FAILURE;
3942 				}
3943 				offset = 0;
3944 				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3945 				continue; /* need to retry the entry dump again with the cleaned buffer */
3946 			} else if (kr != KERN_SUCCESS) {
3947 				return kr;
3948 			}
3949 			offset += left_sz;
3950 			break;
3951 		}
3952 		return KERN_SUCCESS;
3953 	};
3954 
3955 	/* This foreach first invokes the first callback with the number of entries, then invokes the second for every entry.
3956 	 * When the buffer is exhausted, it is flushed to the sysctl and restarted. */
3957 	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);
3958 
3959 	if (kr != KERN_SUCCESS) {
3960 		goto out;
3961 	}
3962 
3963 	if (offset > 0) { /* last chunk */
3964 		error = SYSCTL_OUT(req, buf, offset);
3965 	}
3966 
3967 out:
3968 	if (buf != NULL) {
3969 		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3970 	}
3971 	if (map != NULL) {
3972 		vm_map_deallocate(map);
3973 	}
3974 	return error;
3975 }
3976 
3977 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
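
/*
 * Sketch of invoking this node-style sysctl; the target pid travels
 * as a trailing MIB element, mirroring the sysctl_procargsx() pattern
 * referenced above:
 *
 *   int mib[CTL_MAXNAME];
 *   size_t miblen = CTL_MAXNAME;
 *   sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen);
 *   mib[miblen] = getpid();                              // target pid
 *   size_t len = 0;
 *   sysctl(mib, (u_int)miblen + 1, NULL, &len, NULL, 0); // probe size
 *   char *buf = malloc(len);
 *   sysctl(mib, (u_int)miblen + 1, buf, &len, NULL, 0);
 */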
3978 
3979 #pragma mark VM Host Statistics
3980 
3981 SYSCTL_NODE(_vm, OID_AUTO, stat, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Host memory statistics");
3982 
3983 SYSCTL_SCALABLE_COUNTER(_vm_stat, zero_fills, vm_statistics_zero_fill_count, "Pages zero-filled");
3984 SYSCTL_SCALABLE_COUNTER(_vm_stat, reactivations, vm_statistics_reactivations, "Pages reactivated");
3985 SYSCTL_SCALABLE_COUNTER(_vm_stat, pageins, vm_statistics_pageins, "Pages paged-in (including speculation)");
3986 SYSCTL_SCALABLE_COUNTER(_vm_stat, pageins_requested, vm_statistics_pageins_requested, "Page-ins requested");
3987 SYSCTL_SCALABLE_COUNTER(_vm_stat, pageins_aborted, vm_statistics_pageins_aborted, "Pages aborted during page-in");
3988 SYSCTL_SCALABLE_COUNTER(_vm_stat, pageouts, vm_statistics_pageouts, "Pages paged-out");
3989 SYSCTL_SCALABLE_COUNTER(_vm_stat, faults, vm_statistics_faults, "Pages faulted");
3990 SYSCTL_SCALABLE_COUNTER(_vm_stat, cow_faults, vm_statistics_cow_faults, "Pages faulted due to copy-on-write");
3991 SYSCTL_SCALABLE_COUNTER(_vm_stat, obj_cache_lookups, vm_statistics_lookups, "Pages looked up in the object-cache");
3992 SYSCTL_SCALABLE_COUNTER(_vm_stat, obj_cache_hits, vm_statistics_hits, "Object-cache lookup hits");
3993 SYSCTL_SCALABLE_COUNTER(_vm_stat, purges, vm_statistics_purges, "Pages purged");
3994 SYSCTL_SCALABLE_COUNTER(_vm_stat, decompressions, vm_statistics_decompressions, "Pages decompressed");
3995 SYSCTL_SCALABLE_COUNTER(_vm_stat, compressions, vm_statistics_compressions, "Pages compressed");
3996 SYSCTL_SCALABLE_COUNTER(_vm_stat, swapins, vm_statistics_swapins, "Pages swapped in");
3997 SYSCTL_SCALABLE_COUNTER(_vm_stat, swapouts, vm_statistics_swapouts, "Pages swapped out");
3998 
3999 static int
4000 sysctl_vm_reset_tag SYSCTL_HANDLER_ARGS
4001 {
4002 #pragma unused(oidp, arg1, arg2)
4003 	int error;
4004 	int tag;
4005 	kern_return_t kr;
4006 
4007 	/* Need to be root */
4008 	if (!kauth_cred_issuser(kauth_cred_get())) {
4009 		return EPERM;
4010 	}
4011 
4012 	error = SYSCTL_IN(req, &tag, sizeof(tag));
4013 	if (error) {
4014 		return error;
4015 	}
4016 
4017 	if (tag > VM_MAX_TAG_VALUE) {
4018 		return EINVAL;
4019 	}
4020 
4021 	kr = vm_tag_reset_peak((vm_tag_t)tag);
4022 
4023 	return mach_to_bsd_errno(kr);
4024 }
4025 
4026 SYSCTL_PROC(_vm, OID_AUTO, reset_tag,
4027     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
4028     0, 0, &sysctl_vm_reset_tag, "I", "");
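
/*
 * Sketch (root only) of resetting the peak accounting for a single VM
 * allocation tag; the tag value here is hypothetical:
 *
 *   int tag = 5;   // hypothetical vm_tag_t value
 *   sysctlbyname("vm.reset_tag", NULL, NULL, &tag, sizeof(tag));
 */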
4029 
4030 static int
4031 sysctl_vm_reset_all_tags SYSCTL_HANDLER_ARGS
4032 {
4033 #pragma unused(oidp, arg1, arg2)
4034 	/* Only reset the values if the sysctl is a write */
4035 	if (!req->newptr) {
4036 		return EINVAL;
4037 	}
4038 
4039 	/* Need to be root */
4040 	if (!kauth_cred_issuser(kauth_cred_get())) {
4041 		return EPERM;
4042 	}
4043 
4044 	vm_tag_reset_all_peaks();
4045 
4046 	return 0;
4047 }
4048 
4049 SYSCTL_PROC(_vm, OID_AUTO, reset_all_tags,
4050     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
4051     0, 0, &sysctl_vm_reset_all_tags, "I", "");
4052 
4053 #endif /* DEVELOPMENT || DEBUG */
4054 
4055 SYSCTL_NODE(_vm, OID_AUTO, compressor, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor");
4056 
4057 SYSCTL_INT(_vm_compressor, OID_AUTO, mode, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_mode, 0, "");
4058 SYSCTL_INT(_vm_compressor, OID_AUTO, is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_is_active, 0, "");
4059 SYSCTL_INT(_vm_compressor, OID_AUTO, is_available, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_available, 0, "");
4060 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed, CTLFLAG_RD | CTLFLAG_LOCKED,
4061     &c_segment_pages_compressed, 0, "The amount of uncompressed data stored in the compressor (in pages)");
4062 #if CONFIG_FREEZE
4063 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_incore, CTLFLAG_RD | CTLFLAG_LOCKED,
4064     &c_segment_pages_compressed_incore, 0, "The amount of uncompressed data stored in the in-core compressor (in pages)");
4065 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_incore_late_swapout, CTLFLAG_RD | CTLFLAG_LOCKED,
4066     &c_segment_pages_compressed_incore_late_swapout, 0, "The amount of uncompressed data stored in the in-core compressor and queued for swapout (in pages)");
4067 #endif
4068 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_limit, CTLFLAG_RD | CTLFLAG_LOCKED,
4069     &c_segment_pages_compressed_limit, 0, "The limit on the amount of uncompressed data the compressor will store (in pages)");
4070 
4071 SYSCTL_NODE(_vm_compressor, OID_AUTO, segment, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Segment Counts");
4072 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, total, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_count, 0, "Number of allocated segments");
4073 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, aging, CTLFLAG_RD | CTLFLAG_LOCKED, &c_age_count, 0, "Number of aging segments");
4074 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_early, CTLFLAG_RD | CTLFLAG_LOCKED, &c_early_swappedin_count, 0, "Number of (early) swapped-in segments");
4075 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_regular, CTLFLAG_RD | CTLFLAG_LOCKED, &c_regular_swappedin_count, 0, "Number of (regular) swapped-in segments");
4076 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_late, CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swappedin_count, 0, "Number of (late) swapped-in segments");
4077 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_early, CTLFLAG_RD | CTLFLAG_LOCKED, &c_early_swapout_count, 0, "Number of (early) ready-to-swap segments");
4078 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_regular, CTLFLAG_RD | CTLFLAG_LOCKED, &c_regular_swapout_count, 0, "Number of (regular) ready-to-swap segments");
4079 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_late, CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swapout_count, 0, "Number of (late) ready-to-swap segments");
4080 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapio, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swapio_count, 0, "Number of swapping-out segments");
4081 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedout, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swappedout_count, 0, "Number of (non-sparse) swapped-out segments");
4082 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedout_sparse, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swappedout_sparse_count, 0, "Number of (sparse) swapped-out segments");
4083 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, majorcompact, CTLFLAG_RD | CTLFLAG_LOCKED, &c_major_count, 0, "Number of recently-compacted segments");
4084 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, minorcompact, CTLFLAG_RD | CTLFLAG_LOCKED, &c_minor_count, 0, "Number of segments queued for deferred minor compaction");
4085 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, filling, CTLFLAG_RD | CTLFLAG_LOCKED, &c_filling_count, 0, "Number of filling segments");
4086 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, empty, CTLFLAG_RD | CTLFLAG_LOCKED, &c_empty_count, 0, "Number of empty segments");
4087 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, bad, CTLFLAG_RD | CTLFLAG_LOCKED, &c_bad_count, 0, "Number of bad segments");
4088 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, limit, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segments_limit, 0, "Limit on the number of allocated segments");
4089 
4090 SYSCTL_NODE(_vm_compressor, OID_AUTO, svp, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Single-Value");
4091 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, in_hash, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_in_hash, 0, "");
4092 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, hash_succeeded, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_hash_succeeded, 0, "");
4093 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, hash_failed, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_hash_failed, 0, "");
4094 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, zval_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_zero_compressions, 0, "");
4095 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, zval_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_zero_decompressions, 0, "");
4096 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, nzval_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_nonzero_compressions, 0, "");
4097 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, nzval_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_nonzero_decompressions, 0, "");
4098 
4099 SYSCTL_NODE(_vm_compressor, OID_AUTO, compactor, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Compactor");
4100 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_completed, CTLFLAG_RD | CTLFLAG_LOCKED,
4101     &vm_pageout_vminfo.vm_compactor_major_compactions_completed, "Major compactions completed");
4102 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_considered, CTLFLAG_RD | CTLFLAG_LOCKED,
4103     &vm_pageout_vminfo.vm_compactor_major_compactions_considered, "Major compactions considered");
4104 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_bailed, CTLFLAG_RD | CTLFLAG_LOCKED,
4105     &vm_pageout_vminfo.vm_compactor_major_compactions_bailed, "Major compactions bailed (due to contention)");
4106 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_bytes_moved, CTLFLAG_RD | CTLFLAG_LOCKED,
4107     &vm_pageout_vminfo.vm_compactor_major_compaction_bytes_moved, "Bytes moved between segments during major compactions");
4108 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_slots_moved, CTLFLAG_RD | CTLFLAG_LOCKED,
4109     &vm_pageout_vminfo.vm_compactor_major_compaction_slots_moved, "Slots moved between segments during major compactions");
4110 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_bytes_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
4111     &vm_pageout_vminfo.vm_compactor_major_compaction_bytes_freed, "Bytes freed as a result of major compaction");
4112 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_segments_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
4113     &vm_pageout_vminfo.vm_compactor_major_compaction_segments_freed, "Segments freed as a result of major compaction");
4114 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, swapouts_queued, CTLFLAG_RD | CTLFLAG_LOCKED,
4115     &vm_pageout_vminfo.vm_compactor_swapouts_queued, "The number of segments queued for swapout after a major compaction");
4116 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, swapout_bytes_wasted, CTLFLAG_RD | CTLFLAG_LOCKED,
4117     &vm_pageout_vminfo.vm_compactor_swapout_bytes_wasted, "The number of unused bytes in segments queued for swapout");
4118