/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1987 Carnegie-Mellon University
 * All rights reserved. The CMU software License Agreement specifies
 * the terms and conditions for use and redistribution.
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <vm/vm_options.h>

#include <kern/ecc.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/debug.h>
#include <kern/extmod_statistics.h>
#include <mach/mach_traps.h>
#include <mach/port.h>
#include <mach/sdt.h>
#include <mach/task.h>
#include <mach/task_access.h>
#include <mach/task_special_ports.h>
#include <mach/time_value.h>
#include <mach/vm_map.h>
#include <mach/vm_param.h>
#include <mach/vm_prot.h>
#include <machine/machine_routines.h>

#include <sys/file_internal.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dir.h>
#include <sys/namei.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/vnode_internal.h>
#include <sys/mount.h>
#include <sys/xattr.h>
#include <sys/trace.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/user.h>
#include <sys/syslog.h>
#include <sys/stat.h>
#include <sys/sysproto.h>
#include <sys/mman.h>
#include <sys/sysctl.h>
#include <sys/cprotect.h>
#include <sys/kpi_socket.h>
#include <sys/kas_info.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/random.h>
#include <sys/code_signing.h>
#if NECP
#include <net/necp.h>
#endif /* NECP */
#if SKYWALK
#include <skywalk/os_channel.h>
#endif /* SKYWALK */

#include <security/audit/audit.h>
#include <security/mac.h>
#include <bsm/audit_kevents.h>

#include <kern/kalloc.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_kern_xnu.h>
#include <vm/vm_pageout_xnu.h>

#include <mach/shared_region.h>
#include <vm/vm_shared_region_internal.h>

#include <vm/vm_dyld_pager_internal.h>
#include <vm/vm_protos_internal.h>
#include <vm/vm_compressor_info.h>      /* for c_segment_info */
#include <vm/vm_compressor_internal.h>
#include <vm/vm_compressor_xnu.h>       /* for vm_compressor_serialize_segment_debug_info() */
#include <vm/vm_object_xnu.h>           /* for vm_chead_select_t */
#include <vm/vm_memory_entry_xnu.h>
#include <vm/vm_iokit.h>
#include <vm/vm_reclaim_xnu.h>
#if HAS_MTE
#include <vm/vm_compressor_xnu.h>
#include <vm/vm_mteinfo_internal.h>
#endif /* HAS_MTE */

#include <sys/kern_memorystatus.h>
#include <sys/kern_memorystatus_freeze.h>
#include <sys/proc_internal.h>

#include <mach-o/fixup-chains.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif

#include <kern/bits.h>

#if CONFIG_CSR
#include <sys/csr.h>
#endif /* CONFIG_CSR */
#include <sys/trust_caches.h>
#include <libkern/amfi/amfi.h>
#include <IOKit/IOBSD.h>

#if VM_MAP_DEBUG_APPLE_PROTECT
SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

#if DEVELOPMENT || DEBUG

extern int vm_object_cache_evict_all(void);
static int
sysctl_vm_object_cache_evict SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, req)
	(void) vm_object_cache_evict_all();
	return 0;
}

SYSCTL_PROC(_vm, OID_AUTO, object_cache_evict, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_vm_object_cache_evict, "I", "");

static int
sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	vm_offset_t kaddr;
	kern_return_t kr;
	int error = 0;
	int size = 0;

	error = sysctl_handle_int(oidp, &size, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);

	if (kr == KERN_SUCCESS) {
		kmem_free(kernel_map, kaddr, size);
	}

	return error;
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_alloc_contig, "I", "");
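
/*
 * Usage sketch (illustrative, not part of the build): on a DEVELOPMENT or
 * DEBUG kernel, a root process can exercise the contiguous-allocation path
 * by writing a size to this sysctl; the handler allocates and immediately
 * frees, so the only observable result is whether the allocation succeeded:
 *
 *	int size = 64 * 1024;
 *	sysctlbyname("vm.kmem_alloc_contig", NULL, NULL, &size, sizeof(size));
 */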

extern int vm_region_footprint;
SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");

static int
sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	kmem_gobj_stats stats = kmem_get_gobj_stats();

	return SYSCTL_OUT(req, &stats, sizeof(stats));
}

SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");
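
/*
 * Usage sketch (illustrative, not part of the build; assumes the
 * kmem_gobj_stats layout is visible to the caller): CTLTYPE_STRUCT sysctls
 * like the one above are read by handing sysctlbyname(3) a buffer of the
 * exported struct type; "len" comes back as the number of bytes copied out:
 *
 *	kmem_gobj_stats stats;
 *	size_t len = sizeof(stats);
 *	if (sysctlbyname("vm.kmem_gobj_stats", &stats, &len, NULL, 0) == 0) {
 *		// inspect the stats
 *	}
 */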

#endif /* DEVELOPMENT || DEBUG */

static int
sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int error = 0;
	int value;

	value = task_self_region_footprint();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}
	task_self_region_footprint_set(value);
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");
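
/*
 * Usage sketch (illustrative, not part of the build): since the OID is
 * CTLFLAG_ANYBODY, an unprivileged process may opt itself in and out of
 * footprint-style region reporting around a measurement:
 *
 *	int on = 1, off = 0;
 *	sysctlbyname("vm.self_region_footprint", NULL, NULL, &on, sizeof(on));
 *	// ... query region info, which now reflects footprint accounting ...
 *	sysctlbyname("vm.self_region_footprint", NULL, NULL, &off, sizeof(off));
 */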

static int
sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int error = 0;
	int value;

	value = (1 << thread_self_region_page_shift());
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (value != 0 && value != 4096 && value != 16384) {
		return EINVAL;
	}

#if !__ARM_MIXED_PAGE_SIZE__
	if (value != vm_map_page_size(current_map())) {
		return EINVAL;
	}
#endif /* !__ARM_MIXED_PAGE_SIZE__ */

	thread_self_region_page_shift_set(bit_first(value));
	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
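
/*
 * Usage sketch (illustrative, not part of the build): a test running as a
 * 4K-page process on a 16K-page system can ask for region info expressed in
 * its own page size; 0 restores the default, and anything other than 0,
 * 4096 or 16384 is rejected with EINVAL:
 *
 *	int pgsz = 4096;
 *	sysctlbyname("vm.self_region_page_size", NULL, NULL, &pgsz, sizeof(pgsz));
 */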

static int
sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int error = 0;
	int value;
	kern_return_t kr;

	value = task_self_region_info_flags();
	error = SYSCTL_OUT(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	if (!req->newptr) {
		return 0;
	}

	error = SYSCTL_IN(req, &value, sizeof(int));
	if (error) {
		return error;
	}

	kr = task_self_region_info_flags_set(value);
	if (kr != KERN_SUCCESS) {
		return EINVAL;
	}

	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");


#if DEVELOPMENT || DEBUG
extern int panic_on_unsigned_execute;
SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");

extern int vm_log_xnu_user_debug;
SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
#endif /* DEVELOPMENT || DEBUG */

extern int vm_log_map_delete_permanent_prot_none;
SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");

extern int cs_executable_create_upl;
extern int cs_executable_wire;
SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");

extern int apple_protect_pager_count;
extern int apple_protect_pager_count_mapped;
extern unsigned int apple_protect_pager_cache_limit;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");

#if DEVELOPMENT || DEBUG
extern int radar_20146450;
SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");

extern int macho_printf;
SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");

extern int apple_protect_pager_data_request_debug;
SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");

extern unsigned int vm_object_copy_delayed_paging_wait_disable;
EXPERIMENT_FACTOR_LEGACY_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");

__enum_closed_decl(vm_submap_test_op, uint32_t, {
	vsto_make_submap = 1,   /* make submap from entries in current_map()
	                         * at start..end, offset ignored */
	vsto_remap_submap = 2,  /* map in current_map() at start..end,
	                         * from parent address submap_base_address
	                         * and submap address offset */
	vsto_end
});

static int
sysctl_vm_submap_test_ctl SYSCTL_HANDLER_ARGS
{
	int error;
	struct {
		vm_submap_test_op op;
		mach_vm_address_t submap_base_address;
		mach_vm_address_t start;
		mach_vm_address_t end;
		mach_vm_address_t offset;
	} args;
	if (req->newlen != sizeof(args)) {
		return EINVAL;
	}
	error = SYSCTL_IN(req, &args, sizeof(args));
	if (error) {
		return error;
	}

	switch (args.op) {
	case vsto_make_submap:
		vm_map_testing_make_sealed_submap(current_map(), args.start, args.end);
		break;
	case vsto_remap_submap:
		vm_map_testing_remap_submap(current_map(),
		    args.submap_base_address, args.start, args.end, args.offset);
		break;
	default:
		return EINVAL;
	}

	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, submap_test_ctl, CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, &sysctl_vm_submap_test_ctl, "-", "");
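
/*
 * Usage sketch (illustrative, not part of the build): the handler above takes
 * its arguments as one binary struct whose layout must match the anonymous
 * struct in sysctl_vm_submap_test_ctl exactly, since req->newlen is compared
 * against sizeof(args):
 *
 *	struct {
 *		uint32_t op;                            // vm_submap_test_op
 *		mach_vm_address_t submap_base_address;
 *		mach_vm_address_t start;
 *		mach_vm_address_t end;
 *		mach_vm_address_t offset;
 *	} args = { .op = 1, .start = base, .end = base + size };  // op 1 == vsto_make_submap
 *	sysctlbyname("vm.submap_test_ctl", NULL, NULL, &args, sizeof(args));
 */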

#if __arm64__
/* These are meant to support the page table accounting unit test. */
extern unsigned int arm_hardware_page_size;
extern unsigned int arm_pt_desc_size;
extern unsigned int arm_pt_root_size;
extern unsigned int inuse_user_tteroot_count;
extern unsigned int inuse_kernel_tteroot_count;
extern unsigned int inuse_user_ttepages_count;
extern unsigned int inuse_kernel_ttepages_count;
extern unsigned int inuse_user_ptepages_count;
extern unsigned int inuse_kernel_ptepages_count;
SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
#if !CONFIG_SPTM
extern unsigned int free_page_size_tt_count;
extern unsigned int free_tt_count;
SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
#endif
#if DEVELOPMENT || DEBUG
extern unsigned long pmap_asid_flushes;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
extern unsigned long pmap_asid_hits;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
extern unsigned long pmap_asid_misses;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
extern unsigned long pmap_speculation_restrictions;
SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
#endif
#endif /* __arm64__ */
#endif /* DEVELOPMENT || DEBUG */

SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
#if VM_SCAN_FOR_SHADOW_CHAIN
static int vm_shadow_max_enabled = 0;   /* Disabled by default */
extern int proc_shadow_max(void);
static int
vm_shadow_max SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	int value = 0;

	if (vm_shadow_max_enabled) {
		value = proc_shadow_max();
	}

	return SYSCTL_OUT(req, &value, sizeof(value));
}
SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_shadow_max, "I", "");

SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");
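
/*
 * Usage sketch (illustrative, not part of the build): the shadow-chain scan
 * is expensive, so it stays disabled until a test explicitly turns it on,
 * reads the maximum, and turns it back off:
 *
 *	int one = 1, zero = 0, max = 0;
 *	size_t len = sizeof(max);
 *	sysctlbyname("vm.vm_shadow_max_enabled", NULL, NULL, &one, sizeof(one));
 *	sysctlbyname("vm.vm_shadow_max", &max, &len, NULL, 0);
 *	sysctlbyname("vm.vm_shadow_max_enabled", NULL, NULL, &zero, sizeof(zero));
 */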

#endif /* VM_SCAN_FOR_SHADOW_CHAIN */

SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");

#if PAGE_SLEEP_WITH_INHERITOR
#if DEVELOPMENT || DEBUG
extern uint32_t page_worker_table_size;
SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
#endif /* DEVELOPMENT || DEBUG */
#endif /* PAGE_SLEEP_WITH_INHERITOR */

#if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
extern uint32_t vm_cheads;
extern vm_chead_select_t vm_chead_select;
extern boolean_t vm_chead_rehint;
#if DEVELOPMENT || DEBUG
SYSCTL_UINT(_vm, OID_AUTO, compressor_heads, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cheads, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, compressor_head_select, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_select, 0, "");
SYSCTL_INT(_vm, OID_AUTO, compressor_head_rehint, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_rehint, 0, "");
#endif /* DEVELOPMENT || DEBUG */
EXPERIMENT_FACTOR_UINT(compressor_heads, &vm_cheads, 1, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT, "");
EXPERIMENT_FACTOR_UINT(compressor_head_select, &vm_chead_select, CSEL_MIN, CSEL_MAX, "");
EXPERIMENT_FACTOR_INT(compressor_head_rehint, &vm_chead_rehint, 0, 1, "");
#endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */

/*
 * Sysctl's related to data/stack execution. See osfmk/vm/vm_map.c
 */

#if DEVELOPMENT || DEBUG
extern int allow_stack_exec, allow_data_exec;

SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");

#endif /* DEVELOPMENT || DEBUG */

static const char *prot_values[] = {
	"none",
	"read-only",
	"write-only",
	"read-write",
	"execute-only",
	"read-execute",
	"write-execute",
	"read-write-execute"
};

void
log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
{
	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
}

/*
 * shared_region_unnest_logging: level of logging of unnesting events
 * 0 - no logging
 * 1 - throttled logging of unexpected unnesting events (default)
 * 2 - unthrottled logging of unexpected unnesting events
 * 3+ - unthrottled logging of all unnesting events
 */
int shared_region_unnest_logging = 1;

SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_unnest_logging, 0, "");
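
/*
 * Usage sketch (illustrative, not part of the build): when chasing unexpected
 * unnesting, a root process can raise the level to 2 for unthrottled logging
 * of unexpected events (or 3+ to log every unnesting event):
 *
 *	int level = 2;
 *	sysctlbyname("vm.shared_region_unnest_logging", NULL, NULL,
 *	    &level, sizeof(level));
 */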

int vm_shared_region_unnest_log_interval = 10;
int shared_region_unnest_log_count_threshold = 5;


#if XNU_TARGET_OS_OSX

#if defined (__x86_64__)
static int scdir_enforce = 1;
#else /* defined (__x86_64__) */
static int scdir_enforce = 0;   /* AOT caches live elsewhere */
#endif /* defined (__x86_64__) */

static char *scdir_path[] = {
	"/System/Library/dyld/",
	"/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
	"/System/Cryptexes/OS/System/Library/dyld",
	NULL
};

#else /* XNU_TARGET_OS_OSX */

static int scdir_enforce = 0;
static char *scdir_path[] = {
	"/System/Library/Caches/com.apple.dyld/",
	"/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	"/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
	NULL
};

#endif /* XNU_TARGET_OS_OSX */

static char *driverkit_scdir_path[] = {
	"/System/DriverKit/System/Library/dyld/",
#if XNU_TARGET_OS_OSX
	"/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#else
	"/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
#endif /* XNU_TARGET_OS_OSX */
	"/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
	NULL
};

#ifndef SECURE_KERNEL
static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
{
#if CONFIG_CSR
	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
		return EPERM;
	}
#endif /* CONFIG_CSR */
	return sysctl_handle_int(oidp, arg1, arg2, req);
}

SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
#endif

/* These log rate throttling state variables aren't thread safe, but
 * are sufficient unto the task.
 */
static int64_t last_unnest_log_time = 0;
static int shared_region_unnest_log_count = 0;

void
log_unnest_badness(
	vm_map_t m,
	vm_map_offset_t s,
	vm_map_offset_t e,
	boolean_t is_nested_map,
	vm_map_offset_t lowest_unnestable_addr)
{
	struct timeval tv;

	if (shared_region_unnest_logging == 0) {
		return;
	}

	if (shared_region_unnest_logging <= 2 &&
	    is_nested_map &&
	    s >= lowest_unnestable_addr) {
		/*
		 * Unnesting of writable map entries is fine.
		 */
		return;
	}

	if (shared_region_unnest_logging <= 1) {
		microtime(&tv);
		if ((tv.tv_sec - last_unnest_log_time) <
		    vm_shared_region_unnest_log_interval) {
			if (shared_region_unnest_log_count++ >
			    shared_region_unnest_log_count_threshold) {
				return;
			}
		} else {
			last_unnest_log_time = tv.tv_sec;
			shared_region_unnest_log_count = 0;
		}
	}

	DTRACE_VM4(log_unnest_badness,
	    vm_map_t, m,
	    vm_map_offset_t, s,
	    vm_map_offset_t, e,
	    vm_map_offset_t, lowest_unnestable_addr);
	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
}

uint64_t
vm_purge_filebacked_pagers(void)
{
	uint64_t pages_purged;

	pages_purged = 0;
	pages_purged += apple_protect_pager_purge_all();
	pages_purged += shared_region_pager_purge_all();
	pages_purged += dyld_pager_purge_all();
#if DEVELOPMENT || DEBUG
	printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
#endif /* DEVELOPMENT || DEBUG */
	return pages_purged;
}

int
useracc(
	user_addr_ut addr_u,
	user_size_ut len_u,
	int prot)
{
	vm_map_t map;
	vm_prot_t vm_prot = VM_PROT_WRITE;

	map = current_map();

	if (prot == B_READ) {
		vm_prot = VM_PROT_READ;
	}

	return vm_map_check_protection(map, addr_u,
	           vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
	           VM_SANITIZE_CALLER_USERACC);
}

#if XNU_PLATFORM_MacOSX
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vslock_sanitize(
	vm_map_t map,
	user_addr_ut addr_u,
	user_size_ut len_u,
	vm_sanitize_caller_t vm_sanitize_caller,
	vm_map_offset_t *start,
	vm_map_offset_t *end,
	vm_map_size_t *size)
{
	return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
	           map,
	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
	           size);
}
#endif /* XNU_PLATFORM_MacOSX */

int
vslock(user_addr_ut addr, user_size_ut len)
{
	kern_return_t kret;

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows due to binary
	 * compatibility, i.e. return success for overflows without doing
	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
	 * for overflow errors, which gets converted to KERN_SUCCESS by
	 * vm_sanitize_get_kr.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t size;

	kret = vslock_sanitize(current_map(),
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_wire_kernel(current_map(), addr,
	    vm_sanitize_compute_ut_end(addr, len),
	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
	    VM_KERN_MEMORY_BSD,
	    FALSE);

	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}

int
vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
{
#if FIXME  /* [ */
	pmap_t pmap;
	vm_page_t pg;
	vm_map_offset_t vaddr;
	ppnum_t paddr;
#endif  /* FIXME ] */
	kern_return_t kret;
	vm_map_t map;

	map = current_map();

#if FIXME  /* [ */
	if (dirtied) {
		pmap = get_task_pmap(current_task());
		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
		    vaddr += PAGE_SIZE) {
			paddr = pmap_find_phys(pmap, vaddr);
			pg = PHYS_TO_VM_PAGE(paddr);
			vm_page_set_modified(pg);
		}
	}
#endif  /* FIXME ] */
#ifdef lint
	dirtied++;
#endif /* lint */

#if XNU_PLATFORM_MacOSX
	/*
	 * Preserve previous behavior on macOS for overflows due to binary
	 * compatibility, i.e. return success for overflows without doing
	 * anything. Error compatibility returns VM_ERR_RETURN_NOW (on macOS)
	 * for overflow errors, which gets converted to KERN_SUCCESS by
	 * vm_sanitize_get_kr.
	 */
	vm_map_offset_t start, end;
	vm_map_size_t size;

	kret = vslock_sanitize(map,
	    addr,
	    len,
	    VM_SANITIZE_CALLER_VSUNLOCK,
	    &start,
	    &end,
	    &size);
	if (__improbable(kret != KERN_SUCCESS)) {
		switch (vm_sanitize_get_kr(kret)) {
		case KERN_SUCCESS:
			return 0;
		case KERN_INVALID_ADDRESS:
		case KERN_NO_SPACE:
			return ENOMEM;
		case KERN_PROTECTION_FAILURE:
			return EACCES;
		default:
			return EINVAL;
		}
	}
#endif /* XNU_PLATFORM_MacOSX */

	kret = vm_map_unwire(map, addr,
	    vm_sanitize_compute_ut_end(addr, len), false);
	switch (kret) {
	case KERN_SUCCESS:
		return 0;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return ENOMEM;
	case KERN_PROTECTION_FAILURE:
		return EACCES;
	default:
		return EINVAL;
	}
}
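
/*
 * In-kernel usage sketch (hypothetical caller, not part of the build):
 * vslock()/vsunlock() bracket direct access to user memory in the style of
 * the old BSD physio paths, and both return errnos rather than Mach
 * kern_return_t values:
 *
 *	int error;
 *
 *	if ((error = vslock(uaddr, ulen)) != 0) {
 *		return error;                // ENOMEM, EACCES or EINVAL
 *	}
 *	// ... safely touch the wired user pages ...
 *	error = vsunlock(uaddr, ulen, 1);    // dirtied = 1: pages were written
 */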

int
subyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
suibyte(
	user_addr_t addr,
	int byte)
{
	char character;

	character = (char)byte;
	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
}

int
fubyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &byte, sizeof(char))) {
		return -1;
	}
	return byte;
}

int
fuibyte(user_addr_t addr)
{
	unsigned char byte;

	if (copyin(addr, (void *) &(byte), sizeof(char))) {
		return -1;
	}
	return byte;
}

int
suword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/* suiword and fuiword are the same as suword and fuword, respectively */

int
suiword(
	user_addr_t addr,
	long word)
{
	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
}

long
fuiword(user_addr_t addr)
{
	long word = 0;

	if (copyin(addr, (void *) &word, sizeof(int))) {
		return -1;
	}
	return word;
}

/*
 * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
 * fetching and setting of process-sized size_t and pointer values.
 */
int
sulong(user_addr_t addr, int64_t word)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (long)word);
	}
}

int64_t
fulong(user_addr_t addr)
{
	int64_t longword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
			return -1;
		}
		return longword;
	} else {
		return (int64_t)fuiword(addr);
	}
}

int
suulong(user_addr_t addr, uint64_t uword)
{
	if (IS_64BIT_PROCESS(current_proc())) {
		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
	} else {
		return suiword(addr, (uint32_t)uword);
	}
}

uint64_t
fuulong(user_addr_t addr)
{
	uint64_t ulongword;

	if (IS_64BIT_PROCESS(current_proc())) {
		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
			return -1ULL;
		}
		return ulongword;
	} else {
		return (uint64_t)fuiword(addr);
	}
}
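
/*
 * Usage sketch (hypothetical caller, not part of the build): code that must
 * store a pointer-sized value into the current process without knowing its
 * bitness lets sulong()/fulong() pick the copy width; note that the -1
 * returned on fault is ambiguous with a legitimately stored -1, as with
 * fuword():
 *
 *	if (sulong(uaddr, value) != 0) {
 *		return EFAULT;
 *	}
 *	int64_t back = fulong(uaddr);
 */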

int
swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
{
	return ENOTSUP;
}

#if defined(SECURE_KERNEL)
static int kern_secure_kernel = 1;
#else
static int kern_secure_kernel = 0;
#endif

SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_trace_level, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
    &shared_region_version, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_persistence, 0, "");

/*
 * shared_region_check_np:
 *
 * This system call is intended for dyld.
 *
 * dyld calls this when any process starts to see if the process's shared
 * region is already set up and ready to use.
 * This call returns the base address of the first mapping in the
 * process's shared region.
 * dyld will then check what's mapped at that address.
 *
 * If the shared region is empty, dyld will then attempt to map the shared
 * cache file in the shared region via the shared_region_map_and_slide_2_np()
 * system call.
 *
 * If something's already mapped in the shared region, dyld will check if it
 * matches the shared cache it would like to use for that process.
 * If it matches, everything's ready and the process can proceed and use the
 * shared region.
 * If it doesn't match, dyld will unmap the shared region and map the shared
 * cache into the process's address space via mmap().
 *
 * A NULL pointer argument can be used by dyld to indicate it has unmapped
 * the shared region. We will remove the shared_region reference from the task.
 *
 * ERROR VALUES
 * EINVAL	no shared region
 * ENOMEM	shared region is empty
 * EFAULT	bad address for "start_address"
 */
int
shared_region_check_np(
	__unused struct proc *p,
	struct shared_region_check_np_args *uap,
	__unused int *retvalp)
{
	vm_shared_region_t shared_region;
	mach_vm_offset_t start_address = 0;
	int error = 0;
	kern_return_t kr = KERN_FAILURE;
	task_t task = current_task();

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address));

	/*
	 * Special value of start_address used to indicate that map_with_linking() should
	 * no longer be allowed in this process
	 */
	if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
		p->p_disallow_map_with_linking = TRUE;
		return 0;
	}

	/* retrieve the current task's shared region */
	shared_region = vm_shared_region_get(task);
	if (shared_region != NULL) {
		/*
		 * A NULL argument is used by dyld to indicate the task
		 * has unmapped its shared region.
		 */
		if (uap->start_address == 0) {
			/* unmap it first */
			vm_shared_region_remove(task, shared_region);
			vm_shared_region_set(task, NULL);
		} else {
			/* retrieve address of its first mapping... */
			kr = vm_shared_region_start_address(shared_region, &start_address);
			if (kr != KERN_SUCCESS) {
				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
				    "check_np(0x%llx) "
				    "vm_shared_region_start_address() returned 0x%x\n",
				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
				    proc_getpid(p), p->p_comm,
				    (uint64_t)uap->start_address, kr));
				error = ENOMEM;
			}
			if (error == 0) {
				/* Insert the shared region submap and various bits of debug info into the task. */
				kr = vm_shared_region_update_task(task, shared_region, start_address);
				if (kr != KERN_SUCCESS) {
					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
					    "check_np(0x%llx) "
					    "vm_shared_region_update_task() returned 0x%x\n",
					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
					    proc_getpid(p), p->p_comm,
					    (uint64_t)uap->start_address, kr));

					error = ENOMEM;
				}
			}
#if __has_feature(ptrauth_calls)
			/*
			 * Remap any section of the shared library that
			 * has authenticated pointers into private memory.
			 */
			if ((error == 0) && (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS)) {
				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
				    "check_np(0x%llx) "
				    "vm_shared_region_auth_remap() failed\n",
				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
				    proc_getpid(p), p->p_comm,
				    (uint64_t)uap->start_address));
				error = ENOMEM;
			}
#endif /* __has_feature(ptrauth_calls) */
			/* Give the start address to the caller */
			if (error == 0) {
				error = copyout(&start_address,
				    (user_addr_t) uap->start_address,
				    sizeof(start_address));
				if (error != 0) {
					SHARED_REGION_TRACE_ERROR(
						("shared_region: %p [%d(%s)] "
						"check_np(0x%llx) "
						"copyout(0x%llx) error %d\n",
						(void *)VM_KERNEL_ADDRPERM(current_thread()),
						proc_getpid(p), p->p_comm,
						(uint64_t)uap->start_address, (uint64_t)start_address,
						error));
				}
			}
		}
		vm_shared_region_deallocate(shared_region);
	} else {
		/* no shared region ! */
		error = EINVAL;
	}

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
		(void *)VM_KERNEL_ADDRPERM(current_thread()),
		proc_getpid(p), p->p_comm,
		(uint64_t)uap->start_address, (uint64_t)start_address, error));

	return error;
}
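
/*
 * Caller-side sketch (hypothetical, not part of the build): dyld passes the
 * address of a buffer that receives the region's first mapping address, so
 * its flow is roughly:
 *
 *	uint64_t start = 0;
 *	if (shared_region_check_np(&start) == 0) {
 *		// inspect what is mapped at "start"
 *	} else {
 *		// EINVAL/ENOMEM: map the shared cache via
 *		// shared_region_map_and_slide_2_np()
 *	}
 */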
1112
1113
1114 static int
shared_region_copyin(struct proc * p,user_addr_t user_addr,unsigned int count,unsigned int element_size,void * kernel_data)1115 shared_region_copyin(
1116 struct proc *p,
1117 user_addr_t user_addr,
1118 unsigned int count,
1119 unsigned int element_size,
1120 void *kernel_data)
1121 {
1122 int error = 0;
1123 vm_size_t size = count * element_size;
1124
1125 error = copyin(user_addr, kernel_data, size);
1126 if (error) {
1127 SHARED_REGION_TRACE_ERROR(
1128 ("shared_region: %p [%d(%s)] map(): "
1129 "copyin(0x%llx, %ld) failed (error=%d)\n",
1130 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1131 proc_getpid(p), p->p_comm,
1132 (uint64_t)user_addr, (long)size, error));
1133 }
1134 return error;
1135 }
1136
1137 /*
1138 * A reasonable upper limit to prevent overflow of allocation/copyin.
1139 */
1140 #define _SR_FILE_MAPPINGS_MAX_FILES 256
1141
1142 /* forward declaration */
1143 __attribute__((noinline))
1144 static void shared_region_map_and_slide_cleanup(
1145 struct proc *p,
1146 uint32_t files_count,
1147 struct _sr_file_mappings *sr_file_mappings,
1148 struct vm_shared_region *shared_region);
1149
1150 /*
1151 * Setup part of _shared_region_map_and_slide().
1152 * It had to be broken out of _shared_region_map_and_slide() to
1153 * prevent compiler inlining from blowing out the stack.
1154 */
1155 __attribute__((noinline))
1156 static int
shared_region_map_and_slide_setup(struct proc * p,uint32_t files_count,struct shared_file_np * files,uint32_t mappings_count,struct shared_file_mapping_slide_np * mappings,struct _sr_file_mappings ** sr_file_mappings,struct vm_shared_region ** shared_region_ptr,struct vnode * rdir_vp)1157 shared_region_map_and_slide_setup(
1158 struct proc *p,
1159 uint32_t files_count,
1160 struct shared_file_np *files,
1161 uint32_t mappings_count,
1162 struct shared_file_mapping_slide_np *mappings,
1163 struct _sr_file_mappings **sr_file_mappings,
1164 struct vm_shared_region **shared_region_ptr,
1165 struct vnode *rdir_vp)
1166 {
1167 int error = 0;
1168 struct _sr_file_mappings *srfmp;
1169 uint32_t mappings_next;
1170 struct vnode_attr va;
1171 off_t fs;
1172 #if CONFIG_MACF
1173 vm_prot_t maxprot = VM_PROT_ALL;
1174 #endif
1175 uint32_t i;
1176 struct vm_shared_region *shared_region = NULL;
1177 boolean_t is_driverkit = task_is_driver(current_task());
1178
1179 SHARED_REGION_TRACE_DEBUG(
1180 ("shared_region: %p [%d(%s)] -> map_and_slide_setup\n",
1181 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1182 proc_getpid(p), p->p_comm));
1183
1184 if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
1185 error = E2BIG;
1186 goto done;
1187 }
1188 if (files_count == 0) {
1189 error = EINVAL;
1190 goto done;
1191 }
1192 *sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
1193 Z_WAITOK | Z_ZERO);
1194 if (*sr_file_mappings == NULL) {
1195 error = ENOMEM;
1196 goto done;
1197 }
1198 mappings_next = 0;
1199 for (i = 0; i < files_count; i++) {
1200 srfmp = &(*sr_file_mappings)[i];
1201 srfmp->fd = files[i].sf_fd;
1202 srfmp->mappings_count = files[i].sf_mappings_count;
1203 srfmp->mappings = &mappings[mappings_next];
1204 mappings_next += srfmp->mappings_count;
1205 if (mappings_next > mappings_count) {
1206 error = EINVAL;
1207 goto done;
1208 }
1209 srfmp->slide = files[i].sf_slide;
1210 }
1211
1212 /* get the process's shared region (setup in vm_map_exec()) */
1213 shared_region = vm_shared_region_get(current_task());
1214 *shared_region_ptr = shared_region;
1215 if (shared_region == NULL) {
1216 SHARED_REGION_TRACE_ERROR(
1217 ("shared_region: %p [%d(%s)] map(): "
1218 "no shared region\n",
1219 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1220 proc_getpid(p), p->p_comm));
1221 error = EINVAL;
1222 goto done;
1223 }
1224
1225 /*
1226 * Check the shared region matches the current root
1227 * directory of this process. Deny the mapping to
1228 * avoid tainting the shared region with something that
1229 * doesn't quite belong into it.
1230 */
1231 struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
1232 if (sr_vnode != NULL ? rdir_vp != sr_vnode : rdir_vp != rootvnode) {
1233 SHARED_REGION_TRACE_ERROR(
1234 ("shared_region: map(%p) root_dir mismatch\n",
1235 (void *)VM_KERNEL_ADDRPERM(current_thread())));
1236 error = EPERM;
1237 goto done;
1238 }
1239
1240
1241 for (srfmp = &(*sr_file_mappings)[0];
1242 srfmp < &(*sr_file_mappings)[files_count];
1243 srfmp++) {
1244 if (srfmp->mappings_count == 0) {
1245 /* no mappings here... */
1246 continue;
1247 }
1248
1249 /*
1250 * A file descriptor of -1 is used to indicate that the data
1251 * to be put in the shared region for this mapping comes directly
1252 * from the processes address space. Ensure we have proper alignments.
1253 */
1254 if (srfmp->fd == -1) {
1255 /* only allow one mapping per fd */
1256 if (srfmp->mappings_count > 1) {
1257 SHARED_REGION_TRACE_ERROR(
1258 ("shared_region: %p [%d(%s)] map data >1 mapping\n",
1259 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1260 proc_getpid(p), p->p_comm));
1261 error = EINVAL;
1262 goto done;
1263 }
1264
1265 /*
1266 * The destination address and size must be page aligned.
1267 */
1268 struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
1269 mach_vm_address_t dest_addr = mapping->sms_address;
1270 mach_vm_size_t map_size = mapping->sms_size;
1271 if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
1272 SHARED_REGION_TRACE_ERROR(
1273 ("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
1274 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1275 proc_getpid(p), p->p_comm, dest_addr));
1276 error = EINVAL;
1277 goto done;
1278 }
1279 if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
1280 SHARED_REGION_TRACE_ERROR(
1281 ("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
1282 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1283 proc_getpid(p), p->p_comm, map_size));
1284 error = EINVAL;
1285 goto done;
1286 }
1287 continue;
1288 }
1289
1290 /* get file structure from file descriptor */
1291 error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
1292 if (error) {
1293 SHARED_REGION_TRACE_ERROR(
1294 ("shared_region: %p [%d(%s)] map: "
1295 "fd=%d lookup failed (error=%d)\n",
1296 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1297 proc_getpid(p), p->p_comm, srfmp->fd, error));
1298 goto done;
1299 }
1300
1301 /* we need at least read permission on the file */
1302 if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
1303 SHARED_REGION_TRACE_ERROR(
1304 ("shared_region: %p [%d(%s)] map: "
1305 "fd=%d not readable\n",
1306 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1307 proc_getpid(p), p->p_comm, srfmp->fd));
1308 error = EPERM;
1309 goto done;
1310 }
1311
1312 /* get vnode from file structure */
1313 error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
1314 if (error) {
1315 SHARED_REGION_TRACE_ERROR(
1316 ("shared_region: %p [%d(%s)] map: "
1317 "fd=%d getwithref failed (error=%d)\n",
1318 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1319 proc_getpid(p), p->p_comm, srfmp->fd, error));
1320 goto done;
1321 }
1322 srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);
1323
1324 /* make sure the vnode is a regular file */
1325 if (srfmp->vp->v_type != VREG) {
1326 SHARED_REGION_TRACE_ERROR(
1327 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1328 "not a file (type=%d)\n",
1329 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1330 proc_getpid(p), p->p_comm,
1331 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1332 srfmp->vp->v_name, srfmp->vp->v_type));
1333 error = EINVAL;
1334 goto done;
1335 }
1336
1337 #if CONFIG_MACF
1338 /* pass in 0 for the offset argument because AMFI does not need the offset
1339 * of the shared cache */
1340 error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
1341 srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
1342 if (error) {
1343 goto done;
1344 }
1345 #endif /* MAC */
1346
1347 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1348 /*
1349 * Check if the shared cache is in the trust cache;
1350 * if so, we can skip the root ownership check.
1351 */
1352 #if DEVELOPMENT || DEBUG
1353 /*
1354 * Skip both root ownership and trust cache check if
1355 * enforcement is disabled.
1356 */
1357 if (!cs_system_enforcement()) {
1358 goto after_root_check;
1359 }
1360 #endif /* DEVELOPMENT || DEBUG */
1361 struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
1362 if (blob == NULL) {
1363 SHARED_REGION_TRACE_ERROR(
1364 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1365 "missing CS blob\n",
1366 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1367 proc_getpid(p), p->p_comm,
1368 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1369 srfmp->vp->v_name));
1370 goto root_check;
1371 }
1372 const uint8_t *cdhash = csblob_get_cdhash(blob);
1373 if (cdhash == NULL) {
1374 SHARED_REGION_TRACE_ERROR(
1375 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1376 "missing cdhash\n",
1377 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1378 proc_getpid(p), p->p_comm,
1379 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1380 srfmp->vp->v_name));
1381 goto root_check;
1382 }
1383
1384 bool in_trust_cache = false;
1385 TrustCacheQueryToken_t qt;
1386 if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
1387 TCType_t tc_type = kTCTypeInvalid;
1388 TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
1389 in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
1390 (tc_type == kTCTypeCryptex1BootOS ||
1391 tc_type == kTCTypeStatic ||
1392 tc_type == kTCTypeEngineering));
1393 }
1394 if (!in_trust_cache) {
1395 SHARED_REGION_TRACE_ERROR(
1396 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1397 "not in trust cache\n",
1398 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1399 proc_getpid(p), p->p_comm,
1400 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1401 srfmp->vp->v_name));
1402 goto root_check;
1403 }
1404 goto after_root_check;
1405 root_check:
1406 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1407
1408 /* The shared cache file must be owned by root */
1409 VATTR_INIT(&va);
1410 VATTR_WANTED(&va, va_uid);
1411 error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1412 if (error) {
1413 SHARED_REGION_TRACE_ERROR(
1414 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1415 "vnode_getattr(%p) failed (error=%d)\n",
1416 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1417 proc_getpid(p), p->p_comm,
1418 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1419 srfmp->vp->v_name,
1420 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1421 error));
1422 goto done;
1423 }
1424 if (va.va_uid != 0) {
1425 SHARED_REGION_TRACE_ERROR(
1426 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1427 "owned by uid=%d instead of 0\n",
1428 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1429 proc_getpid(p), p->p_comm,
1430 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1431 srfmp->vp->v_name, va.va_uid));
1432 error = EPERM;
1433 goto done;
1434 }
1435
1436 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1437 after_root_check:
1438 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1439
1440 #if CONFIG_CSR
1441 if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
1442 VATTR_INIT(&va);
1443 VATTR_WANTED(&va, va_flags);
1444 error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1445 if (error) {
1446 SHARED_REGION_TRACE_ERROR(
1447 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1448 "vnode_getattr(%p) failed (error=%d)\n",
1449 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1450 proc_getpid(p), p->p_comm,
1451 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1452 srfmp->vp->v_name,
1453 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1454 error));
1455 goto done;
1456 }
1457
1458 if (!(va.va_flags & SF_RESTRICTED)) {
1459 /*
1460 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
1461 * the shared cache file is NOT SIP-protected, so reject the
1462 * mapping request
1463 */
1464 SHARED_REGION_TRACE_ERROR(
1465 ("shared_region: %p [%d(%s)] map(%p:'%s'), "
1466 "vnode is not SIP-protected. \n",
1467 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1468 proc_getpid(p), p->p_comm,
1469 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1470 srfmp->vp->v_name));
1471 error = EPERM;
1472 goto done;
1473 }
1474 }
1475 #else /* CONFIG_CSR */
1476
1477 /*
1478 * Devices without SIP/ROSP need to make sure that the shared cache
1479 * is either on the root volume or in the preboot cryptex volume.
1480 */
1481 assert(rdir_vp != NULL);
1482 if (srfmp->vp->v_mount != rdir_vp->v_mount) {
1483 vnode_t preboot_vp = NULL;
1484 #if XNU_TARGET_OS_OSX
1485 #define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
1486 #else
1487 #define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
1488 #endif
1489 error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
1490 if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
1491 SHARED_REGION_TRACE_ERROR(
1492 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1493 "not on process' root volume nor preboot volume\n",
1494 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1495 proc_getpid(p), p->p_comm,
1496 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1497 srfmp->vp->v_name));
1498 error = EPERM;
1499 if (preboot_vp) {
1500 (void)vnode_put(preboot_vp);
1501 }
1502 goto done;
1503 } else if (preboot_vp) {
1504 (void)vnode_put(preboot_vp);
1505 }
1506 }
1507 #endif /* CONFIG_CSR */
1508
1509 if (scdir_enforce) {
1510 char **expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1511 struct vnode *scdir_vp = NULL;
1512 for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1513 *expected_scdir_path != NULL;
1514 expected_scdir_path++) {
1515 /* get vnode for expected_scdir_path */
1516 error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
1517 if (error) {
1518 SHARED_REGION_TRACE_ERROR(
1519 ("shared_region: %p [%d(%s)]: "
1520 "vnode_lookup(%s) failed (error=%d)\n",
1521 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1522 proc_getpid(p), p->p_comm,
1523 *expected_scdir_path, error));
1524 continue;
1525 }
1526
1527 /* check if parent is scdir_vp */
1528 assert(scdir_vp != NULL);
1529 if (vnode_parent(srfmp->vp) == scdir_vp) {
1530 (void)vnode_put(scdir_vp);
1531 scdir_vp = NULL;
1532 goto scdir_ok;
1533 }
1534 (void)vnode_put(scdir_vp);
1535 scdir_vp = NULL;
1536 }
1537 /* nothing matches */
1538 SHARED_REGION_TRACE_ERROR(
1539 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1540 "shared cache file not in expected directory\n",
1541 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1542 proc_getpid(p), p->p_comm,
1543 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1544 srfmp->vp->v_name));
1545 error = EPERM;
1546 goto done;
1547 }
1548 scdir_ok:
1549
1550 /* get vnode size */
1551 error = vnode_size(srfmp->vp, &fs, vfs_context_current());
1552 if (error) {
1553 SHARED_REGION_TRACE_ERROR(
1554 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1555 "vnode_size(%p) failed (error=%d)\n",
1556 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1557 proc_getpid(p), p->p_comm,
1558 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1559 srfmp->vp->v_name,
1560 (void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
1561 goto done;
1562 }
1563 srfmp->file_size = fs;
1564
1565 /* get the file's memory object handle */
1566 srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
1567 if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
1568 SHARED_REGION_TRACE_ERROR(
1569 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1570 "no memory object\n",
1571 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1572 proc_getpid(p), p->p_comm,
1573 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1574 srfmp->vp->v_name));
1575 error = EINVAL;
1576 goto done;
1577 }
1578
1579 /* check that the mappings are properly covered by code signatures */
1580 if (!cs_system_enforcement()) {
1581 /* code signing is not enforced: no need to check */
1582 } else {
1583 for (i = 0; i < srfmp->mappings_count; i++) {
1584 if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
1585 /* zero-filled mapping: not backed by the file */
1586 continue;
1587 }
1588 if (ubc_cs_is_range_codesigned(srfmp->vp,
1589 srfmp->mappings[i].sms_file_offset,
1590 srfmp->mappings[i].sms_size)) {
1591 /* this mapping is fully covered by code signatures */
1592 continue;
1593 }
1594 SHARED_REGION_TRACE_ERROR(
1595 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1596 "mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
1597 "is not code-signed\n",
1598 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1599 proc_getpid(p), p->p_comm,
1600 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1601 srfmp->vp->v_name,
1602 i, srfmp->mappings_count,
1603 srfmp->mappings[i].sms_address,
1604 srfmp->mappings[i].sms_size,
1605 srfmp->mappings[i].sms_file_offset,
1606 srfmp->mappings[i].sms_max_prot,
1607 srfmp->mappings[i].sms_init_prot));
1608 error = EINVAL;
1609 goto done;
1610 }
1611 }
1612 }
1613 done:
1614 if (error != 0) {
1615 shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
1616 *sr_file_mappings = NULL;
1617 *shared_region_ptr = NULL;
1618 }
1619 SHARED_REGION_TRACE_DEBUG(
1620 ("shared_region: %p [%d(%s)] map_and_slide_setup <- %d\n",
1621 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1622 proc_getpid(p), p->p_comm, error));
1623 return error;
1624 }
1625
1626 /*
1627 * shared_region_map_np()
1628 *
1629 * This system call is intended for dyld.
1630 *
1631 * dyld uses this to map a shared cache file into a shared region.
1632 * This is usually done only the first time a shared cache is needed.
1633 * Subsequent processes will just use the populated shared region without
1634 * requiring any further setup.
1635 */
1636 static int
_shared_region_map_and_slide(struct proc * p,uint32_t files_count,struct shared_file_np * files,uint32_t mappings_count,struct shared_file_mapping_slide_np * mappings)1637 _shared_region_map_and_slide(
1638 struct proc *p,
1639 uint32_t files_count,
1640 struct shared_file_np *files,
1641 uint32_t mappings_count,
1642 struct shared_file_mapping_slide_np *mappings)
1643 {
1644 int error = 0;
1645 kern_return_t kr = KERN_SUCCESS;
1646 struct _sr_file_mappings *sr_file_mappings = NULL;
1647 struct vnode *rdir_vp = NULL;
1648 struct vm_shared_region *shared_region = NULL;
1649
1650 /*
1651 * Get a reference to the current proc's root dir.
1652 * Need this to prevent racing with chroot.
1653 */
1654 proc_fdlock(p);
1655 rdir_vp = p->p_fd.fd_rdir;
1656 if (rdir_vp == NULL) {
1657 rdir_vp = rootvnode;
1658 }
1659 assert(rdir_vp != NULL);
1660 vnode_get(rdir_vp);
1661 proc_fdunlock(p);
1662
1663 /*
1664 * Turn files, mappings into sr_file_mappings and other setup.
1665 */
1666 error = shared_region_map_and_slide_setup(p, files_count,
1667 files, mappings_count, mappings,
1668 &sr_file_mappings, &shared_region, rdir_vp);
1669 if (error != 0) {
1670 vnode_put(rdir_vp);
1671 return error;
1672 }
1673
1674 /* map the file(s) into that shared region's submap */
1675 kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
1676 if (kr != KERN_SUCCESS) {
1677 SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
1678 "vm_shared_region_map_file() failed kr=0x%x\n",
1679 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1680 proc_getpid(p), p->p_comm, kr));
1681 }
1682
1683 /* convert kern_return_t to errno */
1684 switch (kr) {
1685 case KERN_SUCCESS:
1686 error = 0;
1687 break;
1688 case KERN_INVALID_ADDRESS:
1689 error = EFAULT;
1690 break;
1691 case KERN_PROTECTION_FAILURE:
1692 error = EPERM;
1693 break;
1694 case KERN_NO_SPACE:
1695 error = ENOMEM;
1696 break;
1697 case KERN_FAILURE:
1698 case KERN_INVALID_ARGUMENT:
1699 default:
1700 error = EINVAL;
1701 break;
1702 }
1703
1704 /*
1705 * Mark that this process is now using split libraries.
1706 */
1707 if (error == 0 && (p->p_flag & P_NOSHLIB)) {
1708 OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
1709 }
1710
1711 vnode_put(rdir_vp);
1712 shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);
1713
1714 SHARED_REGION_TRACE_DEBUG(
1715 ("shared_region: %p [%d(%s)] <- map\n",
1716 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1717 proc_getpid(p), p->p_comm));
1718
1719 return error;
1720 }
1721
1722 /*
1723 * Clean up part of _shared_region_map_and_slide()
1724 * It had to be broken out of _shared_region_map_and_slide() to
1725 * prevent compiler inlining from blowing out the stack.
1726 */
1727 __attribute__((noinline))
1728 static void
1729 shared_region_map_and_slide_cleanup(
1730 struct proc *p,
1731 uint32_t files_count,
1732 struct _sr_file_mappings *sr_file_mappings,
1733 struct vm_shared_region *shared_region)
1734 {
1735 struct _sr_file_mappings *srfmp;
1736 struct vnode_attr va;
1737
1738 if (sr_file_mappings != NULL) {
1739 for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1740 if (srfmp->vp != NULL) {
1741 vnode_lock_spin(srfmp->vp);
1742 srfmp->vp->v_flag |= VSHARED_DYLD;
1743 vnode_unlock(srfmp->vp);
1744
1745 /* update the vnode's access time */
1746 if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1747 VATTR_INIT(&va);
1748 nanotime(&va.va_access_time);
1749 VATTR_SET_ACTIVE(&va, va_access_time);
1750 vnode_setattr(srfmp->vp, &va, vfs_context_current());
1751 }
1752
1753 #if NAMEDSTREAMS
1754 /*
1755 * If the shared cache is compressed, it may
1756 				 * have a namedstream vnode instantiated
1757 				 * for it. That namedstream vnode will also
1758 * have to be marked with VSHARED_DYLD.
1759 */
1760 if (vnode_hasnamedstreams(srfmp->vp)) {
1761 vnode_t svp;
1762 if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1763 NS_OPEN, 0, vfs_context_kernel()) == 0) {
1764 vnode_lock_spin(svp);
1765 svp->v_flag |= VSHARED_DYLD;
1766 vnode_unlock(svp);
1767 vnode_put(svp);
1768 }
1769 }
1770 #endif /* NAMEDSTREAMS */
1771 /*
1772 * release the vnode...
1773 * ubc_map() still holds it for us in the non-error case
1774 */
1775 (void) vnode_put(srfmp->vp);
1776 srfmp->vp = NULL;
1777 }
1778 if (srfmp->fp != NULL) {
1779 /* release the file descriptor */
1780 fp_drop(p, srfmp->fd, srfmp->fp, 0);
1781 srfmp->fp = NULL;
1782 }
1783 }
1784 kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1785 }
1786
1787 if (shared_region != NULL) {
1788 vm_shared_region_deallocate(shared_region);
1789 }
1790 }
1791
1792 /*
1793 * For each file mapped, we may have mappings for:
1794 * TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1795 * so let's round up to 8 mappings per file.
1796 */
1797 #define SFM_MAX (_SR_FILE_MAPPINGS_MAX_FILES * 8) /* max mapping structs allowed to pass in */
1798
1799 /*
1800 * This is the new interface for setting up shared region mappings.
1801 *
1802 * The slide used for shared regions setup using this interface is done differently
1803 * from the old interface. The slide value passed in the shared_files_np represents
1804 * a max value. The kernel will choose a random value based on that, then use it
1805 * for all shared regions.
1806 */
1807 #if defined (__x86_64__)
1808 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1809 #else
1810 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1811 #endif
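/*
 * Worked example (illustrative only): on a 16KB-page device,
 * SIXTEENK_PAGE_MASK is 0x3FFF, so SLIDE_AMOUNT_MASK keeps only
 * 16KB-aligned values. With max_slide = 0x10000000 and a random
 * draw of 0x1234ABCD, the code further below computes:
 *
 *	slide_amount = (0x1234ABCD % 0x10000000) & ~0x3FFF
 *	             = 0x0234ABCD & 0xFFFFC000
 *	             = 0x02348000
 *
 * i.e. a page-aligned slide strictly less than the caller's maximum.
 */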
1812
1813 static inline __result_use_check kern_return_t
1814 shared_region_map_and_slide_2_np_sanitize(
1815 struct proc *p,
1816 user_addr_t mappings_userspace_addr,
1817 unsigned int count,
1818 shared_file_mapping_slide_np_t *mappings)
1819 {
1820 kern_return_t kr;
1821 vm_map_t map = current_map();
1822 mach_vm_address_t addr, end;
1823 mach_vm_offset_t offset, offset_end;
1824 mach_vm_size_t size, offset_size;
1825 user_addr_t slide_start, slide_end, slide_size;
1826 vm_prot_t cur;
1827 vm_prot_t max;
1828
1829 user_addr_t user_addr = mappings_userspace_addr;
1830
1831 for (size_t i = 0; i < count; i++) {
1832 shared_file_mapping_slide_np_ut mapping_u;
1833 /*
1834 * First we bring each mapping struct into our kernel stack to
1835 * avoid TOCTOU.
1836 */
1837 kr = shared_region_copyin(
1838 p,
1839 user_addr,
1840 1, // copy 1 element at a time
1841 sizeof(shared_file_mapping_slide_np_ut),
1842 &mapping_u);
1843 if (__improbable(kr != KERN_SUCCESS)) {
1844 return kr;
1845 }
1846
1847 /*
1848 * Then, we sanitize the data on the kernel stack.
1849 */
1850 kr = vm_sanitize_addr_size(
1851 mapping_u.sms_address_u,
1852 mapping_u.sms_size_u,
1853 VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1854 map,
1855 (VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1856 | VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
1857 | VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
1858 &addr,
1859 &end,
1860 &size);
1861 if (__improbable(kr != KERN_SUCCESS)) {
1862 return kr;
1863 }
1864
1865 kr = vm_sanitize_addr_size(
1866 mapping_u.sms_file_offset_u,
1867 mapping_u.sms_size_u,
1868 VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1869 PAGE_MASK,
1870 (VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1871 | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1872 &offset,
1873 &offset_end,
1874 &offset_size);
1875 if (__improbable(kr != KERN_SUCCESS)) {
1876 return kr;
1877 }
1878 if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
1879 return KERN_INVALID_ARGUMENT;
1880 }
1881
1882 /*
1883 * Unsafe access is immediately followed by wrap to
1884 * convert from addr to size.
1885 */
1886 mach_vm_size_ut sms_slide_size_u =
1887 vm_sanitize_wrap_size(
1888 VM_SANITIZE_UNSAFE_UNWRAP(
1889 mapping_u.sms_slide_size_u));
1890
1891 kr = vm_sanitize_addr_size(
1892 mapping_u.sms_slide_start_u,
1893 sms_slide_size_u,
1894 VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1895 map,
1896 (VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1897 | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1898 &slide_start,
1899 &slide_end,
1900 &slide_size);
1901 if (__improbable(kr != KERN_SUCCESS)) {
1902 return kr;
1903 }
1904
1905 kr = vm_sanitize_cur_and_max_prots(
1906 mapping_u.sms_init_prot_u,
1907 mapping_u.sms_max_prot_u,
1908 VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1909 map,
1910 VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
1911 &cur,
1912 &max);
1913 if (__improbable(kr != KERN_SUCCESS)) {
1914 return kr;
1915 }
1916
1917 /*
1918 * Finally, we move the data from the kernel stack to our
1919 * caller-allocated kernel heap buffer.
1920 */
1921 mappings[i].sms_address = addr;
1922 mappings[i].sms_size = size;
1923 mappings[i].sms_file_offset = offset;
1924 mappings[i].sms_slide_size = slide_size;
1925 mappings[i].sms_slide_start = slide_start;
1926 mappings[i].sms_max_prot = max;
1927 mappings[i].sms_init_prot = cur;
1928
1929 if (__improbable(os_add_overflow(
1930 user_addr,
1931 sizeof(shared_file_mapping_slide_np_ut),
1932 &user_addr))) {
1933 return KERN_INVALID_ARGUMENT;
1934 }
1935 }
1936
1937 return KERN_SUCCESS;
1938 }
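/*
 * A minimal sketch of the copyin-then-sanitize idiom used above: each
 * element is brought into a kernel stack local with its own copyin,
 * validated there, and only the validated values are committed to the
 * kernel heap buffer, so userspace cannot rewrite a field between the
 * check and the use (TOCTOU):
 *
 *	for (i = 0; i < count; i++) {
 *		copyin element i into a stack local;
 *		sanitize the local (range, alignment, overflow, prot);
 *		commit the sanitized local to the kernel buffer;
 *	}
 */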
1939
1940 int
1941 shared_region_map_and_slide_2_np(
1942 struct proc *p,
1943 struct shared_region_map_and_slide_2_np_args *uap,
1944 __unused int *retvalp)
1945 {
1946 unsigned int files_count;
1947 struct shared_file_np *shared_files = NULL;
1948 unsigned int mappings_count;
1949 struct shared_file_mapping_slide_np *mappings = NULL;
1950 kern_return_t kr = KERN_SUCCESS;
1951
1952 files_count = uap->files_count;
1953 mappings_count = uap->mappings_count;
1954
1955 SHARED_REGION_TRACE_DEBUG(
1956 ("shared_region: %p [%d(%s)] -> map_and_slide(0x%llx)\n",
1957 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1958 proc_getpid(p), p->p_comm,
1959 (uint64_t)uap->mappings_u));
1960
1961 if (files_count == 0) {
1962 SHARED_REGION_TRACE_INFO(
1963 ("shared_region: %p [%d(%s)] map(): "
1964 "no files\n",
1965 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1966 proc_getpid(p), p->p_comm));
1967 		kr = 0; /* no files to map: we're done! */
1968 goto done;
1969 } else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1970 shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1971 if (shared_files == NULL) {
1972 kr = KERN_RESOURCE_SHORTAGE;
1973 goto done;
1974 }
1975 } else {
1976 SHARED_REGION_TRACE_ERROR(
1977 ("shared_region: %p [%d(%s)] map(): "
1978 "too many files (%d) max %d\n",
1979 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1980 proc_getpid(p), p->p_comm,
1981 files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1982 kr = KERN_FAILURE;
1983 goto done;
1984 }
1985
1986 if (mappings_count == 0) {
1987 SHARED_REGION_TRACE_INFO(
1988 ("shared_region: %p [%d(%s)] map(): "
1989 "no mappings\n",
1990 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1991 proc_getpid(p), p->p_comm));
1992 		kr = 0; /* no mappings: we're done! */
1993 goto done;
1994 } else if (mappings_count <= SFM_MAX) {
1995 mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
1996 if (mappings == NULL) {
1997 kr = KERN_RESOURCE_SHORTAGE;
1998 goto done;
1999 }
2000 } else {
2001 SHARED_REGION_TRACE_ERROR(
2002 ("shared_region: %p [%d(%s)] map(): "
2003 "too many mappings (%d) max %d\n",
2004 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2005 proc_getpid(p), p->p_comm,
2006 mappings_count, SFM_MAX));
2007 kr = KERN_FAILURE;
2008 goto done;
2009 }
2010
2011 /*
2012 * struct shared_file_np does not have fields that are subject to
2013 * sanitization, it is thus copied from userspace as is.
2014 */
2015 kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
2016 if (kr != KERN_SUCCESS) {
2017 SHARED_REGION_TRACE_ERROR(
2018 ("shared_region: %p [%d(%s)] copyin() returned 0x%x\n",
2019 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2020 proc_getpid(p), p->p_comm, kr));
2021 goto done;
2022 }
2023
2024 kr = shared_region_map_and_slide_2_np_sanitize(
2025 p,
2026 uap->mappings_u,
2027 mappings_count,
2028 mappings);
2029 if (__improbable(kr != KERN_SUCCESS)) {
2030 SHARED_REGION_TRACE_ERROR(
2031 ("shared_region: %p [%d(%s)] sanitize() returned 0x%x\n",
2032 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2033 proc_getpid(p), p->p_comm, kr));
2034 kr = vm_sanitize_get_kr(kr);
2035 goto done;
2036 }
2037
2038 uint32_t max_slide = shared_files[0].sf_slide;
2039 uint32_t random_val;
2040 uint32_t slide_amount;
2041
2042 if (max_slide != 0) {
2043 read_random(&random_val, sizeof random_val);
2044 slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
2045 } else {
2046 slide_amount = 0;
2047 }
2048 #if DEVELOPMENT || DEBUG
2049 extern bool bootarg_disable_aslr;
2050 if (bootarg_disable_aslr) {
2051 slide_amount = 0;
2052 }
2053 #endif /* DEVELOPMENT || DEBUG */
2054
2055 /*
2056 * Fix up the mappings to reflect the desired slide.
2057 */
2058 unsigned int f;
2059 unsigned int m = 0;
2060 unsigned int i;
2061 for (f = 0; f < files_count; ++f) {
2062 shared_files[f].sf_slide = slide_amount;
2063 for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
2064 if (m >= mappings_count) {
2065 SHARED_REGION_TRACE_ERROR(
2066 ("shared_region: %p [%d(%s)] map(): "
2067 "mapping count argument was too small\n",
2068 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2069 proc_getpid(p), p->p_comm));
2070 kr = KERN_FAILURE;
2071 goto done;
2072 }
2073 if (__improbable(
2074 os_add_overflow(
2075 mappings[m].sms_address,
2076 slide_amount,
2077 &mappings[m].sms_address))) {
2078 kr = KERN_INVALID_ARGUMENT;
2079 goto done;
2080 }
2081 if (mappings[m].sms_slide_size != 0) {
2082 mach_vm_address_t discard;
2083 /* Slide and check that new start/size pairs do not overflow. */
2084 if (__improbable(
2085 os_add_overflow(
2086 mappings[m].sms_slide_start,
2087 slide_amount,
2088 &mappings[m].sms_slide_start) ||
2089 os_add_overflow(
2090 mappings[m].sms_slide_start,
2091 mappings[m].sms_slide_size,
2092 &discard))) {
2093 kr = KERN_INVALID_ARGUMENT;
2094 goto done;
2095 }
2096 }
2097 }
2098 }
2099
2100 kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
2101 done:
2102 kfree_data(shared_files, files_count * sizeof(shared_files[0]));
2103 kfree_data(mappings, mappings_count * sizeof(mappings[0]));
2104
2105 SHARED_REGION_TRACE_DEBUG(
2106 ("shared_region: %p [%d(%s)] map_and_slide(0x%llx) <- 0x%x\n",
2107 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2108 proc_getpid(p), p->p_comm,
2109 (uint64_t)uap->mappings_u, kr));
2110
2111 return kr;
2112 }
2113
2114 /*
2115 * A syscall for dyld to use to map data pages that need load time relocation fixups.
2116 * The fixups are performed by a custom pager during page-in, so the pages still appear
2117 * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2118 * on demand later, all w/o using the compressor.
2119 *
2120  * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages while
2121 * running, they are COW'd as normal.
2122 */
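/*
 * Illustrative sketch (not part of the build) of a single-region call,
 * to make the validation below concrete. The mwlr_* field names are the
 * ones checked in this function; the values are assumptions:
 *
 *	struct mwl_region region = {
 *		.mwlr_fd          = fd,           // one fd for all regions
 *		.mwlr_protections = VM_PROT_READ | VM_PROT_WRITE,
 *		.mwlr_file_offset = data_offset,  // page aligned, code-signed range
 *		.mwlr_size        = data_size,
 *	};
 *
 * The accompanying link info buffer starts with a struct mwl_info_hdr
 * followed by the bind targets and chained-fixup starts, as verified
 * step by step below.
 */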
2123 int
2124 map_with_linking_np(
2125 struct proc *p,
2126 struct map_with_linking_np_args *uap,
2127 __unused int *retvalp)
2128 {
2129 uint32_t region_count;
2130 uint32_t r;
2131 struct mwl_region *regions = NULL;
2132 struct mwl_region *rp;
2133 uint32_t link_info_size;
2134 void *link_info = NULL; /* starts with a struct mwl_info_hdr */
2135 struct mwl_info_hdr *info_hdr = NULL;
2136 uint64_t binds_size;
2137 int fd;
2138 struct fileproc *fp = NULL;
2139 struct vnode *vp = NULL;
2140 size_t file_size;
2141 off_t fs;
2142 struct vnode_attr va;
2143 memory_object_control_t file_control = NULL;
2144 int error;
2145 kern_return_t kr = KERN_SUCCESS;
2146
2147 /*
2148 * Check if dyld has told us it finished with this call.
2149 */
2150 if (p->p_disallow_map_with_linking) {
2151 		printf("%s: [%d(%s)]: map_with_linking() was disabled\n",
2152 __func__, proc_getpid(p), p->p_comm);
2153 kr = KERN_FAILURE;
2154 goto done;
2155 }
2156
2157 /*
2158 * First we do some sanity checking on what dyld has passed us.
2159 */
2160 region_count = uap->region_count;
2161 link_info_size = uap->link_info_size;
2162 if (region_count == 0) {
2163 printf("%s: [%d(%s)]: region_count == 0\n",
2164 __func__, proc_getpid(p), p->p_comm);
2165 kr = KERN_FAILURE;
2166 goto done;
2167 }
2168 if (region_count > MWL_MAX_REGION_COUNT) {
2169 printf("%s: [%d(%s)]: region_count too big %d\n",
2170 __func__, proc_getpid(p), p->p_comm, region_count);
2171 kr = KERN_FAILURE;
2172 goto done;
2173 }
2174
2175 if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2176 printf("%s: [%d(%s)]: link_info_size too small\n",
2177 __func__, proc_getpid(p), p->p_comm);
2178 kr = KERN_FAILURE;
2179 goto done;
2180 }
2181 if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2182 printf("%s: [%d(%s)]: link_info_size too big %d\n",
2183 __func__, proc_getpid(p), p->p_comm, link_info_size);
2184 kr = KERN_FAILURE;
2185 goto done;
2186 }
2187
2188 /*
2189 * Allocate and copyin the regions and link info
2190 */
2191 regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2192 if (regions == NULL) {
2193 printf("%s: [%d(%s)]: failed to allocate regions\n",
2194 __func__, proc_getpid(p), p->p_comm);
2195 kr = KERN_RESOURCE_SHORTAGE;
2196 goto done;
2197 }
2198 kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2199 if (kr != KERN_SUCCESS) {
2200 printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2201 __func__, proc_getpid(p), p->p_comm, kr);
2202 goto done;
2203 }
2204
2205 link_info = kalloc_data(link_info_size, Z_WAITOK);
2206 if (link_info == NULL) {
2207 printf("%s: [%d(%s)]: failed to allocate link_info\n",
2208 __func__, proc_getpid(p), p->p_comm);
2209 kr = KERN_RESOURCE_SHORTAGE;
2210 goto done;
2211 }
2212 kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2213 if (kr != KERN_SUCCESS) {
2214 printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2215 __func__, proc_getpid(p), p->p_comm, kr);
2216 goto done;
2217 }
2218
2219 /*
2220 	 * Do some verification of the data structures.
2221 */
2222 info_hdr = (struct mwl_info_hdr *)link_info;
2223 if (info_hdr->mwli_version != MWL_INFO_VERS) {
2224 printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2225 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2226 kr = KERN_FAILURE;
2227 goto done;
2228 }
2229
2230 if (info_hdr->mwli_binds_offset > link_info_size) {
2231 printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2232 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2233 kr = KERN_FAILURE;
2234 goto done;
2235 }
2236
2237 	/* some older devices have a s/w page size > h/w page size, no need to support them */
2238 if (info_hdr->mwli_page_size != PAGE_SIZE) {
2239 /* no printf, since this is expected on some devices */
2240 kr = KERN_INVALID_ARGUMENT;
2241 goto done;
2242 }
2243
2244 binds_size = (uint64_t)info_hdr->mwli_binds_count *
2245 ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2246 if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2247 printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2248 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2249 kr = KERN_FAILURE;
2250 goto done;
2251 }
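	/*
	 * Note the shape of the check above: mwli_binds_offset was already
	 * bounded by link_info_size, so "link_info_size - mwli_binds_offset"
	 * cannot underflow, and binds_size is computed in 64 bits. Comparing
	 * "binds_size > link_info_size - offset" therefore rejects oversized
	 * bind tables without the wraparound risk of the naive
	 * "offset + binds_size > link_info_size" form.
	 */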
2252
2253 if (info_hdr->mwli_chains_offset > link_info_size) {
2254 printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2255 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2256 kr = KERN_FAILURE;
2257 goto done;
2258 }
2259
2260
2261 /*
2262 	 * Ensure the chained starts structure lies within the link info and make sure the
2263 * segment info offsets are within bounds.
2264 */
2265 if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2266 printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2267 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2268 kr = KERN_FAILURE;
2269 goto done;
2270 }
2271 if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2272 printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2273 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2274 kr = KERN_FAILURE;
2275 goto done;
2276 }
2277
2278 /* Note that more verification of offsets is done in the pager itself */
2279
2280 /*
2281 * Ensure we've only been given one FD and verify valid protections.
2282 */
2283 fd = regions[0].mwlr_fd;
2284 for (r = 0; r < region_count; ++r) {
2285 if (regions[r].mwlr_fd != fd) {
2286 printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2287 __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2288 kr = KERN_FAILURE;
2289 goto done;
2290 }
2291
2292 /*
2293 * Only allow data mappings and not zero fill. Permit TPRO
2294 * mappings only when VM_PROT_READ | VM_PROT_WRITE.
2295 */
2296 if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2297 printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2298 __func__, proc_getpid(p), p->p_comm);
2299 kr = KERN_FAILURE;
2300 goto done;
2301 }
2302 if (regions[r].mwlr_protections & VM_PROT_ZF) {
2303 printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2304 __func__, proc_getpid(p), p->p_comm, r);
2305 kr = KERN_FAILURE;
2306 goto done;
2307 }
2308 if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2309 !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2310 printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2311 __func__, proc_getpid(p), p->p_comm, r);
2312 kr = KERN_FAILURE;
2313 goto done;
2314 }
2315 }
2316
2317
2318 /* get file structure from file descriptor */
2319 error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2320 if (error) {
2321 printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2322 __func__, proc_getpid(p), p->p_comm, error);
2323 kr = KERN_FAILURE;
2324 goto done;
2325 }
2326
2327 /* We need at least read permission on the file */
2328 if (!(fp->fp_glob->fg_flag & FREAD)) {
2329 printf("%s: [%d(%s)]: not readable\n",
2330 __func__, proc_getpid(p), p->p_comm);
2331 kr = KERN_FAILURE;
2332 goto done;
2333 }
2334
2335 /* Get the vnode from file structure */
2336 vp = (struct vnode *)fp_get_data(fp);
2337 error = vnode_getwithref(vp);
2338 if (error) {
2339 printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2340 __func__, proc_getpid(p), p->p_comm, error);
2341 kr = KERN_FAILURE;
2342 vp = NULL; /* just to be sure */
2343 goto done;
2344 }
2345
2346 /* Make sure the vnode is a regular file */
2347 if (vp->v_type != VREG) {
2348 printf("%s: [%d(%s)]: vnode not VREG\n",
2349 __func__, proc_getpid(p), p->p_comm);
2350 kr = KERN_FAILURE;
2351 goto done;
2352 }
2353
2354 /* get vnode size */
2355 error = vnode_size(vp, &fs, vfs_context_current());
2356 if (error) {
2357 goto done;
2358 }
2359 file_size = fs;
2360
2361 /* get the file's memory object handle */
2362 file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2363 if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2364 printf("%s: [%d(%s)]: no memory object\n",
2365 __func__, proc_getpid(p), p->p_comm);
2366 kr = KERN_FAILURE;
2367 goto done;
2368 }
2369
2370 for (r = 0; r < region_count; ++r) {
2371 		rp = &regions[r];
2372
2373 #if CONFIG_MACF
2374 vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2375 error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2376 fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2377 if (error) {
2378 printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2379 __func__, proc_getpid(p), p->p_comm, r, error);
2380 kr = KERN_FAILURE;
2381 goto done;
2382 }
2383 #endif /* MAC */
2384
2385 /* check that the mappings are properly covered by code signatures */
2386 if (cs_system_enforcement()) {
2387 if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2388 printf("%s: [%d(%s)]: region %d, not code signed\n",
2389 __func__, proc_getpid(p), p->p_comm, r);
2390 kr = KERN_FAILURE;
2391 goto done;
2392 }
2393 }
2394 }
2395
2396 /* update the vnode's access time */
2397 if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2398 VATTR_INIT(&va);
2399 nanotime(&va.va_access_time);
2400 VATTR_SET_ACTIVE(&va, va_access_time);
2401 vnode_setattr(vp, &va, vfs_context_current());
2402 }
2403
2404 /* get the VM to do the work */
2405 kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2406
2407 done:
2408 if (fp != NULL) {
2409 /* release the file descriptor */
2410 fp_drop(p, fd, fp, 0);
2411 }
2412 if (vp != NULL) {
2413 (void)vnode_put(vp);
2414 }
2415 if (regions != NULL) {
2416 kfree_data(regions, region_count * sizeof(regions[0]));
2417 }
2418 	/* link_info was consumed by the pager and set to NULL if things worked */
2419 if (link_info != NULL) {
2420 kfree_data(link_info, link_info_size);
2421 }
2422
2423 switch (kr) {
2424 case KERN_SUCCESS:
2425 return 0;
2426 case KERN_RESOURCE_SHORTAGE:
2427 return ENOMEM;
2428 default:
2429 return EINVAL;
2430 }
2431 }
2432
2433 #if DEBUG || DEVELOPMENT
2434 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2435 CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2436 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2437 CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2438 #endif /* DEBUG || DEVELOPMENT */
2439
2440 /* sysctl overflow room */
2441
2442 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2443 (int *) &page_size, 0, "vm page size");
2444
2445 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2446 * allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2447 * reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2448 extern unsigned int vm_page_free_target;
2449 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2450 &vm_page_free_target, 0, "Pageout daemon free target");
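/*
 * A minimal userspace sketch (illustrative; assumes only the standard
 * sysctlbyname(3) API) of the calculation described above:
 *
 *	#include <sys/sysctl.h>
 *
 *	unsigned int free_count = 0, free_target = 0;
 *	size_t len = sizeof(free_count);
 *	sysctlbyname("vm.page_free_count", &free_count, &len, NULL, 0);
 *	len = sizeof(free_target);
 *	sysctlbyname("vm.vm_page_free_target", &free_target, &len, NULL, 0);
 *	// pages available without pushing the pageout daemon below target:
 *	long spare = (long)free_count - (long)free_target;
 */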
2451
2452 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2453 &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2454
2455 static int
2456 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2457 {
2458 #pragma unused(oidp, arg1, arg2)
2459 unsigned int page_free_wanted;
2460
2461 page_free_wanted = mach_vm_ctl_page_free_wanted();
2462 return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2463 }
2464 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2465 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2466 0, 0, vm_ctl_page_free_wanted, "I", "");
2467
2468 extern unsigned int vm_page_purgeable_count;
2469 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2470 &vm_page_purgeable_count, 0, "Purgeable page count");
2471
2472 extern unsigned int vm_page_purgeable_wired_count;
2473 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2474 &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2475
2476 extern unsigned int vm_page_kern_lpage_count;
2477 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2478 &vm_page_kern_lpage_count, 0, "kernel used large pages");
2479
2480 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2481 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2482 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_kern);
2483 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_kern, vm_page_grab_count_kern, "Total pages grabbed (kernel)");
2484 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_iopl);
2485 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_iopl, vm_page_grab_count_iopl, "Total pages grabbed (iopl)");
2486 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_upl);
2487 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_upl, vm_page_grab_count_upl, "Total pages grabbed (upl)");
2488
2489
2490 #if DEVELOPMENT || DEBUG
2491 SCALABLE_COUNTER_DECLARE(vm_page_deactivate_behind_count);
2492 SYSCTL_SCALABLE_COUNTER(_vm, pages_deactivated_behind, vm_page_deactivate_behind_count,
2493 "Number of pages deactivated behind");
2494 #endif
2495
2496 #if DEVELOPMENT || DEBUG
2497 #if __ARM_MIXED_PAGE_SIZE__
2498 static int vm_mixed_pagesize_supported = 1;
2499 #else
2500 static int vm_mixed_pagesize_supported = 0;
2501 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2502 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2503 &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2504
2505 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2506 &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2507
2508 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2509 &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2510 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2511 &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2512 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2513 &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2514
2515 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2516 &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2517 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2518 &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2519 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2520     &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated"); /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated) */
2521 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2522 &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2523 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2524 &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2525 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2526 &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, ""); /* sum of next two */
2527 #endif /* DEVELOPMENT || DEBUG */
2528
2529 extern int madvise_free_debug;
2530 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2531 &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2532 extern int madvise_free_debug_sometimes;
2533 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2534 &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
2535
2536 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2537 &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2538 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2539 &vm_page_stats_reusable.reusable_pages_success, "");
2540 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2541 &vm_page_stats_reusable.reusable_pages_failure, "");
2542 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2543 &vm_page_stats_reusable.reusable_pages_shared, "");
2544 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2545 &vm_page_stats_reusable.all_reusable_calls, "");
2546 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2547 &vm_page_stats_reusable.partial_reusable_calls, "");
2548 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2549 &vm_page_stats_reusable.reuse_pages_success, "");
2550 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2551 &vm_page_stats_reusable.reuse_pages_failure, "");
2552 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2553 &vm_page_stats_reusable.all_reuse_calls, "");
2554 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2555 &vm_page_stats_reusable.partial_reuse_calls, "");
2556 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2557 &vm_page_stats_reusable.can_reuse_success, "");
2558 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2559 &vm_page_stats_reusable.can_reuse_failure, "");
2560 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2561 &vm_page_stats_reusable.reusable_reclaimed, "");
2562 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2563 &vm_page_stats_reusable.reusable_nonwritable, "");
2564 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2565 &vm_page_stats_reusable.reusable_shared, "");
2566 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2567 &vm_page_stats_reusable.free_shared, "");
2568
2569
2570 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2571 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2572 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2573
2574 extern unsigned int vm_page_cleaned_count;
2575 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2576
2577 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2578 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2579 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2580
2581 /* pageout counts */
2582 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2583 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2584
2585 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2586 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2587 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2588 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2589 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2590 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2591
2592 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2593 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2594 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2595 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2596 extern unsigned int vm_page_realtime_count;
2597 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2598 extern int vm_pageout_protect_realtime;
2599 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2600
2601 /* counts of pages prefaulted when entering a memory object */
2602 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2603 extern int64_t vm_prefault_nb_no_page, vm_prefault_nb_wrong_page;
2604 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2605 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2606 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_no_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_no_page, "");
2607 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_wrong_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_wrong_page, "");
2608
2609 #if defined (__x86_64__)
2610 extern unsigned int vm_clump_promote_threshold;
2611 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2612 #if DEVELOPMENT || DEBUG
2613 extern unsigned long vm_clump_stats[];
2614 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2615 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2616 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2617 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2618 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2619 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2620 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2621 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2622 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2623 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2624 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2625 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2626 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2627 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2628 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2629 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2630 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2631 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2632 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2633 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2634 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2635 #endif /* if DEVELOPMENT || DEBUG */
2636 #endif /* #if defined (__x86_64__) */
2637
2638 #if CONFIG_SECLUDED_MEMORY
2639
2640 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2641 extern unsigned int vm_page_secluded_target;
2642 extern unsigned int vm_page_secluded_count;
2643 extern unsigned int vm_page_secluded_count_free;
2644 extern unsigned int vm_page_secluded_count_inuse;
2645 extern unsigned int vm_page_secluded_count_over_target;
2646 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2647 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2648 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2649 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2650 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2651
2652 extern struct vm_page_secluded_data vm_page_secluded;
2653 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2654 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2655 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2656 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2657 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2658 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2659 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2660 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2661 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2662
2663 #endif /* CONFIG_SECLUDED_MEMORY */
2664
2665 #if CONFIG_DEFERRED_RECLAIM
2666 #pragma mark Deferred Reclaim
2667 SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
2668 #if DEVELOPMENT || DEBUG
2669 /*
2670 * VM reclaim testing
2671 */
2672 extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);
2673
2674 static int
2675 sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
2676 {
2677 int error = EINVAL, pid = 0;
2678 /*
2679 	 * Only act on a write
2680 */
2681 error = sysctl_handle_int(oidp, &pid, 0, req);
2682 if (error || !req->newptr) {
2683 return error;
2684 }
2685 if (pid <= 0) {
2686 return EINVAL;
2687 }
2688 proc_t p = proc_find(pid);
2689 if (p == PROC_NULL) {
2690 return ESRCH;
2691 }
2692 task_t t = proc_task(p);
2693 if (t == TASK_NULL) {
2694 proc_rele(p);
2695 return ESRCH;
2696 }
2697 task_reference(t);
2698 proc_rele(p);
2699
2700 bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
2701 if (success) {
2702 error = 0;
2703 }
2704 task_deallocate(t);
2705
2706 return error;
2707 }
2708
2709 SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
2710 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2711 &sysctl_vm_reclaim_wait_for_pid, "I",
2712 "Block until the given pid has been drained by kernel GC");
2713
2714 static int
2715 sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
2716 {
2717 int error = EINVAL;
2718 kern_return_t kr;
2719 pid_t pid;
2720 error = sysctl_handle_int(oidp, &pid, 0, req);
2721 /* Only reclaim on write */
2722 if (error || !req->newptr) {
2723 return error;
2724 }
2725 if (pid <= 0) {
2726 return EINVAL;
2727 }
2728 proc_t p = proc_find(pid);
2729 if (p == PROC_NULL) {
2730 return ESRCH;
2731 }
2732 task_t t = proc_task(p);
2733 if (t == TASK_NULL) {
2734 proc_rele(p);
2735 return ESRCH;
2736 }
2737 task_reference(t);
2738 proc_rele(p);
2739 kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
2740 task_deallocate(t);
2741 return mach_to_bsd_errno(kr);
2742 }
2743
2744 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
2745 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2746 &sysctl_vm_reclaim_drain_pid, "I",
2747 "Drain the deferred reclamation buffer for a pid");
2748
2749 static int
2750 proc_filter_reclaimable(proc_t p, __unused void *arg)
2751 {
2752 task_t task = proc_task(p);
2753 return vm_deferred_reclamation_task_has_ring(task);
2754 }
2755
2756 static int
2757 proc_reclaim_drain(proc_t p, __unused void *arg)
2758 {
2759 kern_return_t kr;
2760 task_t task = proc_task(p);
2761 kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
2762 return mach_to_bsd_errno(kr);
2763 }
2764
2765 static int
2766 sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
2767 {
2768 int error;
2769 int val;
2770 if (!req->newptr) {
2771 return EINVAL;
2772 }
2773 error = sysctl_handle_int(oidp, &val, 0, req);
2774 if (error || val == FALSE) {
2775 return error;
2776 }
2777 proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
2778 proc_filter_reclaimable, NULL);
2779 return 0;
2780 }
2781
2782 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
2783 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2784 &sysctl_vm_reclaim_drain_all, "I",
2785 "Fully reclaim from every deferred reclamation buffer on the system");
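/*
 * Typical usage of the three write-only knobs above from the shell on a
 * DEVELOPMENT || DEBUG kernel (pid values are examples):
 *
 *	sysctl vm.reclaim.drain_pid=1234      # drain one task's buffer
 *	sysctl vm.reclaim.wait_for_pid=1234   # block until GC reclaims it
 *	sysctl vm.reclaim.drain_all=1         # drain every eligible task
 */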
2786
2787 extern uint32_t vm_reclaim_buffer_count;
2788 extern uint64_t vm_reclaim_gc_epoch;
2789 extern uint64_t vm_reclaim_gc_reclaim_count;
2790 extern uint64_t vm_reclaim_sampling_period_abs;
2791 extern uint64_t vm_reclaim_sampling_period_ns;
2792 extern bool vm_reclaim_debug;
2793 extern bool vm_reclaim_enabled;
2794 extern uint32_t vm_reclaim_autotrim_pct_normal;
2795 extern uint32_t vm_reclaim_autotrim_pct_pressure;
2796 extern uint32_t vm_reclaim_autotrim_pct_critical;
2797 extern uint32_t vm_reclaim_wma_weight_base;
2798 extern uint32_t vm_reclaim_wma_weight_cur;
2799 extern uint32_t vm_reclaim_wma_denom;
2800 extern uint64_t vm_reclaim_abandonment_threshold;
2801
2802 SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
2803 CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
2804 "The number of deferred memory buffers currently alive");
2805 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
2806 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
2807 "Number of times the global GC thread has run");
2808 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
2809 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
2810 "Number of times the global GC thread has reclaimed from a buffer");
2811 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
2812 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
2813 "Debug logs for vm.reclaim");
2814 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
2815 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
2816 "Whether deferred memory reclamation is enabled on this system");
2817 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
2818 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
2819 "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2820 "to engage auto-trim when the system is operating normally");
2821 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
2822 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
2823 "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2824 "to engage auto-trim when the system is under memory pressure");
2825 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
2826 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
2827 "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2828 "to engage auto-trim when the system is under critical memory pressure");
2829 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
2830 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
2831 "Weight applied to historical minimum buffer size samples");
2832 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
2833 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
2834 "Weight applied to current sampled minimum buffer size");
2835 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
2836 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
2837 "Denominator for weighted moving average calculation");
2838 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
2839 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
2840 "The number of sampling periods between accounting updates that may elapse "
2841 "before the buffer is considered \"abandoned\"");
2842
2843 static int
2844 sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
2845 {
2846 uint64_t new_val_ns;
2847 uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
2848 int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
2849 sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
2850 if (err || !req->newptr) {
2851 return err;
2852 }
2853 if (new_val_ns != old_val_ns) {
2854 vm_reclaim_sampling_period_ns = new_val_ns;
2855 nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
2856 }
2857 return 0;
2858 }
2859
2860 SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
2861 CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "QU",
2862 "Interval (nanoseconds) at which to sample the minimum buffer size and "
2863 "consider trimming excess");
2864 #endif /* DEVELOPMENT || DEBUG */
2865 #endif /* CONFIG_DEFERRED_RECLAIM */
2866
2867 #include <kern/thread.h>
2868 #include <sys/user.h>
2869
2870 void vm_pageout_io_throttle(void);
2871
2872 void
2873 vm_pageout_io_throttle(void)
2874 {
2875 struct uthread *uthread = current_uthread();
2876
2877 /*
2878 	 * If the thread is marked as a low-priority I/O type
2879 	 * and the I/O we issued while in this cleaning operation
2880 	 * collided with normal I/O operations, we'll
2881 	 * delay in order to mitigate the impact of this
2882 	 * task on the normal operation of the system.
2883 */
2884
2885 if (uthread->uu_lowpri_window) {
2886 throttle_lowpri_io(1);
2887 }
2888 }
2889
2890 int
2891 vm_pressure_monitor(
2892 __unused struct proc *p,
2893 struct vm_pressure_monitor_args *uap,
2894 int *retval)
2895 {
2896 kern_return_t kr;
2897 uint32_t pages_reclaimed;
2898 uint32_t pages_wanted;
2899
2900 kr = mach_vm_pressure_monitor(
2901 (boolean_t) uap->wait_for_pressure,
2902 uap->nsecs_monitored,
2903 (uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2904 &pages_wanted);
2905
2906 switch (kr) {
2907 case KERN_SUCCESS:
2908 break;
2909 case KERN_ABORTED:
2910 return EINTR;
2911 default:
2912 return EINVAL;
2913 }
2914
2915 if (uap->pages_reclaimed) {
2916 if (copyout((void *)&pages_reclaimed,
2917 uap->pages_reclaimed,
2918 sizeof(pages_reclaimed)) != 0) {
2919 return EFAULT;
2920 }
2921 }
2922
2923 *retval = (int) pages_wanted;
2924 return 0;
2925 }
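/*
 * Contract sketch for the syscall above: wait_for_pressure asks to block
 * until the pageout daemon wants pages, nsecs_monitored bounds the
 * sampling window, and the optional pages_reclaimed pointer receives the
 * number of pages reclaimed in that window. The value handed back through
 * retval is the number of pages the kernel still wants; KERN_ABORTED
 * (e.g. an interrupted wait) surfaces as EINTR.
 */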
2926
2927 int
2928 kas_info(struct proc *p,
2929 struct kas_info_args *uap,
2930 int *retval __unused)
2931 {
2932 #ifndef CONFIG_KAS_INFO
2933 (void)p;
2934 (void)uap;
2935 return ENOTSUP;
2936 #else /* CONFIG_KAS_INFO */
2937 int selector = uap->selector;
2938 user_addr_t valuep = uap->value;
2939 user_addr_t sizep = uap->size;
2940 user_size_t size, rsize;
2941 int error;
2942
2943 if (!kauth_cred_issuser(kauth_cred_get())) {
2944 return EPERM;
2945 }
2946
2947 #if CONFIG_MACF
2948 error = mac_system_check_kas_info(kauth_cred_get(), selector);
2949 if (error) {
2950 return error;
2951 }
2952 #endif
2953
2954 if (IS_64BIT_PROCESS(p)) {
2955 user64_size_t size64;
2956 error = copyin(sizep, &size64, sizeof(size64));
2957 size = (user_size_t)size64;
2958 } else {
2959 user32_size_t size32;
2960 error = copyin(sizep, &size32, sizeof(size32));
2961 size = (user_size_t)size32;
2962 }
2963 if (error) {
2964 return error;
2965 }
2966
2967 switch (selector) {
2968 case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2969 {
2970 uint64_t slide = vm_kernel_slide;
2971
2972 if (sizeof(slide) != size) {
2973 return EINVAL;
2974 }
2975
2976 error = copyout(&slide, valuep, sizeof(slide));
2977 if (error) {
2978 return error;
2979 }
2980 rsize = size;
2981 }
2982 break;
2983 case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2984 {
2985 uint32_t i;
2986 kernel_mach_header_t *mh = &_mh_execute_header;
2987 struct load_command *cmd;
2988 cmd = (struct load_command*) &mh[1];
2989 uint64_t *bases;
2990 rsize = mh->ncmds * sizeof(uint64_t);
2991
2992 /*
2993 * Return the size if no data was passed
2994 */
2995 if (valuep == 0) {
2996 break;
2997 }
2998
2999 if (rsize > size) {
3000 return EINVAL;
3001 }
3002
3003 bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
3004
3005 for (i = 0; i < mh->ncmds; i++) {
3006 if (cmd->cmd == LC_SEGMENT_KERNEL) {
3007 __IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
3008 bases[i] = (uint64_t)sg->vmaddr;
3009 }
3010 cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
3011 }
3012
3013 error = copyout(bases, valuep, rsize);
3014
3015 kfree_data(bases, rsize);
3016
3017 if (error) {
3018 return error;
3019 }
3020 }
3021 break;
3022 case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
3023 case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
3024 {
3025 #if CONFIG_SPTM
3026 const uint64_t slide =
3027 (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
3028 #else
3029 const uint64_t slide = 0;
3030 #endif
3031
3032 if (sizeof(slide) != size) {
3033 return EINVAL;
3034 }
3035
3036 error = copyout(&slide, valuep, sizeof(slide));
3037 if (error) {
3038 return error;
3039 }
3040 rsize = size;
3041 }
3042 break;
3043 default:
3044 return EINVAL;
3045 }
3046
3047 if (IS_64BIT_PROCESS(p)) {
3048 user64_size_t size64 = (user64_size_t)rsize;
3049 error = copyout(&size64, sizep, sizeof(size64));
3050 } else {
3051 user32_size_t size32 = (user32_size_t)rsize;
3052 error = copyout(&size32, sizep, sizeof(size32));
3053 }
3054
3055 return error;
3056 #endif /* CONFIG_KAS_INFO */
3057 }
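/*
 * Illustrative two-step usage of KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR
 * (sketch only; assumes the private kas_info(2) wrapper from
 * <sys/kas_info.h> and a root caller):
 *
 *	size_t size = 0;
 *	kas_info(KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR, NULL, &size);
 *	uint64_t *bases = malloc(size);    // one slot per load command
 *	kas_info(KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR, bases, &size);
 *	// bases[i] != 0 only where load command i is LC_SEGMENT_KERNEL
 */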
3058
3059 #pragma clang diagnostic push
3060 #pragma clang diagnostic ignored "-Wcast-qual"
3061 #pragma clang diagnostic ignored "-Wunused-function"
3062
3063 static void
3064 asserts()
3065 {
3066 static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
3067 static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
3068 }
3069
3070 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
3071 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
3072 #pragma clang diagnostic pop
3073
3074 extern uint32_t vm_page_pages;
3075 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
3076
3077 extern uint32_t vm_page_busy_absent_skipped;
3078 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
3079
3080 extern uint32_t vm_page_upl_tainted;
3081 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
3082
3083 extern uint32_t vm_page_iopl_tainted;
3084 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
3085
3086 #if __arm64__ && (DEVELOPMENT || DEBUG)
3087 extern int vm_footprint_suspend_allowed;
3088 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
3089
3090 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
3091 static int
3092 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
3093 {
3094 #pragma unused(oidp, arg1, arg2)
3095 int error = 0;
3096 int new_value;
3097
3098 if (req->newptr == USER_ADDR_NULL) {
3099 return 0;
3100 }
3101 error = SYSCTL_IN(req, &new_value, sizeof(int));
3102 if (error) {
3103 return error;
3104 }
3105 if (!vm_footprint_suspend_allowed) {
3106 if (new_value != 0) {
3107 /* suspends are not allowed... */
3108 return 0;
3109 }
3110 /* ... but let resumes proceed */
3111 }
3112 DTRACE_VM2(footprint_suspend,
3113 vm_map_t, current_map(),
3114 int, new_value);
3115
3116 pmap_footprint_suspend(current_map(), new_value);
3117
3118 return 0;
3119 }
3120 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
3121 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3122 0, 0, &sysctl_vm_footprint_suspend, "I", "");
3123 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
3124
3125 extern uint64_t vm_map_corpse_footprint_count;
3126 extern uint64_t vm_map_corpse_footprint_size_avg;
3127 extern uint64_t vm_map_corpse_footprint_size_max;
3128 extern uint64_t vm_map_corpse_footprint_full;
3129 extern uint64_t vm_map_corpse_footprint_no_buf;
3130 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
3131 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
3132 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
3133 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
3134 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
3135 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
3136 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
3137 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
3138 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
3139 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
3140
3141 #if CODE_SIGNING_MONITOR
3142 extern uint64_t vm_cs_defer_to_csm;
3143 extern uint64_t vm_cs_defer_to_csm_not;
3144 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
3145 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
3146 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
3147 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
3148 #endif /* CODE_SIGNING_MONITOR */
3149
3150 extern uint64_t shared_region_pager_copied;
3151 extern uint64_t shared_region_pager_slid;
3152 extern uint64_t shared_region_pager_slid_error;
3153 extern uint64_t shared_region_pager_reclaimed;
3154 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
3155 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
3156 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
3157 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
3158 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
3159 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
3160 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
3161 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
3162 extern int shared_region_destroy_delay;
3163 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
3164 CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
3165
3166 #if MACH_ASSERT
3167 extern int pmap_ledgers_panic_leeway;
3168 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
3169 #endif /* MACH_ASSERT */
3170
3171
3172 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
3173 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
3174 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
3175 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
3176 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
3177 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
3178 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
3179 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
3180 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
3181 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
3182 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
3183 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
3184 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
3185 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
3186 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
3187 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
3188 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
3189 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
3190 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
3191 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
3192 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
3193 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
3194 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
3195 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
3196 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
3197 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
3198 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
3199 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
3200 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
3201 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
3202 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
3203 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
3204 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
3205 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
3206 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
3207 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
3208 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
3209 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
3210 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
3211
3212 extern int vm_protect_privileged_from_untrusted;
3213 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
3214 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
3215 extern uint64_t vm_copied_on_read;
3216 extern uint64_t vm_copied_on_read_kernel_map;
3217 extern uint64_t vm_copied_on_read_platform_map;
3218 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
3219 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
3220 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_kernel_map,
3221 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_kernel_map, "");
3222 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_platform_map,
3223 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_platform_map, "");
3224
3225 extern int vm_shared_region_count;
3226 extern int vm_shared_region_peak;
3227 SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
3228 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
3229 SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
3230 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
3231 #if DEVELOPMENT || DEBUG
3232 extern unsigned int shared_region_pagers_resident_count;
3233 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
3234 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
3235 extern unsigned int shared_region_pagers_resident_peak;
3236 SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
3237 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
3238 extern int shared_region_pager_count;
3239 SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
3240 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
3241 #if __has_feature(ptrauth_calls)
3242 extern int shared_region_key_count;
3243 SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
3244 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
3245 extern int vm_shared_region_reslide_count;
3246 SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
3247 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
3248 #endif /* __has_feature(ptrauth_calls) */
3249 #endif /* DEVELOPMENT || DEBUG */
3250
3251 #if MACH_ASSERT
3252 extern int debug4k_filter;
3253 SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
3254 extern int debug4k_panic_on_terminate;
3255 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
3256 extern int debug4k_panic_on_exception;
3257 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
3258 extern int debug4k_panic_on_misaligned_sharing;
3259 SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
3260 #endif /* MACH_ASSERT */
3261
3262 extern uint64_t vm_map_set_size_limit_count;
3263 extern uint64_t vm_map_set_data_limit_count;
3264 extern uint64_t vm_map_enter_RLIMIT_AS_count;
3265 extern uint64_t vm_map_enter_RLIMIT_DATA_count;
3266 SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
3267 SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
3268 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
3269 SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");
3270
3271 extern uint64_t vm_fault_resilient_media_initiate;
3272 extern uint64_t vm_fault_resilient_media_retry;
3273 extern uint64_t vm_fault_resilient_media_proceed;
3274 extern uint64_t vm_fault_resilient_media_release;
3275 extern uint64_t vm_fault_resilient_media_abort1;
3276 extern uint64_t vm_fault_resilient_media_abort2;
3277 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
3278 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
3279 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
3280 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
3281 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
3282 SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
3283 #if MACH_ASSERT
3284 extern int vm_fault_resilient_media_inject_error1_rate;
3285 extern int vm_fault_resilient_media_inject_error1;
3286 extern int vm_fault_resilient_media_inject_error2_rate;
3287 extern int vm_fault_resilient_media_inject_error2;
3288 extern int vm_fault_resilient_media_inject_error3_rate;
3289 extern int vm_fault_resilient_media_inject_error3;
3290 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
3291 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
3292 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
3293 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
3294 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
3295 SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
3296 #endif /* MACH_ASSERT */
3297
3298 extern uint64_t pmap_query_page_info_retries;
3299 SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");
3300
3301 /*
3302 * A sysctl which causes all existing shared regions to become stale. They
3303 * will no longer be used by anything new and will be torn down as soon as
3304 * the last existing user exits. A write of a non-zero value causes that to happen.
3305 * This should only be used by launchd, so we check that this is initproc.
3306 */
3307 static int
3308 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3309 {
3310 unsigned int value = 0;
3311 int changed = 0;
3312 int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3313 if (error || !changed) {
3314 return error;
3315 }
3316 if (current_proc() != initproc) {
3317 return EPERM;
3318 }
3319
3320 vm_shared_region_pivot();
3321
3322 return 0;
3323 }
3324
3325 SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
3326 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
3327 0, 0, shared_region_pivot, "I", "");
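/*
 * Illustrative user-space sketch (not part of this file): triggering the
 * pivot. Only initproc (launchd) may do this; any other caller gets EPERM
 * back from the handler above. The wrapper name is hypothetical.
 */
#if 0
#include <errno.h>
#include <sys/types.h>
#include <sys/sysctl.h>

static int
pivot_shared_regions(void)
{
	unsigned int one = 1;	/* any non-zero write triggers the pivot */

	if (sysctlbyname("vm.shared_region_pivot", NULL, NULL,
	    &one, sizeof(one)) == -1) {
		return errno;	/* EPERM unless the caller is launchd */
	}
	return 0;
}
#endif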
3328
3329 extern uint64_t vm_object_shadow_forced;
3330 extern uint64_t vm_object_shadow_skipped;
3331 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
3332 &vm_object_shadow_forced, "");
3333 SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
3334 &vm_object_shadow_skipped, "");
3335
3336 extern uint64_t vm_object_upl_throttle_cnt;
3337 SYSCTL_QUAD(_vm, OID_AUTO, object_upl_throttle_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
3338 &vm_object_upl_throttle_cnt,
3339 "The number of times in which a UPL write was throttled due to pageout starvation");
3340
3341 #if HAS_MTE
3342 #pragma mark MTE
3343
3344 SYSCTL_NODE(_vm, OID_AUTO, mte, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "mte");
3345
3346 /* sysctls for vm.mte.* counters. */
3347
3348 SYSCTL_UINT(_vm_mte, OID_AUTO, tagged, CTLFLAG_RD,
3349 &vm_page_tagged_count, 0, "tagged pages in use");
3350
3351 SYSCTL_QUAD(_vm_mte, OID_AUTO, refill_thread_wakeups, CTLFLAG_RD,
3352 &vm_mte_refill_thread_wakeups,
3353 "the number of times the refill thread was woken up");
3354
3355 /* sysctls for vm.mte.free.* counters. */
3356
3357 SYSCTL_NODE(_vm_mte, OID_AUTO, free, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "free counts");
3358
3359 SYSCTL_UINT(_vm_mte_free, OID_AUTO, total, CTLFLAG_RD,
3360 &vm_page_free_count, 0,
3361 "total free pages (same as vm.page_free_count)");
3362 SYSCTL_UINT(_vm_mte_free, OID_AUTO, taggable, CTLFLAG_RD,
3363 &vm_page_free_taggable_count, 0,
3364 "free taggable pages in the MTE free queue");
3365 SYSCTL_UINT(_vm_mte_free, OID_AUTO, claimable, CTLFLAG_RD,
3366 &mte_claimable_queue.vmpfq_count, 0,
3367 "free tag storage pages on the MTE claimable queue");
3368
3369 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_untagged, vm_cpu_free_count,
3370 "free untagged pages in CPU lists");
3371 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_claimed, vm_cpu_free_claimed_count,
3372 "free claimed pages in CPU lists");
3373 SYSCTL_SCALABLE_COUNTER(_vm_mte_free, cpu_tagged, vm_cpu_free_tagged_count,
3374 "free tagged pages in CPU lists");
3375
3376 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_0, CTLFLAG_RD,
3377 &mte_free_queues[MTE_FREE_UNTAGGABLE_0].vmpfq_count, 0,
3378 "disabled/pinned/deactivating/claimed (with 16 free pages or less) tag storage pages");
3379 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_1, CTLFLAG_RD,
3380 &mte_free_queues[MTE_FREE_UNTAGGABLE_1].vmpfq_count, 0,
3381 "claimed (with 17 free pages or more) or disabled (with 16 pages or less) tag storage pages");
3382 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_2, CTLFLAG_RD,
3383 &mte_free_queues[MTE_FREE_UNTAGGABLE_2].vmpfq_count, 0,
3384 "disabled (with 17 pages or more) tag storage pages");
3385 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_0, CTLFLAG_RD,
3386 &mte_free_queues[MTE_FREE_ACTIVE_0].vmpfq_count, 0,
3387 "active tag storages with free covered pages (bucket 0)");
3388 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_1, CTLFLAG_RD,
3389 &mte_free_queues[MTE_FREE_ACTIVE_1].vmpfq_count, 0,
3390 "active tag storages with free covered pages (bucket 1)");
3391 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_2, CTLFLAG_RD,
3392 &mte_free_queues[MTE_FREE_ACTIVE_2].vmpfq_count, 0,
3393 "active tag storages with free covered pages (bucket 2)");
3394 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_active_3, CTLFLAG_RD,
3395 &mte_free_queues[MTE_FREE_ACTIVE_3].vmpfq_count, 0,
3396 "active tag storages with free covered pages (bucket 3)");
3397 SYSCTL_UINT(_vm_mte_free, OID_AUTO, tag_storage_untaggable_activating, CTLFLAG_RD,
3398 &mte_free_queues[MTE_FREE_UNTAGGABLE_ACTIVATING].vmpfq_count, 0,
3399 "activating/reclaiming tag storages with free covered pages");
3400
3401 /* sysctls for vm.mte.cell.* counters. */
3402
3403 SYSCTL_NODE(_vm_mte, OID_AUTO, cell, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "mte cell");
3404
3405 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled, CTLFLAG_RD,
3406 &mte_info_lists[MTE_LIST_DISABLED_IDX].count, 0,
3407 "free inactive tag storage pages");
3408 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled_recursive, CTLFLAG_RD,
3409 &vm_page_recursive_tag_storage_count, 0,
3410 "recursive tag storage pages");
3411 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, disabled_unmanaged, CTLFLAG_RD,
3412 &vm_page_unmanaged_tag_storage_count, 0,
3413 "unmanaged tag storage pages");
3414 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, retired, CTLFLAG_RD,
3415 &vm_page_retired_tag_storage_count, 0,
3416 "retired tag storage pages");
3417 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, pinned, CTLFLAG_RD,
3418 &mte_info_lists[MTE_LIST_PINNED_IDX].count, 0,
3419 "unreclaimable tag storage pages");
3420 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, deactivating, CTLFLAG_RD,
3421 &mte_info_lists[MTE_LIST_DEACTIVATING_IDX].count, 0,
3422 "deactivating tag storage pages");
3423 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, claimed, CTLFLAG_RD,
3424 &mte_info_lists[MTE_LIST_CLAIMED_IDX].count, 0,
3425 "claimed tag storage pages");
3426 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, inactive, CTLFLAG_RD,
3427 &mte_info_lists[MTE_LIST_INACTIVE_IDX].count, 0,
3428 "free inactive tag storage pages");
3429 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, reclaiming, CTLFLAG_RD,
3430 &mte_info_lists[MTE_LIST_RECLAIMING_IDX].count, 0,
3431 "reclaiming tag storage pages");
3432 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, activating, CTLFLAG_RD,
3433 &mte_info_lists[MTE_LIST_ACTIVATING_IDX].count, 0,
3434 "activating tag storage pages");
3435 SYSCTL_UINT(_vm_mte_cell, OID_AUTO, active_0, CTLFLAG_RD,
3436 &mte_info_lists[MTE_LIST_ACTIVE_0_IDX].count, 0,
3437 "active tag storage pages with no used page tagged");
3438 static int
3439 tag_storage_active SYSCTL_HANDLER_ARGS
3440 {
3441 #pragma unused(arg1, arg2, oidp)
3442 uint32_t value = mteinfo_tag_storage_active(false);
3443
3444 return SYSCTL_OUT(req, &value, sizeof(value));
3445 }
3446 SYSCTL_PROC(_vm_mte_cell, OID_AUTO, active,
3447 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3448 0, 0, &tag_storage_active, "I",
3449 "active tag storage pages");
3450
3451 /* sysctls for vm.mte.tag_storage.* counters. */
3452
3453 SYSCTL_NODE(_vm_mte, OID_AUTO, tag_storage, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "mte tag storage");
3454
3455 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, reserved, CTLFLAG_RD,
3456 &vm_page_tag_storage_reserved, 0,
3457 "free tag storage pages reserve");
3458 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, wired, CTLFLAG_RD,
3459 &vm_page_wired_tag_storage_count, 0,
3460 "wired tag storage pages");
3461 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, activations, CTLFLAG_RD,
3462 &vm_page_tag_storage_activation_count,
3463 "tag storage activations (inactive/claimed -> active)");
3464 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, deactivations, CTLFLAG_RD,
3465 &vm_page_tag_storage_deactivation_count,
3466 "tag storage deactivations (active -> inactive)");
3467 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaims, CTLFLAG_RD,
3468 &vm_page_tag_storage_reclaim_success_count,
3469 "successful tag storage reclamations");
3470 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaims_from_cpu, CTLFLAG_RD,
3471 &vm_page_tag_storage_reclaim_from_cpu_count,
3472 "successful tag storage reclamations from the cpu free lists");
3473 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_failures, CTLFLAG_RD,
3474 &vm_page_tag_storage_reclaim_failure_count,
3475 "failed tag storage reclamations");
3476 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_wired_failures, CTLFLAG_RD,
3477 &vm_page_tag_storage_reclaim_wired_failure_count,
3478 "failed tag storage reclamations due to tag storage being wired");
3479 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, wire_relocations, CTLFLAG_RD,
3480 &vm_page_tag_storage_wire_relocation_count,
3481 "tag storage relocations due to wiring");
3482 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, reclaim_compressor_failures, CTLFLAG_RD,
3483 &vm_page_tag_storage_reclaim_compressor_failure_count,
3484 "failed tag storage reclamations due to tag storage used in compressor pool");
3485 SYSCTL_QUAD(_vm_mte_tag_storage, OID_AUTO, compressor_relocations, CTLFLAG_RD,
3486 &vm_page_tag_storage_compressor_relocation_count,
3487 "tag storage relocations due to compressor pool");
3488 SYSCTL_UINT(_vm_mte_tag_storage, OID_AUTO, free_unmanaged, CTLFLAG_RD,
3489 &vm_page_free_unmanaged_tag_storage_count, 0,
3490 "number of free unmanaged tag storage pages");
3491
3492 SYSCTL_SCALABLE_COUNTER(_vm_mte_tag_storage, cpu_allocated_claimed,
3493 vm_cpu_claimed_count, "claimed tag storage pages allocated");
3494
3495 static int
3496 tag_storage_fragmentation SYSCTL_HANDLER_ARGS
3497 {
3498 #pragma unused(arg1, arg2, oidp)
3499 uint32_t value = mteinfo_tag_storage_fragmentation(false);
3500
3501 return SYSCTL_OUT(req, &value, sizeof(value));
3502 }
3503 SYSCTL_PROC(_vm_mte_tag_storage, OID_AUTO, fragmentation,
3504 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3505 0, 0, &tag_storage_fragmentation, "I",
3506 "the achievable the fragmentation of the tag storage space (in parts per thousand)");
3507
3508 static int
3509 tag_storage_fragmentation_actual SYSCTL_HANDLER_ARGS
3510 {
3511 #pragma unused(arg1, arg2, oidp)
3512 uint32_t value = mteinfo_tag_storage_fragmentation(true);
3513
3514 return SYSCTL_OUT(req, &value, sizeof(value));
3515 }
3516 SYSCTL_PROC(_vm_mte_tag_storage, OID_AUTO, fragmentation_actual,
3517 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3518 0, 0, &tag_storage_fragmentation_actual, "I",
3519 "the actual the fragmentation of the tag storage space (in parts per thousand)");
3520
3521 /* sysctls for vm.mte.compressor_* */
3522
3523 extern unsigned int vm_object_no_compressor_pager_for_mte_count;
3524 SYSCTL_INT(_vm_mte, OID_AUTO, no_compressor_pager_for_mte, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_no_compressor_pager_for_mte_count, 0, "");
3525
3526 /* sysctls for MTE compression stats */
3527
3528 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_compressed, compressor_tagged_pages_compressed, "");
3529 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_decompressed, compressor_tagged_pages_decompressed, "");
3530 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_freed, compressor_tagged_pages_freed, "");
3531 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages_corrupted, compressor_tagged_pages_corrupted, "");
3532 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_overhead_bytes, compressor_tags_overhead_bytes, "");
3533 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_pages, compressor_tagged_pages, "");
3534 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_ts_pages_used, compressor_tag_storage_pages_in_pool,
3535 "the number of tag storage pages used in the compressor");
3536 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_non_ts_pages_used, compressor_non_tag_storage_pages_in_pool,
3537 "the number of non-tag storage pages used in the compressor");
3538 #if DEVELOPMENT || DEBUG
3539 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_all_zero, compressor_tags_all_zero, "");
3540 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_same_value, compressor_tags_same_value, "");
3541 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_below_align, compressor_tags_below_align, "");
3542 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_above_align, compressor_tags_above_align, "");
3543 SYSCTL_SCALABLE_COUNTER(_vm_mte, compress_incompressible, compressor_tags_incompressible, "");
3544 #endif /* DEVELOPMENT || DEBUG */
3545
3546 #endif /* HAS_MTE */
3547
3548 SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
3549 &vmtc_total, 0, "total text page corruptions detected");
3550
3551
3552 #if DEBUG || DEVELOPMENT
3553 /*
3554 * A sysctl that can be used to corrupt a text page with an illegal instruction.
3555 * Used for testing text page self healing.
3556 */
3557 extern kern_return_t vm_corrupt_text_addr(uintptr_t);
3558 static int
3559 corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3560 {
3561 uint64_t value = 0;
3562 int error = sysctl_handle_quad(oidp, &value, 0, req);
3563 if (error || !req->newptr) {
3564 return error;
3565 }
3566
3567 if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
3568 return 0;
3569 } else {
3570 return EINVAL;
3571 }
3572 }
3573
3574 SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
3575 CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3576 0, 0, corrupt_text_addr, "-", "");
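/*
 * Illustrative user-space sketch (not part of this file): a text-page
 * self-healing test might corrupt one of its own text pages by writing its
 * address to vm.corrupt_text_addr. The sysctl takes a 64-bit value and is
 * write-only (CTLFLAG_MASKED); the handler returns EINVAL if the kernel
 * could not corrupt the page. The wrapper name is hypothetical.
 */
#if 0
#include <stdint.h>
#include <sys/types.h>
#include <sys/sysctl.h>

static int
corrupt_my_text(const void *text_addr)
{
	uint64_t value = (uint64_t)(uintptr_t)text_addr;

	return sysctlbyname("vm.corrupt_text_addr", NULL, NULL,
	           &value, sizeof(value));
}
#endif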
3577 #endif /* DEBUG || DEVELOPMENT */
3578
3579 #if CONFIG_MAP_RANGES
3580 /*
3581 * vm.malloc_ranges
3582 *
3583 * space-separated list of <left:right> hexadecimal addresses.
3584 */
3585 static int
3586 vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
3587 {
3588 vm_map_t map = current_map();
3589 struct mach_vm_range r1, r2;
3590 char str[20 * 4];
3591 int len;
3592 mach_vm_offset_t right_hole_max;
3593
3594 if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
3595 return ENOENT;
3596 }
3597 if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
3598 return ENOENT;
3599 }
3600
3601 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
3602 right_hole_max = MACH_VM_JUMBO_ADDRESS;
3603 #else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
3604 right_hole_max = get_map_max(map);
3605 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
3606
3607 len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
3608 r1.max_address, r2.min_address,
3609 r2.max_address, right_hole_max);
3610
3611 return SYSCTL_OUT(req, str, len);
3612 }
3613
3614 SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
3615 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3616 0, 0, &vm_map_malloc_ranges, "A", "");
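/*
 * Illustrative user-space sketch (not part of this file): reading and parsing
 * the two "left:right" hexadecimal holes produced by vm.malloc_ranges above.
 * Note the handler does not NUL-terminate the string, so the reader must.
 */
#if 0
#include <inttypes.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

static int
print_malloc_holes(void)
{
	char str[128];
	size_t len = sizeof(str) - 1;
	uint64_t l1, r1, l2, r2;

	if (sysctlbyname("vm.malloc_ranges", str, &len, NULL, 0) == -1) {
		return -1;
	}
	str[len] = '\0';
	/* %x accepts the 0x prefix emitted by the handler */
	if (sscanf(str, "%" SCNx64 ":%" SCNx64 " %" SCNx64 ":%" SCNx64,
	    &l1, &r1, &l2, &r2) != 4) {
		return -1;
	}
	printf("hole 1: [0x%" PRIx64 ", 0x%" PRIx64 ")\n", l1, r1);
	printf("hole 2: [0x%" PRIx64 ", 0x%" PRIx64 ")\n", l2, r2);
	return 0;
}
#endif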
3617
3618 #if DEBUG || DEVELOPMENT
3619 static int
3620 vm_map_user_range_default SYSCTL_HANDLER_ARGS
3621 {
3622 #pragma unused(arg1, arg2, oidp)
3623 struct mach_vm_range range;
3624
3625 if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
3626 != KERN_SUCCESS) {
3627 return EINVAL;
3628 }
3629
3630 return SYSCTL_OUT(req, &range, sizeof(range));
3631 }
3632
3633 static int
3634 vm_map_user_range_heap SYSCTL_HANDLER_ARGS
3635 {
3636 #pragma unused(arg1, arg2, oidp)
3637 struct mach_vm_range range;
3638
3639 if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
3640 != KERN_SUCCESS) {
3641 return EINVAL;
3642 }
3643
3644 return SYSCTL_OUT(req, &range, sizeof(range));
3645 }
3646
3647 static int
3648 vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
3649 {
3650 #pragma unused(arg1, arg2, oidp)
3651 struct mach_vm_range range;
3652
3653 if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
3654 != KERN_SUCCESS) {
3655 return EINVAL;
3656 }
3657
3658 return SYSCTL_OUT(req, &range, sizeof(range));
3659 }
3660
3661 /*
3662 * A sysctl that can be used to return ranges for the current VM map.
3663 * Used for testing VM ranges.
3664 */
3665 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3666 0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
3667 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3668 0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
3669 SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3670 0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
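/*
 * Illustrative user-space sketch (not part of this file): reading one of the
 * "S,mach_vm_range" sysctls above. The local struct mirrors the kernel's
 * struct mach_vm_range as two 64-bit addresses; that layout is an assumption
 * of this sketch.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

struct my_vm_range {	/* hypothetical mirror of struct mach_vm_range */
	uint64_t min_address;
	uint64_t max_address;
};

static void
print_default_range(void)
{
	struct my_vm_range r;
	size_t len = sizeof(r);

	if (sysctlbyname("vm.vm_map_user_range_default", &r, &len, NULL, 0) == 0 &&
	    len == sizeof(r)) {
		printf("default range: [0x%llx, 0x%llx)\n",
		    (unsigned long long)r.min_address,
		    (unsigned long long)r.max_address);
	}
}
#endif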
3671
3672 #endif /* DEBUG || DEVELOPMENT */
3673 #endif /* CONFIG_MAP_RANGES */
3674
3678 extern uint64_t vm_map_range_overflows_count;
3679 SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
3680 extern boolean_t vm_map_range_overflows_log;
3681 SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");
3682
3683 extern uint64_t c_seg_filled_no_contention;
3684 extern uint64_t c_seg_filled_contention;
3685 extern clock_sec_t c_seg_filled_contention_sec_max;
3686 extern clock_nsec_t c_seg_filled_contention_nsec_max;
3687 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
3688 SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
3689 SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
3690 SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
3691 #if (XNU_TARGET_OS_OSX && __arm64__)
3692 extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
3693 extern int c_process_major_yield_after; /* yield after moving ? segments */
3694 extern uint64_t c_process_major_reports;
3695 extern clock_sec_t c_process_major_max_sec;
3696 extern clock_nsec_t c_process_major_max_nsec;
3697 extern uint32_t c_process_major_peak_segcount;
3698 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
3699 SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
3700 SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
3701 SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
3702 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
3703 SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
3704 #endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3705
3706 #if DEVELOPMENT || DEBUG
3707 extern int panic_object_not_alive;
3708 SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
3709 #endif /* DEVELOPMENT || DEBUG */
3710
3711 #if FBDP_DEBUG_OBJECT_NO_PAGER
3712 extern int fbdp_no_panic;
3713 SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
3714 #endif /* FBDP_DEBUG_OBJECT_NO_PAGER */
3715
3716 extern uint64_t cluster_direct_write_wired;
3717 SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");
3718
3719 extern uint64_t vm_object_pageout_not_on_queue;
3720 extern uint64_t vm_object_pageout_not_pageable;
3721 extern uint64_t vm_object_pageout_pageable;
3722 extern uint64_t vm_object_pageout_active_local;
3723 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_on_queue, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_on_queue, "");
3724 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_pageable, "");
3725 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_pageable, "");
3726 SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_active_local, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_active_local, "");
3727
3728
3729 #if DEVELOPMENT || DEBUG
3730
3731 static uint32_t
3732 sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
3733 {
3734 #if HAS_MTE
3735 if (with_data == VM_C_SERIALIZE_DATA_TAGS) {
3736 return VM_C_SEGMENT_INFO_MAGIC_WITH_TAGS;
3737 }
3738 #else
3739 #pragma unused(with_data)
3740 #endif /* HAS_MTE */
3741 return VM_C_SEGMENT_INFO_MAGIC;
3742 }
3743
3744 /* The largest possible single segment + its slots is
3745 * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
3746 #define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)
3747
3748 extern uint32_t c_segments_available;
3749
3750 struct sysctl_buf_header {
3751 uint32_t magic;
3752 } __attribute__((packed));
3753
3754 /* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
3755 * Instead of doing everything here, it calls a helper function in vm_compressor.c. */
3756 static int
3757 sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
3758 {
3759 char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
3760 if (!buf) {
3761 return ENOMEM;
3762 }
3763 size_t offset = 0;
3764 int error = 0;
3765 int segno = 0;
3766 /* 4 byte header to identify the version of the formatting of the data.
3767 * This should be incremented if c_segment_info or c_slot_info are changed */
3768 ((struct sysctl_buf_header*)buf)->magic = sysctl_compressor_seg_magic(with_data);
3769 offset += sizeof(uint32_t);
3770
3771 while (segno < c_segments_available) {
3772 size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
3773 kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
3774 if (kr == KERN_NO_SPACE) {
3775 /* failed to add another segment, push the current buffer out and try again */
3776 if (offset == 0) {
3777 error = EINVAL; /* no space to write but I didn't write anything, shouldn't really happen */
3778 goto out;
3779 }
3780 /* write out chunk */
3781 error = SYSCTL_OUT(req, buf, offset);
3782 if (error) {
3783 goto out;
3784 }
3785 offset = 0;
3786 bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3787 /* don't increment segno, need to try again saving the current one */
3788 } else if (kr != KERN_SUCCESS) {
3789 error = EINVAL;
3790 goto out;
3791 } else {
3792 offset += left_sz;
3793 ++segno;
3794 assert(offset <= SYSCTL_SEG_BUF_SIZE);
3795 }
3796 }
3797
3798 if (offset > 0) { /* write last chunk */
3799 error = SYSCTL_OUT(req, buf, offset);
3800 }
3801
3802 out:
3803 kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
3804 return error;
3805 }
3806
3807 static int
3808 sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3809 {
3810 return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
3811 }
3812 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
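/*
 * Illustrative user-space sketch (not part of this file): consuming the
 * stream produced by vm.compressor_segments. The handler emits a 4-byte
 * format magic followed by back-to-back segment/slot records, so a reader
 * probes for the size, pads the allocation (segments can come and go between
 * the probe and the read), and checks the magic before parsing; the expected
 * magic value would have to come from the kernel headers.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/sysctl.h>

static void *
read_compressor_segments(size_t *out_len)
{
	size_t len = 0;
	void *buf;
	uint32_t magic;

	/* probe for the current size, then pad generously */
	if (sysctlbyname("vm.compressor_segments", NULL, &len, NULL, 0) == -1) {
		return NULL;
	}
	len += len / 4 + 4096;
	if ((buf = malloc(len)) == NULL) {
		return NULL;
	}
	if (sysctlbyname("vm.compressor_segments", buf, &len, NULL, 0) == -1 ||
	    len < sizeof(magic)) {
		free(buf);
		return NULL;
	}
	memcpy(&magic, buf, sizeof(magic));	/* version/format header */
	printf("format magic 0x%x, %zu bytes of segment info\n", magic, len);
	*out_len = len;
	return buf;
}
#endif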
3813
3814 #if HAS_MTE
3815 static int
3816 sysctl_compressor_segments_data(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3817 {
3818 return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_TAGS);
3819 }
3820 SYSCTL_PROC(_vm, OID_AUTO, compressor_segments_data, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments_data, "S", "");
3821 #endif /* HAS_MTE */
3822
3823 extern uint32_t vm_compressor_fragmentation_level(void);
3824
3825 static int
3826 sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3827 {
3828 uint32_t value = vm_compressor_fragmentation_level();
3829 return SYSCTL_OUT(req, &value, sizeof(value));
3830 }
3831
3832 SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");
3833
3834 extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);
3835
3836 static int
3837 sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3838 {
3839 uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
3840 return SYSCTL_OUT(req, &value, sizeof(value));
3841 }
3842
3843 SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");
3844
3845
3846
3847 #define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)
3848
3849
3850 /* This sysctl iterates over all the entries of the vm_map of a given process and writes some info about the vm_object pointed to by each entry.
3851 * This can be used to map where all the pages of a process are located in the compressor.
3852 */
3853 static int
3854 sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
3855 {
3856 int error = 0;
3857 char *buf = NULL;
3858 proc_t p = PROC_NULL;
3859 task_t task = TASK_NULL;
3860 vm_map_t map = VM_MAP_NULL;
3861 __block size_t offset = 0;
3862
3863 /* go from pid to proc to task to vm_map. see sysctl_procargsx() for another example of this progression */
3864 int *name = arg1;
3865 int namelen = arg2;
3866 if (namelen < 1) {
3867 return EINVAL;
3868 }
3869 int pid = name[0];
3870 p = proc_find(pid); /* this increments a reference to the proc */
3871 if (p == PROC_NULL) {
3872 return EINVAL;
3873 }
3874 task = proc_task(p);
3875 proc_rele(p); /* decrement ref of proc */
3876 p = PROC_NULL;
3877 if (task == TASK_NULL) {
3878 return EINVAL;
3879 }
3880 /* convert proc reference to task reference */
3881 task_reference(task);
3882 /* task reference to map reference */
3883 map = get_task_map_reference(task);
3884 task_deallocate(task);
3885
3886 if (map == VM_MAP_NULL) {
3887 return EINVAL; /* nothing allocated yet */
3888 }
3889
3890 buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
3891 if (!buf) {
3892 error = ENOMEM;
3893 goto out;
3894 }
3895
3896 /* 4 byte header to identify the version of the formatting of the data.
3897 * This should be incremented if the vm_map entry info structures are changed */
3898 ((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
3899 offset += sizeof(uint32_t);
3900
3901 kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
3902 /* write the header, happens only once at the beginning so we should have enough space */
3903 assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3904 struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
3905 out_hdr->vmi_nentries = nentries;
3906 offset += sizeof(struct vm_map_info_hdr);
3907 return KERN_SUCCESS;
3908 };
3909
3910 kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
3911 while (true) { /* try up to 2 times: first try writing into the current buffer, otherwise flush and retry with a fresh one */
3912 size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
3913 kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
3914 if (kr == KERN_NO_SPACE) {
3915 /* failed to write anything, flush the current buffer and try again */
3916 if (offset == 0) {
3917 return KERN_FAILURE; /* no space to write but I didn't write anything yet, shouldn't really happen */
3918 }
3919 /* write out chunk */
3920 int out_error = SYSCTL_OUT(req, buf, offset);
3921 if (out_error) {
3922 return KERN_FAILURE;
3923 }
3924 offset = 0;
3925 bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
3926 continue; /* need to retry the entry dump again with the cleaned buffer */
3927 } else if (kr != KERN_SUCCESS) {
3928 return kr;
3929 }
3930 offset += left_sz;
3931 break;
3932 }
3933 return KERN_SUCCESS;
3934 };
3935
3936 /* this foreach first calls to the first callback with the number of entries, then calls the second for every entry
3937 * when the buffer is exhausted, it is flushed to the sysctl and restarted */
3938 kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);
3939
3940 if (kr != KERN_SUCCESS) {
3941 goto out;
3942 }
3943
3944 if (offset > 0) { /* last chunk */
3945 error = SYSCTL_OUT(req, buf, offset);
3946 }
3947
3948 out:
3949 if (buf != NULL) {
3950 kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
3951 }
3952 if (map != NULL) {
3953 vm_map_deallocate(map);
3954 }
3955 return error;
3956 }
3957
3958 SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
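/*
 * Illustrative user-space sketch (not part of this file): because
 * vm.task_vm_objects_slotmap is a CTLTYPE_NODE handler, the target pid is
 * passed as an extra MIB element, in the style of kern.procargs2. Buffer
 * sizing is left to the caller here.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

static int
read_slotmap(pid_t pid, void *buf, size_t *lenp)
{
	int mib[CTL_MAXNAME];
	size_t miblen = CTL_MAXNAME - 1;

	if (sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen) == -1) {
		return -1;
	}
	mib[miblen] = (int)pid;	/* becomes name[0] in the handler above */

	return sysctl(mib, (u_int)(miblen + 1), buf, lenp, NULL, 0);
}
#endif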
3959 static int
3960 sysctl_vm_reset_tag SYSCTL_HANDLER_ARGS
3961 {
3962 #pragma unused(oidp, arg1, arg2)
3963 int error;
3964 int tag;
3965 kern_return_t kr;
3966
3967 /* Need to be root */
3968 if (!kauth_cred_issuser(kauth_cred_get())) {
3969 return EPERM;
3970 }
3971
3972 error = SYSCTL_IN(req, &tag, sizeof(tag));
3973 if (error) {
3974 return error;
3975 }
3976
3977 if (tag < 0 || tag > VM_MAX_TAG_VALUE) {
3978 return EINVAL;
3979 }
3980
3981 kr = vm_tag_reset_peak((vm_tag_t)tag);
3982
3983 return mach_to_bsd_errno(kr);
3984 }
3985
3986 SYSCTL_PROC(_vm, OID_AUTO, reset_tag,
3987 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
3988 0, 0, &sysctl_vm_reset_tag, "I", "");
3989
3990 static int
3991 sysctl_vm_reset_all_tags SYSCTL_HANDLER_ARGS
3992 {
3993 #pragma unused(oidp, arg1, arg2)
3994 /* Only reset the values if the sysctl is a write */
3995 if (!req->newptr) {
3996 return EINVAL;
3997 }
3998
3999 /* Need to be root */
4000 if (!kauth_cred_issuser(kauth_cred_get())) {
4001 return EPERM;
4002 }
4003
4004 vm_tag_reset_all_peaks();
4005
4006 return 0;
4007 }
4008
4009 SYSCTL_PROC(_vm, OID_AUTO, reset_all_tags,
4010 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
4011 0, 0, &sysctl_vm_reset_all_tags, "I", "");
4012
4013 #endif /* DEVELOPMENT || DEBUG */
4014
4015 SYSCTL_NODE(_vm, OID_AUTO, compressor, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor");
4016
4017 SYSCTL_INT(_vm_compressor, OID_AUTO, mode, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_mode, 0, "");
4018 SYSCTL_INT(_vm_compressor, OID_AUTO, is_active, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_is_active, 0, "");
4019 SYSCTL_INT(_vm_compressor, OID_AUTO, is_available, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_compressor_available, 0, "");
4020 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed, CTLFLAG_RD | CTLFLAG_LOCKED,
4021 &c_segment_pages_compressed, 0, "The amount of uncompressed data stored in the compressor (in pages)");
4022 #if CONFIG_FREEZE
4023 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_incore, CTLFLAG_RD | CTLFLAG_LOCKED,
4024 &c_segment_pages_compressed_incore, 0, "The amount of uncompressed data stored in the in-core compressor (in pages)");
4025 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_incore_late_swapout, CTLFLAG_RD | CTLFLAG_LOCKED,
4026 &c_segment_pages_compressed_incore_late_swapout, 0, "The amount of uncompressed data stored in the in-core compressor and queued for swapout (in pages)");
4027 #endif
4028 SYSCTL_UINT(_vm_compressor, OID_AUTO, pages_compressed_limit, CTLFLAG_RD | CTLFLAG_LOCKED,
4029 &c_segment_pages_compressed_limit, 0, "The limit on the amount of uncompressed data the compressor will store (in pages)");
4030
4031 SYSCTL_NODE(_vm_compressor, OID_AUTO, segment, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Segment Counts");
4032 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, total, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_count, 0, "Number of allocated segments");
4033 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, aging, CTLFLAG_RD | CTLFLAG_LOCKED, &c_age_count, 0, "Number of aging segments");
4034 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_early, CTLFLAG_RD | CTLFLAG_LOCKED, &c_early_swappedin_count, 0, "Number of (early) swapped-in segments");
4035 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_regular, CTLFLAG_RD | CTLFLAG_LOCKED, &c_regular_swappedin_count, 0, "Number of (regular) swapped-in segments");
4036 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedin_late, CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swappedin_count, 0, "Number of (late) swapped-in segments");
4037 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_early, CTLFLAG_RD | CTLFLAG_LOCKED, &c_early_swapout_count, 0, "Number of (early) ready-to-swap segments");
4038 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_regular, CTLFLAG_RD | CTLFLAG_LOCKED, &c_regular_swapout_count, 0, "Number of (regular) ready-to-swap segments");
4039 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapout_late, CTLFLAG_RD | CTLFLAG_LOCKED, &c_late_swapout_count, 0, "Number of (late) ready-to-swap segments");
4040 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swapio, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swapio_count, 0, "Number of swapping-out segments");
4041 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedout, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swappedout_count, 0, "Number of (non-sparse) swapped-out segments");
4042 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, swappedout_sparse, CTLFLAG_RD | CTLFLAG_LOCKED, &c_swappedout_sparse_count, 0, "Number of (sparse) swapped-out segments");
4043 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, majorcompact, CTLFLAG_RD | CTLFLAG_LOCKED, &c_major_count, 0, "Number of recently-compacted segments");
4044 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, minorcompact, CTLFLAG_RD | CTLFLAG_LOCKED, &c_minor_count, 0, "Number of segments queued for deferred minor compaction");
4045 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, filling, CTLFLAG_RD | CTLFLAG_LOCKED, &c_filling_count, 0, "Number of filling segments");
4046 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, empty, CTLFLAG_RD | CTLFLAG_LOCKED, &c_empty_count, 0, "Number of empty segments");
4047 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, bad, CTLFLAG_RD | CTLFLAG_LOCKED, &c_bad_count, 0, "Number of bad segments");
4048 SYSCTL_UINT(_vm_compressor_segment, OID_AUTO, limit, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segments_limit, 0, "Limit on the number of allocated segments");
4049
4050 SYSCTL_NODE(_vm_compressor, OID_AUTO, svp, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Single-Value");
4051 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, in_hash, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_in_hash, 0, "");
4052 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, hash_succeeded, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_hash_succeeded, 0, "");
4053 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, hash_failed, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_hash_failed, 0, "");
4054 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, zval_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_zero_compressions, 0, "");
4055 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, zval_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_zero_decompressions, 0, "");
4056 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, nzval_compressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_nonzero_compressions, 0, "");
4057 SYSCTL_UINT(_vm_compressor_svp, OID_AUTO, nzval_decompressions, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_svp_nonzero_decompressions, 0, "");
4058
4059 SYSCTL_NODE(_vm_compressor, OID_AUTO, compactor, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "VM Compressor Compactor");
4060 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_completed, CTLFLAG_RD | CTLFLAG_LOCKED,
4061 &vm_pageout_vminfo.vm_compactor_major_compactions_completed, "Major compactions completed");
4062 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_considered, CTLFLAG_RD | CTLFLAG_LOCKED,
4063 &vm_pageout_vminfo.vm_compactor_major_compactions_considered, "Major compactions considered");
4064 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compactions_bailed, CTLFLAG_RD | CTLFLAG_LOCKED,
4065 &vm_pageout_vminfo.vm_compactor_major_compactions_bailed, "Major compactions bailed (due to contention)");
4066 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_bytes_moved, CTLFLAG_RD | CTLFLAG_LOCKED,
4067 &vm_pageout_vminfo.vm_compactor_major_compaction_bytes_moved, "Bytes moved between segments during major compactions");
4068 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_slots_moved, CTLFLAG_RD | CTLFLAG_LOCKED,
4069 &vm_pageout_vminfo.vm_compactor_major_compaction_slots_moved, "Slots moved between segments during major compactions");
4070 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_bytes_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
4071 &vm_pageout_vminfo.vm_compactor_major_compaction_bytes_freed, "Bytes freed as a result of major compaction");
4072 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, major_compaction_segments_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
4073 &vm_pageout_vminfo.vm_compactor_major_compaction_segments_freed, "Segments freed as a result of major compaction");
4074 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, swapouts_queued, CTLFLAG_RD | CTLFLAG_LOCKED,
4075 &vm_pageout_vminfo.vm_compactor_swapouts_queued, "The number of segments queued for swapout after a major compaction");
4076 SYSCTL_QUAD(_vm_compressor_compactor, OID_AUTO, swapout_bytes_wasted, CTLFLAG_RD | CTLFLAG_LOCKED,
4077 &vm_pageout_vminfo.vm_compactor_swapout_bytes_wasted, "The number of unused bytes in segments queued for swapout");
4078