1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Mach Operating System
30 * Copyright (c) 1987 Carnegie-Mellon University
31 * All rights reserved. The CMU software License Agreement specifies
32 * the terms and conditions for use and redistribution.
33 */
34 /*
35 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
36 * support for mandatory and extensible security protections. This notice
37 * is included in support of clause 2.2 (b) of the Apple Public License,
38 * Version 2.0.
39 */
40 #include <vm/vm_options.h>
41
42 #include <kern/ecc.h>
43 #include <kern/task.h>
44 #include <kern/thread.h>
45 #include <kern/debug.h>
46 #include <kern/extmod_statistics.h>
47 #include <mach/mach_traps.h>
48 #include <mach/port.h>
49 #include <mach/sdt.h>
50 #include <mach/task.h>
51 #include <mach/task_access.h>
52 #include <mach/task_special_ports.h>
53 #include <mach/time_value.h>
54 #include <mach/vm_map.h>
55 #include <mach/vm_param.h>
56 #include <mach/vm_prot.h>
57 #include <machine/machine_routines.h>
58
59 #include <sys/file_internal.h>
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/dir.h>
63 #include <sys/namei.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/vm.h>
67 #include <sys/file.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/mount.h>
70 #include <sys/xattr.h>
71 #include <sys/trace.h>
72 #include <sys/kernel.h>
73 #include <sys/ubc_internal.h>
74 #include <sys/user.h>
75 #include <sys/syslog.h>
76 #include <sys/stat.h>
77 #include <sys/sysproto.h>
78 #include <sys/mman.h>
79 #include <sys/sysctl.h>
80 #include <sys/cprotect.h>
81 #include <sys/kpi_socket.h>
82 #include <sys/kas_info.h>
83 #include <sys/socket.h>
84 #include <sys/socketvar.h>
85 #include <sys/random.h>
86 #include <sys/code_signing.h>
87 #if NECP
88 #include <net/necp.h>
89 #endif /* NECP */
90 #if SKYWALK
91 #include <skywalk/os_channel.h>
92 #endif /* SKYWALK */
93
94 #include <security/audit/audit.h>
95 #include <security/mac.h>
96 #include <bsm/audit_kevents.h>
97
98 #include <kern/kalloc.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_kern_xnu.h>
101 #include <vm/vm_pageout_xnu.h>
102
103 #include <mach/shared_region.h>
104 #include <vm/vm_shared_region_internal.h>
105
106 #include <vm/vm_dyld_pager_internal.h>
107 #include <vm/vm_protos_internal.h>
108 #if DEVELOPMENT || DEBUG
109 #include <vm/vm_compressor_info.h> /* for c_segment_info */
110 #include <vm/vm_compressor_xnu.h> /* for vm_compressor_serialize_segment_debug_info() */
111 #endif
112 #include <vm/vm_reclaim_xnu.h>
113
114 #include <sys/kern_memorystatus.h>
115 #include <sys/kern_memorystatus_freeze.h>
116 #include <sys/proc_internal.h>
117
118 #include <mach-o/fixup-chains.h>
119
120 #if CONFIG_MACF
121 #include <security/mac_framework.h>
122 #endif
123
124 #include <kern/bits.h>
125
126 #if CONFIG_CSR
127 #include <sys/csr.h>
128 #endif /* CONFIG_CSR */
129 #include <sys/trust_caches.h>
130 #include <libkern/amfi/amfi.h>
131 #include <IOKit/IOBSD.h>
132
133 #if VM_MAP_DEBUG_APPLE_PROTECT
134 SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
135 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
136
137 #if DEVELOPMENT || DEBUG
138
139 static int
140 sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
141 {
142 #pragma unused(arg1, arg2)
143 vm_offset_t kaddr;
144 kern_return_t kr;
145 int error = 0;
146 int size = 0;
147
148 error = sysctl_handle_int(oidp, &size, 0, req);
149 if (error || !req->newptr) {
150 return error;
151 }
152
153 kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
154 0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);
155
156 if (kr == KERN_SUCCESS) {
157 kmem_free(kernel_map, kaddr, size);
158 }
159
160 return error;
161 }
162
163 SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
164 0, 0, &sysctl_kmem_alloc_contig, "I", "");
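/*
 * Illustrative usage (hypothetical test code, not part of this file):
 * on a DEVELOPMENT/DEBUG kernel, writing a size to this write-only
 * sysctl triggers a kmem_alloc_contig()/kmem_free() round trip.
 *
 *     #include <sys/sysctl.h>
 *
 *     int size = 4 * 4096;
 *     sysctlbyname("vm.kmem_alloc_contig", NULL, NULL, &size, sizeof(size));
 */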
165
166 extern int vm_region_footprint;
167 SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");
168
169 static int
170 sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
171 {
172 #pragma unused(arg1, arg2, oidp)
173 kmem_gobj_stats stats = kmem_get_gobj_stats();
174
175 return SYSCTL_OUT(req, &stats, sizeof(stats));
176 }
177
178 SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
179 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
180 0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");
181
182 #endif /* DEVELOPMENT || DEBUG */
183
184 static int
185 sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
186 {
187 #pragma unused(arg1, arg2, oidp)
188 int error = 0;
189 int value;
190
191 value = task_self_region_footprint();
192 error = SYSCTL_OUT(req, &value, sizeof(int));
193 if (error) {
194 return error;
195 }
196
197 if (!req->newptr) {
198 return 0;
199 }
200
201 error = SYSCTL_IN(req, &value, sizeof(int));
202 if (error) {
203 return error;
204 }
205 task_self_region_footprint_set(value);
206 return 0;
207 }
208 SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");
209
210 static int
211 sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
212 {
213 #pragma unused(arg1, arg2, oidp)
214 int error = 0;
215 int value;
216
217 value = (1 << thread_self_region_page_shift());
218 error = SYSCTL_OUT(req, &value, sizeof(int));
219 if (error) {
220 return error;
221 }
222
223 if (!req->newptr) {
224 return 0;
225 }
226
227 error = SYSCTL_IN(req, &value, sizeof(int));
228 if (error) {
229 return error;
230 }
231
232 if (value != 0 && value != 4096 && value != 16384) {
233 return EINVAL;
234 }
235
236 #if !__ARM_MIXED_PAGE_SIZE__
237 if (value != vm_map_page_size(current_map())) {
238 return EINVAL;
239 }
240 #endif /* !__ARM_MIXED_PAGE_SIZE__ */
241
242 thread_self_region_page_shift_set(bit_first(value));
243 return 0;
244 }
245 SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
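/*
 * Usage sketch (hypothetical caller): a process can change the page
 * granularity used when reporting its own regions, e.g. to mimic a
 * 16KB-page task from a 4KB-page one:
 *
 *     int value = 16384;    // handler accepts only 0, 4096 or 16384
 *     sysctlbyname("vm.self_region_page_size", NULL, NULL, &value, sizeof(value));
 */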
246
247 static int
248 sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
249 {
250 #pragma unused(arg1, arg2, oidp)
251 int error = 0;
252 int value;
253 kern_return_t kr;
254
255 value = task_self_region_info_flags();
256 error = SYSCTL_OUT(req, &value, sizeof(int));
257 if (error) {
258 return error;
259 }
260
261 if (!req->newptr) {
262 return 0;
263 }
264
265 error = SYSCTL_IN(req, &value, sizeof(int));
266 if (error) {
267 return error;
268 }
269
270 kr = task_self_region_info_flags_set(value);
271 if (kr != KERN_SUCCESS) {
272 return EINVAL;
273 }
274
275 return 0;
276 }
277 SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");
278
279
280 #if DEVELOPMENT || DEBUG
281 extern int panic_on_unsigned_execute;
282 SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");
283
284 extern int vm_log_xnu_user_debug;
285 SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
286 #endif /* DEVELOPMENT || DEBUG */
287
288 extern int vm_log_map_delete_permanent_prot_none;
289 SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");
290
291 extern int cs_executable_create_upl;
292 extern int cs_executable_wire;
293 SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
294 SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");
295
296 extern int apple_protect_pager_count;
297 extern int apple_protect_pager_count_mapped;
298 extern unsigned int apple_protect_pager_cache_limit;
299 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
300 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
301 SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");
302
303 #if DEVELOPMENT || DEBUG
304 extern int radar_20146450;
305 SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");
306
307 extern int macho_printf;
308 SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");
309
310 extern int apple_protect_pager_data_request_debug;
311 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");
312
313 extern unsigned int vm_object_copy_delayed_paging_wait_disable;
314 EXPERIMENT_FACTOR_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");
315
316 #if __arm64__
317 /* These are meant to support the page table accounting unit test. */
318 extern unsigned int arm_hardware_page_size;
319 extern unsigned int arm_pt_desc_size;
320 extern unsigned int arm_pt_root_size;
321 extern unsigned int inuse_user_tteroot_count;
322 extern unsigned int inuse_kernel_tteroot_count;
323 extern unsigned int inuse_user_ttepages_count;
324 extern unsigned int inuse_kernel_ttepages_count;
325 extern unsigned int inuse_user_ptepages_count;
326 extern unsigned int inuse_kernel_ptepages_count;
327 SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
328 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
329 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
330 SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
331 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
332 SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
333 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
334 SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
335 SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
336 #if !CONFIG_SPTM
337 extern unsigned int free_page_size_tt_count;
338 extern unsigned int free_tt_count;
339 SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
340 SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
341 #endif
342 #if DEVELOPMENT || DEBUG
343 extern unsigned long pmap_asid_flushes;
344 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
345 extern unsigned long pmap_asid_hits;
346 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
347 extern unsigned long pmap_asid_misses;
348 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
349 extern unsigned long pmap_speculation_restrictions;
350 SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
351 #endif
352 #endif /* __arm64__ */
353 #endif /* DEVELOPMENT || DEBUG */
354
355 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
356 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
357 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
358 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
359 SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
360 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
361 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
362 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
363 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
364 #if VM_SCAN_FOR_SHADOW_CHAIN
365 static int vm_shadow_max_enabled = 0; /* Disabled by default */
366 extern int proc_shadow_max(void);
367 static int
368 vm_shadow_max SYSCTL_HANDLER_ARGS
369 {
370 #pragma unused(arg1, arg2, oidp)
371 int value = 0;
372
373 if (vm_shadow_max_enabled) {
374 value = proc_shadow_max();
375 }
376
377 return SYSCTL_OUT(req, &value, sizeof(value));
378 }
379 SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
380 0, 0, &vm_shadow_max, "I", "");
381
382 SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");
383
384 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
385
386 SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");
387
388 #if PAGE_SLEEP_WITH_INHERITOR
389 #if DEVELOPMENT || DEBUG
390 extern uint32_t page_worker_table_size;
391 SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
392 SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
393 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
394 SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
395 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
396 #endif /* DEVELOPMENT || DEBUG */
397 #endif /* PAGE_SLEEP_WITH_INHERITOR */
398
399 /*
400  * Sysctls related to data/stack execution.  See osfmk/vm/vm_map.c
401 */
402
403 #if DEVELOPMENT || DEBUG
404 extern int allow_stack_exec, allow_data_exec;
405
406 SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
407 SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");
408
409 #endif /* DEVELOPMENT || DEBUG */
410
411 static const char *prot_values[] = {
412 "none",
413 "read-only",
414 "write-only",
415 "read-write",
416 "execute-only",
417 "read-execute",
418 "write-execute",
419 "read-write-execute"
420 };
421
422 void
423 log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
424 {
425 printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
426 current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
427 }
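/*
 * Example: a fault with (VM_PROT_READ | VM_PROT_WRITE) indexes
 * prot_values[3] and logs "read-write"; masking with VM_PROT_ALL keeps
 * the index within the 8-entry table above.
 */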
428
429 /*
430 * shared_region_unnest_logging: level of logging of unnesting events
431 * 0 - no logging
432 * 1 - throttled logging of unexpected unnesting events (default)
433 * 2 - unthrottled logging of unexpected unnesting events
434 * 3+ - unthrottled logging of all unnesting events
435 */
436 int shared_region_unnest_logging = 1;
437
438 SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
439 &shared_region_unnest_logging, 0, "");
440
441 int vm_shared_region_unnest_log_interval = 10;
442 int shared_region_unnest_log_count_threshold = 5;
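/*
 * Example of the throttle implemented in log_unnest_badness() below:
 * with the defaults above and shared_region_unnest_logging <= 1, only
 * about shared_region_unnest_log_count_threshold (5) messages are
 * printed per vm_shared_region_unnest_log_interval (10 second) window.
 */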
443
444
445 #if XNU_TARGET_OS_OSX
446
447 #if defined (__x86_64__)
448 static int scdir_enforce = 1;
449 #else /* defined (__x86_64__) */
450 static int scdir_enforce = 0; /* AOT caches live elsewhere */
451 #endif /* defined (__x86_64__) */
452
453 static char *scdir_path[] = {
454 "/System/Library/dyld/",
455 "/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
456 "/System/Cryptexes/OS/System/Library/dyld",
457 NULL
458 };
459
460 #else /* XNU_TARGET_OS_OSX */
461
462 static int scdir_enforce = 0;
463 static char *scdir_path[] = {
464 "/System/Library/Caches/com.apple.dyld/",
465 "/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
466 "/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
467 NULL
468 };
469
470 #endif /* XNU_TARGET_OS_OSX */
471
472 static char *driverkit_scdir_path[] = {
473 "/System/DriverKit/System/Library/dyld/",
474 #if XNU_TARGET_OS_OSX
475 "/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
476 #else
477 "/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
478 #endif /* XNU_TARGET_OS_OSX */
479 "/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
480 NULL
481 };
482
483 #ifndef SECURE_KERNEL
484 static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
485 {
486 #if CONFIG_CSR
487 if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
488 printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
489 return EPERM;
490 }
491 #endif /* CONFIG_CSR */
492 return sysctl_handle_int(oidp, arg1, arg2, req);
493 }
494
495 SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
496 #endif
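/*
 * Example: on a macOS system with SIP's filesystem protections active,
 *
 *     sysctl -w vm.enforce_shared_cache_dir=0
 *
 * fails with EPERM because csr_check(CSR_ALLOW_UNRESTRICTED_FS) rejects
 * the write; otherwise the handler falls through to sysctl_handle_int().
 */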
497
498 /* These log rate throttling state variables aren't thread safe, but
499 * are sufficient unto the task.
500 */
501 static int64_t last_unnest_log_time = 0;
502 static int shared_region_unnest_log_count = 0;
503
504 void
505 log_unnest_badness(
506 vm_map_t m,
507 vm_map_offset_t s,
508 vm_map_offset_t e,
509 boolean_t is_nested_map,
510 vm_map_offset_t lowest_unnestable_addr)
511 {
512 struct timeval tv;
513
514 if (shared_region_unnest_logging == 0) {
515 return;
516 }
517
518 if (shared_region_unnest_logging <= 2 &&
519 is_nested_map &&
520 s >= lowest_unnestable_addr) {
521 /*
522 * Unnesting of writable map entries is fine.
523 */
524 return;
525 }
526
527 if (shared_region_unnest_logging <= 1) {
528 microtime(&tv);
529 if ((tv.tv_sec - last_unnest_log_time) <
530 vm_shared_region_unnest_log_interval) {
531 if (shared_region_unnest_log_count++ >
532 shared_region_unnest_log_count_threshold) {
533 return;
534 }
535 } else {
536 last_unnest_log_time = tv.tv_sec;
537 shared_region_unnest_log_count = 0;
538 }
539 }
540
541 DTRACE_VM4(log_unnest_badness,
542 vm_map_t, m,
543 vm_map_offset_t, s,
544 vm_map_offset_t, e,
545 vm_map_offset_t, lowest_unnestable_addr);
546 printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
547 }
548
549 uint64_t
550 vm_purge_filebacked_pagers(void)
551 {
552 uint64_t pages_purged;
553
554 pages_purged = 0;
555 pages_purged += apple_protect_pager_purge_all();
556 pages_purged += shared_region_pager_purge_all();
557 pages_purged += dyld_pager_purge_all();
558 #if DEVELOPMENT || DEBUG
559 printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
560 #endif /* DEVELOPMENT || DEBUG */
561 return pages_purged;
562 }
563
564 int
565 useracc(
566 user_addr_ut addr_u,
567 user_size_ut len_u,
568 int prot)
569 {
570 vm_map_t map;
571 vm_prot_t vm_prot = VM_PROT_WRITE;
572
573 map = current_map();
574
575 if (prot == B_READ) {
576 vm_prot = VM_PROT_READ;
577 }
578
579 return vm_map_check_protection(map, addr_u,
580 vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
581 VM_SANITIZE_CALLER_USERACC);
582 }
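/*
 * Usage sketch (hypothetical caller, with made-up uaddr/len): a driver
 * validating a user buffer before writing into it checks for write
 * access, since any prot other than B_READ is treated as a write check:
 *
 *     if (!useracc(uaddr, len, B_WRITE)) {
 *         return EFAULT;
 *     }
 */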
583
584 #if XNU_PLATFORM_MacOSX
585 static __attribute__((always_inline, warn_unused_result))
586 kern_return_t
587 vslock_sanitize(
588 vm_map_t map,
589 user_addr_ut addr_u,
590 user_size_ut len_u,
591 vm_sanitize_caller_t vm_sanitize_caller,
592 vm_map_offset_t *start,
593 vm_map_offset_t *end,
594 vm_map_size_t *size)
595 {
596 return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
597 map,
598 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
599 size);
600 }
601 #endif /* XNU_PLATFORM_MacOSX */
602
603 int
604 vslock(user_addr_ut addr, user_size_ut len)
605 {
606 kern_return_t kret;
607
608 #if XNU_PLATFORM_MacOSX
609 /*
610 	 * Preserve previous macOS behavior for overflows, for binary
611 	 * compatibility: return success for overflows without doing
612 	 * anything. For error compatibility, overflow errors yield
613 	 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr()
614 	 * converts to KERN_SUCCESS.
615 */
616 vm_map_offset_t start, end;
617 vm_map_size_t size;
618
619 kret = vslock_sanitize(current_map(),
620 addr,
621 len,
622 VM_SANITIZE_CALLER_VSLOCK,
623 &start,
624 &end,
625 &size);
626 if (__improbable(kret != KERN_SUCCESS)) {
627 switch (vm_sanitize_get_kr(kret)) {
628 case KERN_SUCCESS:
629 return 0;
630 case KERN_INVALID_ADDRESS:
631 case KERN_NO_SPACE:
632 return ENOMEM;
633 case KERN_PROTECTION_FAILURE:
634 return EACCES;
635 default:
636 return EINVAL;
637 }
638 }
639 #endif /* XNU_PLATFORM_MacOSX */
640
641 kret = vm_map_wire_kernel(current_map(), addr,
642 vm_sanitize_compute_ut_end(addr, len),
643 vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
644 VM_KERN_MEMORY_BSD,
645 FALSE);
646
647 switch (kret) {
648 case KERN_SUCCESS:
649 return 0;
650 case KERN_INVALID_ADDRESS:
651 case KERN_NO_SPACE:
652 return ENOMEM;
653 case KERN_PROTECTION_FAILURE:
654 return EACCES;
655 default:
656 return EINVAL;
657 }
658 }
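/*
 * Pairing sketch (hypothetical kernel caller, with made-up uaddr/len):
 * physio-style code wires a user buffer around an I/O operation and
 * unwires it afterwards:
 *
 *     if ((error = vslock(uaddr, len)) != 0) {
 *         return error;                // ENOMEM/EACCES/EINVAL as above
 *     }
 *     // ... transfer data into the wired range ...
 *     error = vsunlock(uaddr, len, 1); // 1 == pages were dirtied
 */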
659
660 int
661 vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
662 {
663 #if FIXME /* [ */
664 pmap_t pmap;
665 vm_page_t pg;
666 vm_map_offset_t vaddr;
667 ppnum_t paddr;
668 #endif /* FIXME ] */
669 kern_return_t kret;
670 vm_map_t map;
671
672 map = current_map();
673
674 #if FIXME /* [ */
675 if (dirtied) {
676 pmap = get_task_pmap(current_task());
677 for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
678 vaddr < vm_map_round_page(addr + len, PAGE_MASK);
679 vaddr += PAGE_SIZE) {
680 paddr = pmap_find_phys(pmap, vaddr);
681 pg = PHYS_TO_VM_PAGE(paddr);
682 vm_page_set_modified(pg);
683 }
684 }
685 #endif /* FIXME ] */
686 #ifdef lint
687 dirtied++;
688 #endif /* lint */
689
690 #if XNU_PLATFORM_MacOSX
691 /*
692 	 * Preserve previous macOS behavior for overflows, for binary
693 	 * compatibility: return success for overflows without doing
694 	 * anything. For error compatibility, overflow errors yield
695 	 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr()
696 	 * converts to KERN_SUCCESS.
697 */
698 vm_map_offset_t start, end;
699 vm_map_size_t size;
700
701 kret = vslock_sanitize(map,
702 addr,
703 len,
704 VM_SANITIZE_CALLER_VSUNLOCK,
705 &start,
706 &end,
707 &size);
708 if (__improbable(kret != KERN_SUCCESS)) {
709 switch (vm_sanitize_get_kr(kret)) {
710 case KERN_SUCCESS:
711 return 0;
712 case KERN_INVALID_ADDRESS:
713 case KERN_NO_SPACE:
714 return ENOMEM;
715 case KERN_PROTECTION_FAILURE:
716 return EACCES;
717 default:
718 return EINVAL;
719 }
720 }
721 #endif /* XNU_PLATFORM_MacOSX */
722
723 kret = vm_map_unwire(map, addr,
724 vm_sanitize_compute_ut_end(addr, len), false);
725 switch (kret) {
726 case KERN_SUCCESS:
727 return 0;
728 case KERN_INVALID_ADDRESS:
729 case KERN_NO_SPACE:
730 return ENOMEM;
731 case KERN_PROTECTION_FAILURE:
732 return EACCES;
733 default:
734 return EINVAL;
735 }
736 }
737
738 int
739 subyte(
740 user_addr_t addr,
741 int byte)
742 {
743 char character;
744
745 character = (char)byte;
746 return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
747 }
748
749 int
750 suibyte(
751 user_addr_t addr,
752 int byte)
753 {
754 char character;
755
756 character = (char)byte;
757 return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
758 }
759
760 int
761 fubyte(user_addr_t addr)
762 {
763 unsigned char byte;
764
765 if (copyin(addr, (void *) &byte, sizeof(char))) {
766 return -1;
767 }
768 return byte;
769 }
770
771 int
772 fuibyte(user_addr_t addr)
773 {
774 unsigned char byte;
775
776 if (copyin(addr, (void *) &(byte), sizeof(char))) {
777 return -1;
778 }
779 return byte;
780 }
781
782 int
783 suword(
784 user_addr_t addr,
785 long word)
786 {
787 return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
788 }
789
790 long
791 fuword(user_addr_t addr)
792 {
793 long word = 0;
794
795 if (copyin(addr, (void *) &word, sizeof(int))) {
796 return -1;
797 }
798 return word;
799 }
800
801 /* suiword and fuiword are the same as suword and fuword, respectively */
802
803 int
804 suiword(
805 user_addr_t addr,
806 long word)
807 {
808 return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
809 }
810
811 long
812 fuiword(user_addr_t addr)
813 {
814 long word = 0;
815
816 if (copyin(addr, (void *) &word, sizeof(int))) {
817 return -1;
818 }
819 return word;
820 }
821
822 /*
823 * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
824 * fetching and setting of process-sized size_t and pointer values.
825 */
826 int
827 sulong(user_addr_t addr, int64_t word)
828 {
829 if (IS_64BIT_PROCESS(current_proc())) {
830 return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
831 } else {
832 return suiword(addr, (long)word);
833 }
834 }
835
836 int64_t
837 fulong(user_addr_t addr)
838 {
839 int64_t longword;
840
841 if (IS_64BIT_PROCESS(current_proc())) {
842 if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
843 return -1;
844 }
845 return longword;
846 } else {
847 return (int64_t)fuiword(addr);
848 }
849 }
850
851 int
852 suulong(user_addr_t addr, uint64_t uword)
853 {
854 if (IS_64BIT_PROCESS(current_proc())) {
855 return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
856 } else {
857 return suiword(addr, (uint32_t)uword);
858 }
859 }
860
861 uint64_t
862 fuulong(user_addr_t addr)
863 {
864 uint64_t ulongword;
865
866 if (IS_64BIT_PROCESS(current_proc())) {
867 if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
868 return -1ULL;
869 }
870 return ulongword;
871 } else {
872 return (uint64_t)fuiword(addr);
873 }
874 }
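/*
 * Example: for a 64-bit process, suulong() stores all 8 bytes of
 * "uword", while for a 32-bit process it falls back to suiword(),
 * which stores only sizeof(int) bytes, matching the process's native
 * long width. fulong()/fuulong() mirror this on the fetch side.
 */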
875
876 int
877 swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
878 {
879 return ENOTSUP;
880 }
881
882 #if defined(SECURE_KERNEL)
883 static int kern_secure_kernel = 1;
884 #else
885 static int kern_secure_kernel = 0;
886 #endif
887
888 SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
889 SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
890 &shared_region_trace_level, 0, "");
891 SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
892 &shared_region_version, 0, "");
893 SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
894 &shared_region_persistence, 0, "");
895
896 /*
897 * shared_region_check_np:
898 *
899 * This system call is intended for dyld.
900 *
901 * dyld calls this when any process starts to see if the process's shared
902 * region is already set up and ready to use.
903  * This call returns the base address of the first mapping in the
904  * process's shared region.
905 * dyld will then check what's mapped at that address.
906 *
907 * If the shared region is empty, dyld will then attempt to map the shared
908 * cache file in the shared region via the shared_region_map_np() system call.
909 *
910 * If something's already mapped in the shared region, dyld will check if it
911 * matches the shared cache it would like to use for that process.
912  * If it matches, everything's ready and the process can proceed and use the
913 * shared region.
914 * If it doesn't match, dyld will unmap the shared region and map the shared
915 * cache into the process's address space via mmap().
916 *
917 * A NULL pointer argument can be used by dyld to indicate it has unmapped
918 * the shared region. We will remove the shared_region reference from the task.
919 *
920 * ERROR VALUES
921 * EINVAL no shared region
922 * ENOMEM shared region is empty
923 * EFAULT bad address for "start_address"
924 */
925 int
926 shared_region_check_np(
927 __unused struct proc *p,
928 struct shared_region_check_np_args *uap,
929 __unused int *retvalp)
930 {
931 vm_shared_region_t shared_region;
932 mach_vm_offset_t start_address = 0;
933 int error = 0;
934 kern_return_t kr;
935 task_t task = current_task();
936
937 SHARED_REGION_TRACE_DEBUG(
938 ("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
939 (void *)VM_KERNEL_ADDRPERM(current_thread()),
940 proc_getpid(p), p->p_comm,
941 (uint64_t)uap->start_address));
942
943 /*
944 * Special value of start_address used to indicate that map_with_linking() should
945 * no longer be allowed in this process
946 */
947 if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
948 p->p_disallow_map_with_linking = TRUE;
949 return 0;
950 }
951
952 	/* retrieve the current task's shared region */
953 shared_region = vm_shared_region_get(task);
954 if (shared_region != NULL) {
955 /*
956 * A NULL argument is used by dyld to indicate the task
957 * has unmapped its shared region.
958 */
959 if (uap->start_address == 0) {
960 /* unmap it first */
961 vm_shared_region_remove(task, shared_region);
962 vm_shared_region_set(task, NULL);
963 } else {
964 /* retrieve address of its first mapping... */
965 kr = vm_shared_region_start_address(shared_region, &start_address, task);
966 if (kr != KERN_SUCCESS) {
967 SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
968 "check_np(0x%llx) "
969 "vm_shared_region_start_address() failed\n",
970 (void *)VM_KERNEL_ADDRPERM(current_thread()),
971 proc_getpid(p), p->p_comm,
972 (uint64_t)uap->start_address));
973 error = ENOMEM;
974 } else {
975 #if __has_feature(ptrauth_calls)
976 /*
977 * Remap any section of the shared library that
978 * has authenticated pointers into private memory.
979 */
980 if (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS) {
981 SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
982 "check_np(0x%llx) "
983 "vm_shared_region_auth_remap() failed\n",
984 (void *)VM_KERNEL_ADDRPERM(current_thread()),
985 proc_getpid(p), p->p_comm,
986 (uint64_t)uap->start_address));
987 error = ENOMEM;
988 }
989 #endif /* __has_feature(ptrauth_calls) */
990
991 /* ... and give it to the caller */
992 if (error == 0) {
993 error = copyout(&start_address,
994 (user_addr_t) uap->start_address,
995 sizeof(start_address));
996 if (error != 0) {
997 SHARED_REGION_TRACE_ERROR(
998 ("shared_region: %p [%d(%s)] "
999 "check_np(0x%llx) "
1000 "copyout(0x%llx) error %d\n",
1001 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1002 proc_getpid(p), p->p_comm,
1003 (uint64_t)uap->start_address, (uint64_t)start_address,
1004 error));
1005 }
1006 }
1007 }
1008 }
1009 vm_shared_region_deallocate(shared_region);
1010 } else {
1011 		/* no shared region! */
1012 error = EINVAL;
1013 }
1014
1015 SHARED_REGION_TRACE_DEBUG(
1016 ("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
1017 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1018 proc_getpid(p), p->p_comm,
1019 (uint64_t)uap->start_address, (uint64_t)start_address, error));
1020
1021 return error;
1022 }
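/*
 * Caller sketch (hypothetical, mirroring how dyld uses this syscall):
 *
 *     uint64_t base = 0;
 *     if (__shared_region_check_np(&base) == 0) {
 *         // region populated: validate the shared cache mapped at "base"
 *     } else {
 *         // EINVAL/ENOMEM: map the shared cache via shared_region_map_np()
 *     }
 */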
1023
1024
1025 static int
1026 shared_region_copyin(
1027 struct proc *p,
1028 user_addr_t user_addr,
1029 unsigned int count,
1030 unsigned int element_size,
1031 void *kernel_data)
1032 {
1033 int error = 0;
1034 vm_size_t size = count * element_size;
1035
1036 error = copyin(user_addr, kernel_data, size);
1037 if (error) {
1038 SHARED_REGION_TRACE_ERROR(
1039 ("shared_region: %p [%d(%s)] map(): "
1040 "copyin(0x%llx, %ld) failed (error=%d)\n",
1041 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1042 proc_getpid(p), p->p_comm,
1043 (uint64_t)user_addr, (long)size, error));
1044 }
1045 return error;
1046 }
1047
1048 /*
1049 * A reasonable upper limit to prevent overflow of allocation/copyin.
1050 */
1051 #define _SR_FILE_MAPPINGS_MAX_FILES 256
1052
1053 /* forward declaration */
1054 __attribute__((noinline))
1055 static void shared_region_map_and_slide_cleanup(
1056 struct proc *p,
1057 uint32_t files_count,
1058 struct _sr_file_mappings *sr_file_mappings,
1059 struct vm_shared_region *shared_region);
1060
1061 /*
1062 * Setup part of _shared_region_map_and_slide().
1063 * It had to be broken out of _shared_region_map_and_slide() to
1064 * prevent compiler inlining from blowing out the stack.
1065 */
1066 __attribute__((noinline))
1067 static int
1068 shared_region_map_and_slide_setup(
1069 struct proc *p,
1070 uint32_t files_count,
1071 struct shared_file_np *files,
1072 uint32_t mappings_count,
1073 struct shared_file_mapping_slide_np *mappings,
1074 struct _sr_file_mappings **sr_file_mappings,
1075 struct vm_shared_region **shared_region_ptr,
1076 struct vnode *rdir_vp)
1077 {
1078 int error = 0;
1079 struct _sr_file_mappings *srfmp;
1080 uint32_t mappings_next;
1081 struct vnode_attr va;
1082 off_t fs;
1083 #if CONFIG_MACF
1084 vm_prot_t maxprot = VM_PROT_ALL;
1085 #endif
1086 uint32_t i;
1087 struct vm_shared_region *shared_region = NULL;
1088 boolean_t is_driverkit = task_is_driver(current_task());
1089
1090 SHARED_REGION_TRACE_DEBUG(
1091 ("shared_region: %p [%d(%s)] -> map\n",
1092 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1093 proc_getpid(p), p->p_comm));
1094
1095 if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
1096 error = E2BIG;
1097 goto done;
1098 }
1099 if (files_count == 0) {
1100 error = EINVAL;
1101 goto done;
1102 }
1103 *sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
1104 Z_WAITOK | Z_ZERO);
1105 if (*sr_file_mappings == NULL) {
1106 error = ENOMEM;
1107 goto done;
1108 }
1109 mappings_next = 0;
1110 for (i = 0; i < files_count; i++) {
1111 srfmp = &(*sr_file_mappings)[i];
1112 srfmp->fd = files[i].sf_fd;
1113 srfmp->mappings_count = files[i].sf_mappings_count;
1114 srfmp->mappings = &mappings[mappings_next];
1115 mappings_next += srfmp->mappings_count;
1116 if (mappings_next > mappings_count) {
1117 error = EINVAL;
1118 goto done;
1119 }
1120 srfmp->slide = files[i].sf_slide;
1121 }
1122
1123 /* get the process's shared region (setup in vm_map_exec()) */
1124 shared_region = vm_shared_region_trim_and_get(current_task());
1125 *shared_region_ptr = shared_region;
1126 if (shared_region == NULL) {
1127 SHARED_REGION_TRACE_ERROR(
1128 ("shared_region: %p [%d(%s)] map(): "
1129 "no shared region\n",
1130 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1131 proc_getpid(p), p->p_comm));
1132 error = EINVAL;
1133 goto done;
1134 }
1135
1136 /*
1137 	 * Check that the shared region matches the current root
1138 	 * directory of this process. If not, deny the mapping to
1139 	 * avoid tainting the shared region with something that
1140 	 * doesn't quite belong in it.
1141 */
1142 struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
1143 if (sr_vnode != NULL ? rdir_vp != sr_vnode : rdir_vp != rootvnode) {
1144 SHARED_REGION_TRACE_ERROR(
1145 ("shared_region: map(%p) root_dir mismatch\n",
1146 (void *)VM_KERNEL_ADDRPERM(current_thread())));
1147 error = EPERM;
1148 goto done;
1149 }
1150
1151
1152 for (srfmp = &(*sr_file_mappings)[0];
1153 srfmp < &(*sr_file_mappings)[files_count];
1154 srfmp++) {
1155 if (srfmp->mappings_count == 0) {
1156 /* no mappings here... */
1157 continue;
1158 }
1159
1160 /*
1161 * A file descriptor of -1 is used to indicate that the data
1162 * to be put in the shared region for this mapping comes directly
1163 		 * from the process's address space. Ensure we have proper alignment.
1164 */
1165 if (srfmp->fd == -1) {
1166 /* only allow one mapping per fd */
1167 if (srfmp->mappings_count > 1) {
1168 SHARED_REGION_TRACE_ERROR(
1169 ("shared_region: %p [%d(%s)] map data >1 mapping\n",
1170 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1171 proc_getpid(p), p->p_comm));
1172 error = EINVAL;
1173 goto done;
1174 }
1175
1176 /*
1177 * The destination address and size must be page aligned.
1178 */
1179 struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
1180 mach_vm_address_t dest_addr = mapping->sms_address;
1181 mach_vm_size_t map_size = mapping->sms_size;
1182 if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
1183 SHARED_REGION_TRACE_ERROR(
1184 ("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
1185 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1186 proc_getpid(p), p->p_comm, dest_addr));
1187 error = EINVAL;
1188 goto done;
1189 }
1190 if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
1191 SHARED_REGION_TRACE_ERROR(
1192 ("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
1193 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1194 proc_getpid(p), p->p_comm, map_size));
1195 error = EINVAL;
1196 goto done;
1197 }
1198 continue;
1199 }
1200
1201 /* get file structure from file descriptor */
1202 error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
1203 if (error) {
1204 SHARED_REGION_TRACE_ERROR(
1205 ("shared_region: %p [%d(%s)] map: "
1206 "fd=%d lookup failed (error=%d)\n",
1207 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1208 proc_getpid(p), p->p_comm, srfmp->fd, error));
1209 goto done;
1210 }
1211
1212 /* we need at least read permission on the file */
1213 if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
1214 SHARED_REGION_TRACE_ERROR(
1215 ("shared_region: %p [%d(%s)] map: "
1216 "fd=%d not readable\n",
1217 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1218 proc_getpid(p), p->p_comm, srfmp->fd));
1219 error = EPERM;
1220 goto done;
1221 }
1222
1223 /* get vnode from file structure */
1224 error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
1225 if (error) {
1226 SHARED_REGION_TRACE_ERROR(
1227 ("shared_region: %p [%d(%s)] map: "
1228 "fd=%d getwithref failed (error=%d)\n",
1229 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1230 proc_getpid(p), p->p_comm, srfmp->fd, error));
1231 goto done;
1232 }
1233 srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);
1234
1235 /* make sure the vnode is a regular file */
1236 if (srfmp->vp->v_type != VREG) {
1237 SHARED_REGION_TRACE_ERROR(
1238 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1239 "not a file (type=%d)\n",
1240 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1241 proc_getpid(p), p->p_comm,
1242 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1243 srfmp->vp->v_name, srfmp->vp->v_type));
1244 error = EINVAL;
1245 goto done;
1246 }
1247
1248 #if CONFIG_MACF
1249 /* pass in 0 for the offset argument because AMFI does not need the offset
1250 * of the shared cache */
1251 error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
1252 srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
1253 if (error) {
1254 goto done;
1255 }
1256 #endif /* MAC */
1257
1258 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1259 /*
1260 * Check if the shared cache is in the trust cache;
1261 * if so, we can skip the root ownership check.
1262 */
1263 #if DEVELOPMENT || DEBUG
1264 /*
1265 * Skip both root ownership and trust cache check if
1266 * enforcement is disabled.
1267 */
1268 if (!cs_system_enforcement()) {
1269 goto after_root_check;
1270 }
1271 #endif /* DEVELOPMENT || DEBUG */
1272 struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
1273 if (blob == NULL) {
1274 SHARED_REGION_TRACE_ERROR(
1275 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1276 "missing CS blob\n",
1277 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1278 proc_getpid(p), p->p_comm,
1279 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1280 srfmp->vp->v_name));
1281 goto root_check;
1282 }
1283 const uint8_t *cdhash = csblob_get_cdhash(blob);
1284 if (cdhash == NULL) {
1285 SHARED_REGION_TRACE_ERROR(
1286 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1287 "missing cdhash\n",
1288 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1289 proc_getpid(p), p->p_comm,
1290 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1291 srfmp->vp->v_name));
1292 goto root_check;
1293 }
1294
1295 bool in_trust_cache = false;
1296 TrustCacheQueryToken_t qt;
1297 if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
1298 TCType_t tc_type = kTCTypeInvalid;
1299 TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
1300 in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
1301 (tc_type == kTCTypeCryptex1BootOS ||
1302 tc_type == kTCTypeStatic ||
1303 tc_type == kTCTypeEngineering));
1304 }
1305 if (!in_trust_cache) {
1306 SHARED_REGION_TRACE_ERROR(
1307 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1308 "not in trust cache\n",
1309 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1310 proc_getpid(p), p->p_comm,
1311 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1312 srfmp->vp->v_name));
1313 goto root_check;
1314 }
1315 goto after_root_check;
1316 root_check:
1317 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1318
1319 /* The shared cache file must be owned by root */
1320 VATTR_INIT(&va);
1321 VATTR_WANTED(&va, va_uid);
1322 error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1323 if (error) {
1324 SHARED_REGION_TRACE_ERROR(
1325 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1326 "vnode_getattr(%p) failed (error=%d)\n",
1327 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1328 proc_getpid(p), p->p_comm,
1329 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1330 srfmp->vp->v_name,
1331 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1332 error));
1333 goto done;
1334 }
1335 if (va.va_uid != 0) {
1336 SHARED_REGION_TRACE_ERROR(
1337 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1338 "owned by uid=%d instead of 0\n",
1339 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1340 proc_getpid(p), p->p_comm,
1341 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1342 srfmp->vp->v_name, va.va_uid));
1343 error = EPERM;
1344 goto done;
1345 }
1346
1347 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1348 after_root_check:
1349 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1350
1351 #if CONFIG_CSR
1352 if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
1353 VATTR_INIT(&va);
1354 VATTR_WANTED(&va, va_flags);
1355 error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1356 if (error) {
1357 SHARED_REGION_TRACE_ERROR(
1358 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1359 "vnode_getattr(%p) failed (error=%d)\n",
1360 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1361 proc_getpid(p), p->p_comm,
1362 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1363 srfmp->vp->v_name,
1364 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1365 error));
1366 goto done;
1367 }
1368
1369 if (!(va.va_flags & SF_RESTRICTED)) {
1370 /*
1371 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
1372 * the shared cache file is NOT SIP-protected, so reject the
1373 * mapping request
1374 */
1375 SHARED_REGION_TRACE_ERROR(
1376 ("shared_region: %p [%d(%s)] map(%p:'%s'), "
1377 "vnode is not SIP-protected. \n",
1378 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1379 proc_getpid(p), p->p_comm,
1380 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1381 srfmp->vp->v_name));
1382 error = EPERM;
1383 goto done;
1384 }
1385 }
1386 #else /* CONFIG_CSR */
1387
1388 /*
1389 * Devices without SIP/ROSP need to make sure that the shared cache
1390 * is either on the root volume or in the preboot cryptex volume.
1391 */
1392 assert(rdir_vp != NULL);
1393 if (srfmp->vp->v_mount != rdir_vp->v_mount) {
1394 vnode_t preboot_vp = NULL;
1395 #if XNU_TARGET_OS_OSX
1396 #define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
1397 #else
1398 #define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
1399 #endif
1400 error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
1401 if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
1402 SHARED_REGION_TRACE_ERROR(
1403 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1404 "not on process' root volume nor preboot volume\n",
1405 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1406 proc_getpid(p), p->p_comm,
1407 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1408 srfmp->vp->v_name));
1409 error = EPERM;
1410 if (preboot_vp) {
1411 (void)vnode_put(preboot_vp);
1412 }
1413 goto done;
1414 } else if (preboot_vp) {
1415 (void)vnode_put(preboot_vp);
1416 }
1417 }
1418 #endif /* CONFIG_CSR */
1419
1420 if (scdir_enforce) {
1421 char **expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1422 struct vnode *scdir_vp = NULL;
1423 for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1424 *expected_scdir_path != NULL;
1425 expected_scdir_path++) {
1426 /* get vnode for expected_scdir_path */
1427 error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
1428 if (error) {
1429 SHARED_REGION_TRACE_ERROR(
1430 ("shared_region: %p [%d(%s)]: "
1431 "vnode_lookup(%s) failed (error=%d)\n",
1432 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1433 proc_getpid(p), p->p_comm,
1434 *expected_scdir_path, error));
1435 continue;
1436 }
1437
1438 /* check if parent is scdir_vp */
1439 assert(scdir_vp != NULL);
1440 if (vnode_parent(srfmp->vp) == scdir_vp) {
1441 (void)vnode_put(scdir_vp);
1442 scdir_vp = NULL;
1443 goto scdir_ok;
1444 }
1445 (void)vnode_put(scdir_vp);
1446 scdir_vp = NULL;
1447 }
1448 /* nothing matches */
1449 SHARED_REGION_TRACE_ERROR(
1450 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1451 "shared cache file not in expected directory\n",
1452 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1453 proc_getpid(p), p->p_comm,
1454 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1455 srfmp->vp->v_name));
1456 error = EPERM;
1457 goto done;
1458 }
1459 scdir_ok:
1460
1461 /* get vnode size */
1462 error = vnode_size(srfmp->vp, &fs, vfs_context_current());
1463 if (error) {
1464 SHARED_REGION_TRACE_ERROR(
1465 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1466 "vnode_size(%p) failed (error=%d)\n",
1467 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1468 proc_getpid(p), p->p_comm,
1469 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1470 srfmp->vp->v_name,
1471 (void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
1472 goto done;
1473 }
1474 srfmp->file_size = fs;
1475
1476 /* get the file's memory object handle */
1477 srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
1478 if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
1479 SHARED_REGION_TRACE_ERROR(
1480 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1481 "no memory object\n",
1482 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1483 proc_getpid(p), p->p_comm,
1484 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1485 srfmp->vp->v_name));
1486 error = EINVAL;
1487 goto done;
1488 }
1489
1490 /* check that the mappings are properly covered by code signatures */
1491 if (!cs_system_enforcement()) {
1492 /* code signing is not enforced: no need to check */
1493 } else {
1494 for (i = 0; i < srfmp->mappings_count; i++) {
1495 if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
1496 /* zero-filled mapping: not backed by the file */
1497 continue;
1498 }
1499 if (ubc_cs_is_range_codesigned(srfmp->vp,
1500 srfmp->mappings[i].sms_file_offset,
1501 srfmp->mappings[i].sms_size)) {
1502 /* this mapping is fully covered by code signatures */
1503 continue;
1504 }
1505 SHARED_REGION_TRACE_ERROR(
1506 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1507 "mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
1508 "is not code-signed\n",
1509 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1510 proc_getpid(p), p->p_comm,
1511 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1512 srfmp->vp->v_name,
1513 i, srfmp->mappings_count,
1514 srfmp->mappings[i].sms_address,
1515 srfmp->mappings[i].sms_size,
1516 srfmp->mappings[i].sms_file_offset,
1517 srfmp->mappings[i].sms_max_prot,
1518 srfmp->mappings[i].sms_init_prot));
1519 error = EINVAL;
1520 goto done;
1521 }
1522 }
1523 }
1524 done:
1525 if (error != 0) {
1526 shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
1527 *sr_file_mappings = NULL;
1528 *shared_region_ptr = NULL;
1529 }
1530 return error;
1531 }
1532
1533 /*
1534 * shared_region_map_np()
1535 *
1536 * This system call is intended for dyld.
1537 *
1538 * dyld uses this to map a shared cache file into a shared region.
1539 * This is usually done only the first time a shared cache is needed.
1540 * Subsequent processes will just use the populated shared region without
1541 * requiring any further setup.
1542 */
1543 static int
1544 _shared_region_map_and_slide(
1545 struct proc *p,
1546 uint32_t files_count,
1547 struct shared_file_np *files,
1548 uint32_t mappings_count,
1549 struct shared_file_mapping_slide_np *mappings)
1550 {
1551 int error = 0;
1552 kern_return_t kr = KERN_SUCCESS;
1553 struct _sr_file_mappings *sr_file_mappings = NULL;
1554 struct vnode *rdir_vp = NULL;
1555 struct vm_shared_region *shared_region = NULL;
1556
1557 /*
1558 * Get a reference to the current proc's root dir.
1559 * Need this to prevent racing with chroot.
1560 */
1561 proc_fdlock(p);
1562 rdir_vp = p->p_fd.fd_rdir;
1563 if (rdir_vp == NULL) {
1564 rdir_vp = rootvnode;
1565 }
1566 assert(rdir_vp != NULL);
1567 vnode_get(rdir_vp);
1568 proc_fdunlock(p);
1569
1570 /*
1571 * Turn files, mappings into sr_file_mappings and other setup.
1572 */
1573 error = shared_region_map_and_slide_setup(p, files_count,
1574 files, mappings_count, mappings,
1575 &sr_file_mappings, &shared_region, rdir_vp);
1576 if (error != 0) {
1577 vnode_put(rdir_vp);
1578 return error;
1579 }
1580
1581 /* map the file(s) into that shared region's submap */
1582 kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
1583 if (kr != KERN_SUCCESS) {
1584 SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
1585 "vm_shared_region_map_file() failed kr=0x%x\n",
1586 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1587 proc_getpid(p), p->p_comm, kr));
1588 }
1589
1590 /* convert kern_return_t to errno */
1591 switch (kr) {
1592 case KERN_SUCCESS:
1593 error = 0;
1594 break;
1595 case KERN_INVALID_ADDRESS:
1596 error = EFAULT;
1597 break;
1598 case KERN_PROTECTION_FAILURE:
1599 error = EPERM;
1600 break;
1601 case KERN_NO_SPACE:
1602 error = ENOMEM;
1603 break;
1604 case KERN_FAILURE:
1605 case KERN_INVALID_ARGUMENT:
1606 default:
1607 error = EINVAL;
1608 break;
1609 }
1610
1611 /*
1612 * Mark that this process is now using split libraries.
1613 */
1614 if (error == 0 && (p->p_flag & P_NOSHLIB)) {
1615 OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
1616 }
1617
1618 vnode_put(rdir_vp);
1619 shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);
1620
1621 SHARED_REGION_TRACE_DEBUG(
1622 ("shared_region: %p [%d(%s)] <- map\n",
1623 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1624 proc_getpid(p), p->p_comm));
1625
1626 return error;
1627 }
1628
1629 /*
1630  * Clean up part of _shared_region_map_and_slide().
1631 * It had to be broken out of _shared_region_map_and_slide() to
1632 * prevent compiler inlining from blowing out the stack.
1633 */
1634 __attribute__((noinline))
1635 static void
1636 shared_region_map_and_slide_cleanup(
1637 struct proc *p,
1638 uint32_t files_count,
1639 struct _sr_file_mappings *sr_file_mappings,
1640 struct vm_shared_region *shared_region)
1641 {
1642 struct _sr_file_mappings *srfmp;
1643 struct vnode_attr va;
1644
1645 if (sr_file_mappings != NULL) {
1646 for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1647 if (srfmp->vp != NULL) {
1648 vnode_lock_spin(srfmp->vp);
1649 srfmp->vp->v_flag |= VSHARED_DYLD;
1650 vnode_unlock(srfmp->vp);
1651
1652 /* update the vnode's access time */
1653 if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1654 VATTR_INIT(&va);
1655 nanotime(&va.va_access_time);
1656 VATTR_SET_ACTIVE(&va, va_access_time);
1657 vnode_setattr(srfmp->vp, &va, vfs_context_current());
1658 }
1659
1660 #if NAMEDSTREAMS
1661 /*
1662 * If the shared cache is compressed, it may
1663 			 * have a namedstream vnode instantiated
1664 			 * for it. That namedstream vnode will also
1665 * have to be marked with VSHARED_DYLD.
1666 */
1667 if (vnode_hasnamedstreams(srfmp->vp)) {
1668 vnode_t svp;
1669 if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1670 NS_OPEN, 0, vfs_context_kernel()) == 0) {
1671 vnode_lock_spin(svp);
1672 svp->v_flag |= VSHARED_DYLD;
1673 vnode_unlock(svp);
1674 vnode_put(svp);
1675 }
1676 }
1677 #endif /* NAMEDSTREAMS */
1678 /*
1679 * release the vnode...
1680 * ubc_map() still holds it for us in the non-error case
1681 */
1682 (void) vnode_put(srfmp->vp);
1683 srfmp->vp = NULL;
1684 }
1685 if (srfmp->fp != NULL) {
1686 /* release the file descriptor */
1687 fp_drop(p, srfmp->fd, srfmp->fp, 0);
1688 srfmp->fp = NULL;
1689 }
1690 }
1691 kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1692 }
1693
1694 if (shared_region != NULL) {
1695 vm_shared_region_deallocate(shared_region);
1696 }
1697 }
1698
1699 /*
1700 * For each file mapped, we may have mappings for:
1701 * TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1702 * so let's round up to 8 mappings per file.
1703 */
1704 #define SFM_MAX (_SR_FILE_MAPPINGS_MAX_FILES * 8) /* max mapping structs allowed to pass in */
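/*
 * Example: with _SR_FILE_MAPPINGS_MAX_FILES == 256 and 8 mappings per
 * file, SFM_MAX allows at most 2048 shared_file_mapping_slide_np
 * structures to be passed in a single call.
 */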
1705
1706 /*
1707 * This is the new interface for setting up shared region mappings.
1708 *
1709  * The slide used for shared regions set up using this interface is chosen differently
1710  * from the old interface. The slide value passed in each shared_file_np represents
1711 * a max value. The kernel will choose a random value based on that, then use it
1712 * for all shared regions.
1713 */
#if defined (__x86_64__)
#define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
#else
#define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
#endif
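
/*
 * Illustrative sketch (not part of the kernel build): how a slide value is
 * derived from a caller-supplied maximum, mirroring the computation in
 * shared_region_map_and_slide_2_np() below. The max_slide constant is a
 * hypothetical example input.
 *
 *	uint32_t max_slide = 0x4000000;   // example sf_slide from the caller
 *	uint32_t random_val;
 *	read_random(&random_val, sizeof random_val);
 *	// Reduce to [0, max_slide), then round down to a page multiple
 *	// (4K pages on x86_64, 16K pages elsewhere).
 *	uint32_t slide_amount = (random_val % max_slide) & SLIDE_AMOUNT_MASK;
 */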

static inline __result_use_check kern_return_t
shared_region_map_and_slide_2_np_sanitize(
	struct proc *p,
	user_addr_t mappings_userspace_addr,
	unsigned int count,
	shared_file_mapping_slide_np_t *mappings)
{
	kern_return_t kr;
	vm_map_t map = current_map();
	mach_vm_address_t addr, end;
	mach_vm_offset_t offset, offset_end;
	mach_vm_size_t size, offset_size;
	user_addr_t slide_start, slide_end, slide_size;
	vm_prot_t cur;
	vm_prot_t max;

	user_addr_t user_addr = mappings_userspace_addr;

	for (size_t i = 0; i < count; i++) {
		shared_file_mapping_slide_np_ut mapping_u;
		/*
		 * First we bring each mapping struct into our kernel stack to
		 * avoid TOCTOU.
		 */
		kr = shared_region_copyin(
			p,
			user_addr,
			1, // copy 1 element at a time
			sizeof(shared_file_mapping_slide_np_ut),
			&mapping_u);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}

		/*
		 * Then, we sanitize the data on the kernel stack.
		 */
		kr = vm_sanitize_addr_size(
			mapping_u.sms_address_u,
			mapping_u.sms_size_u,
			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
			map,
			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
			| VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
			&addr,
			&end,
			&size);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}

		kr = vm_sanitize_addr_size(
			mapping_u.sms_file_offset_u,
			mapping_u.sms_size_u,
			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
			PAGE_MASK,
			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
			&offset,
			&offset_end,
			&offset_size);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}
		if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
			return KERN_INVALID_ARGUMENT;
		}

		/*
		 * Unsafe access is immediately followed by wrap to
		 * convert from addr to size.
		 */
		mach_vm_size_ut sms_slide_size_u =
		    vm_sanitize_wrap_size(
			VM_SANITIZE_UNSAFE_UNWRAP(
				mapping_u.sms_slide_size_u));

		kr = vm_sanitize_addr_size(
			mapping_u.sms_slide_start_u,
			sms_slide_size_u,
			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
			map,
			(VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
			| VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
			&slide_start,
			&slide_end,
			&slide_size);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}

		kr = vm_sanitize_cur_and_max_prots(
			mapping_u.sms_init_prot_u,
			mapping_u.sms_max_prot_u,
			VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
			map,
			VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
			&cur,
			&max);
		if (__improbable(kr != KERN_SUCCESS)) {
			return kr;
		}

		/*
		 * Finally, we move the data from the kernel stack to our
		 * caller-allocated kernel heap buffer.
		 */
		mappings[i].sms_address = addr;
		mappings[i].sms_size = size;
		mappings[i].sms_file_offset = offset;
		mappings[i].sms_slide_size = slide_size;
		mappings[i].sms_slide_start = slide_start;
		mappings[i].sms_max_prot = max;
		mappings[i].sms_init_prot = cur;

		if (__improbable(os_add_overflow(
			    user_addr,
			    sizeof(shared_file_mapping_slide_np_ut),
			    &user_addr))) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	return KERN_SUCCESS;
}

int
shared_region_map_and_slide_2_np(
	struct proc *p,
	struct shared_region_map_and_slide_2_np_args *uap,
	__unused int *retvalp)
{
	unsigned int files_count;
	struct shared_file_np *shared_files = NULL;
	unsigned int mappings_count;
	struct shared_file_mapping_slide_np *mappings = NULL;
	kern_return_t kr = KERN_SUCCESS;

	files_count = uap->files_count;
	mappings_count = uap->mappings_count;

	if (files_count == 0) {
		SHARED_REGION_TRACE_INFO(
			("shared_region: %p [%d(%s)] map(): "
			"no files\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		kr = 0; /* no files to map: we're done ! */
		goto done;
	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
		if (shared_files == NULL) {
			kr = KERN_RESOURCE_SHORTAGE;
			goto done;
		}
	} else {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"too many files (%d) max %d\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm,
			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
		kr = KERN_FAILURE;
		goto done;
	}

	if (mappings_count == 0) {
		SHARED_REGION_TRACE_INFO(
			("shared_region: %p [%d(%s)] map(): "
			"no mappings\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm));
		kr = 0; /* no mappings: we're done ! */
		goto done;
	} else if (mappings_count <= SFM_MAX) {
		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
		if (mappings == NULL) {
			kr = KERN_RESOURCE_SHORTAGE;
			goto done;
		}
	} else {
		SHARED_REGION_TRACE_ERROR(
			("shared_region: %p [%d(%s)] map(): "
			"too many mappings (%d) max %d\n",
			(void *)VM_KERNEL_ADDRPERM(current_thread()),
			proc_getpid(p), p->p_comm,
			mappings_count, SFM_MAX));
		kr = KERN_FAILURE;
		goto done;
	}

	/*
	 * struct shared_file_np does not have any fields that are subject to
	 * sanitization, so it is copied in from userspace as is.
	 */
	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
	if (kr != KERN_SUCCESS) {
		goto done;
	}

	kr = shared_region_map_and_slide_2_np_sanitize(
		p,
		uap->mappings_u,
		mappings_count,
		mappings);
	if (__improbable(kr != KERN_SUCCESS)) {
		kr = vm_sanitize_get_kr(kr);
		goto done;
	}

	uint32_t max_slide = shared_files[0].sf_slide;
	uint32_t random_val;
	uint32_t slide_amount;

	if (max_slide != 0) {
		read_random(&random_val, sizeof random_val);
		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
	} else {
		slide_amount = 0;
	}
#if DEVELOPMENT || DEBUG
	extern bool bootarg_disable_aslr;
	if (bootarg_disable_aslr) {
		slide_amount = 0;
	}
#endif /* DEVELOPMENT || DEBUG */

	/*
	 * Fix up the mappings to reflect the desired slide.
	 */
	unsigned int f;
	unsigned int m = 0;
	unsigned int i;
	for (f = 0; f < files_count; ++f) {
		shared_files[f].sf_slide = slide_amount;
		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
			if (m >= mappings_count) {
				SHARED_REGION_TRACE_ERROR(
					("shared_region: %p [%d(%s)] map(): "
					"mapping count argument was too small\n",
					(void *)VM_KERNEL_ADDRPERM(current_thread()),
					proc_getpid(p), p->p_comm));
				kr = KERN_FAILURE;
				goto done;
			}
			if (__improbable(
				    os_add_overflow(
					    mappings[m].sms_address,
					    slide_amount,
					    &mappings[m].sms_address))) {
				kr = KERN_INVALID_ARGUMENT;
				goto done;
			}
			if (mappings[m].sms_slide_size != 0) {
				mach_vm_address_t discard;
				/* Slide and check that new start/size pairs do not overflow. */
				if (__improbable(
					    os_add_overflow(
						    mappings[m].sms_slide_start,
						    slide_amount,
						    &mappings[m].sms_slide_start) ||
				    os_add_overflow(
					    mappings[m].sms_slide_start,
					    mappings[m].sms_slide_size,
					    &discard))) {
					kr = KERN_INVALID_ARGUMENT;
					goto done;
				}
			}
		}
	}

	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
done:
	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
	return kr;
}

/*
 * A syscall for dyld to use to map data pages that need load time relocation fixups.
 * The fixups are performed by a custom pager during page-in, so the pages still appear
 * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
 * on demand later, all w/o using the compressor.
 *
 * Note these pages are treated as MAP_PRIVATE. So if the application dirties any pages
 * while running, they are COW'd as normal.
 */
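/*
 * Illustrative sketch (not part of the kernel build, and not a supported
 * API): roughly how a caller such as dyld might describe one region for
 * this syscall. The field names match the struct mwl_region usage below;
 * the fd, offset and size values are hypothetical.
 *
 *	struct mwl_region region = {
 *		.mwlr_fd = fd,                  // every region must use the same fd
 *		.mwlr_protections = VM_PROT_READ | VM_PROT_WRITE, // data only: no EXECUTE, no ZF
 *		.mwlr_file_offset = 0x4000,     // page-aligned file offset (example)
 *		.mwlr_size = 0x8000,            // size of the mapping (example)
 *	};
 */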
int
map_with_linking_np(
	struct proc *p,
	struct map_with_linking_np_args *uap,
	__unused int *retvalp)
{
	uint32_t region_count;
	uint32_t r;
	struct mwl_region *regions = NULL;
	struct mwl_region *rp;
	uint32_t link_info_size;
	void *link_info = NULL; /* starts with a struct mwl_info_hdr */
	struct mwl_info_hdr *info_hdr = NULL;
	uint64_t binds_size;
	int fd;
	struct fileproc *fp = NULL;
	struct vnode *vp = NULL;
	size_t file_size;
	off_t fs;
	struct vnode_attr va;
	memory_object_control_t file_control = NULL;
	int error;
	kern_return_t kr = KERN_SUCCESS;

	/*
	 * Check if dyld has told us it is finished with this call.
	 */
	if (p->p_disallow_map_with_linking) {
		printf("%s: [%d(%s)]: map_with_linking() was disabled\n",
		    __func__, proc_getpid(p), p->p_comm);
		kr = KERN_FAILURE;
		goto done;
	}

	/*
	 * First we do some sanity checking on what dyld has passed us.
	 */
	region_count = uap->region_count;
	link_info_size = uap->link_info_size;
	if (region_count == 0) {
		printf("%s: [%d(%s)]: region_count == 0\n",
		    __func__, proc_getpid(p), p->p_comm);
		kr = KERN_FAILURE;
		goto done;
	}
	if (region_count > MWL_MAX_REGION_COUNT) {
		printf("%s: [%d(%s)]: region_count too big %d\n",
		    __func__, proc_getpid(p), p->p_comm, region_count);
		kr = KERN_FAILURE;
		goto done;
	}

	if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
		printf("%s: [%d(%s)]: link_info_size too small\n",
		    __func__, proc_getpid(p), p->p_comm);
		kr = KERN_FAILURE;
		goto done;
	}
	if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
		printf("%s: [%d(%s)]: link_info_size too big %d\n",
		    __func__, proc_getpid(p), p->p_comm, link_info_size);
		kr = KERN_FAILURE;
		goto done;
	}

	/*
	 * Allocate and copyin the regions and link info
	 */
	regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
	if (regions == NULL) {
		printf("%s: [%d(%s)]: failed to allocate regions\n",
		    __func__, proc_getpid(p), p->p_comm);
		kr = KERN_RESOURCE_SHORTAGE;
		goto done;
	}
	kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
	if (kr != KERN_SUCCESS) {
		printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
		    __func__, proc_getpid(p), p->p_comm, kr);
		goto done;
	}

	link_info = kalloc_data(link_info_size, Z_WAITOK);
	if (link_info == NULL) {
		printf("%s: [%d(%s)]: failed to allocate link_info\n",
		    __func__, proc_getpid(p), p->p_comm);
		kr = KERN_RESOURCE_SHORTAGE;
		goto done;
	}
	kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
	if (kr != KERN_SUCCESS) {
		printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
		    __func__, proc_getpid(p), p->p_comm, kr);
		goto done;
	}

	/*
	 * Do some verification of the data structures.
	 */
	info_hdr = (struct mwl_info_hdr *)link_info;
	if (info_hdr->mwli_version != MWL_INFO_VERS) {
		printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
		kr = KERN_FAILURE;
		goto done;
	}

	if (info_hdr->mwli_binds_offset > link_info_size) {
		printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
		kr = KERN_FAILURE;
		goto done;
	}

	/* some older devices have s/w page size > h/w page size, no need to support them */
	if (info_hdr->mwli_page_size != PAGE_SIZE) {
		/* no printf, since this is expected on some devices */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	binds_size = (uint64_t)info_hdr->mwli_binds_count *
	    ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
	if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
		printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
		kr = KERN_FAILURE;
		goto done;
	}

	if (info_hdr->mwli_chains_offset > link_info_size) {
		printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
		kr = KERN_FAILURE;
		goto done;
	}

	/*
	 * Ensure the chained starts structure fits in the link info and make
	 * sure the segment info offsets are within bounds.
	 */
	if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
		printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
		kr = KERN_FAILURE;
		goto done;
	}
	if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
		printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
		    __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
		kr = KERN_FAILURE;
		goto done;
	}

	/* Note that more verification of offsets is done in the pager itself */

	/*
	 * Ensure we've only been given one FD and verify valid protections.
	 */
	fd = regions[0].mwlr_fd;
	for (r = 0; r < region_count; ++r) {
		if (regions[r].mwlr_fd != fd) {
			printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
			    __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
			kr = KERN_FAILURE;
			goto done;
		}

		/*
		 * Only allow data mappings and not zero fill. Permit TPRO
		 * mappings only when VM_PROT_READ | VM_PROT_WRITE.
		 */
		if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
			printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
			    __func__, proc_getpid(p), p->p_comm);
			kr = KERN_FAILURE;
			goto done;
		}
		if (regions[r].mwlr_protections & VM_PROT_ZF) {
			printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
			    __func__, proc_getpid(p), p->p_comm, r);
			kr = KERN_FAILURE;
			goto done;
		}
		if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
		    !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
			printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
			    __func__, proc_getpid(p), p->p_comm, r);
			kr = KERN_FAILURE;
			goto done;
		}
	}

	/* get file structure from file descriptor */
	error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
	if (error) {
		printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
		    __func__, proc_getpid(p), p->p_comm, error);
		kr = KERN_FAILURE;
		goto done;
	}

	/* We need at least read permission on the file */
	if (!(fp->fp_glob->fg_flag & FREAD)) {
		printf("%s: [%d(%s)]: not readable\n",
		    __func__, proc_getpid(p), p->p_comm);
		kr = KERN_FAILURE;
		goto done;
	}

	/* Get the vnode from the file structure */
	vp = (struct vnode *)fp_get_data(fp);
	error = vnode_getwithref(vp);
	if (error) {
		printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
		    __func__, proc_getpid(p), p->p_comm, error);
		kr = KERN_FAILURE;
		vp = NULL; /* just to be sure */
		goto done;
	}

	/* Make sure the vnode is a regular file */
	if (vp->v_type != VREG) {
		printf("%s: [%d(%s)]: vnode not VREG\n",
		    __func__, proc_getpid(p), p->p_comm);
		kr = KERN_FAILURE;
		goto done;
	}

	/* get vnode size */
	error = vnode_size(vp, &fs, vfs_context_current());
	if (error) {
		goto done;
	}
	file_size = fs;

	/* get the file's memory object handle */
	file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
	if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
		printf("%s: [%d(%s)]: no memory object\n",
		    __func__, proc_getpid(p), p->p_comm);
		kr = KERN_FAILURE;
		goto done;
	}

	for (r = 0; r < region_count; ++r) {
		rp = &regions[r];

#if CONFIG_MACF
		vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
		    fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
		if (error) {
			printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
			    __func__, proc_getpid(p), p->p_comm, r, error);
			kr = KERN_FAILURE;
			goto done;
		}
#endif /* MAC */

		/* check that the mappings are properly covered by code signatures */
		if (cs_system_enforcement()) {
			if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
				printf("%s: [%d(%s)]: region %d, not code signed\n",
				    __func__, proc_getpid(p), p->p_comm, r);
				kr = KERN_FAILURE;
				goto done;
			}
		}
	}

	/* update the vnode's access time */
	if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
		VATTR_INIT(&va);
		nanotime(&va.va_access_time);
		VATTR_SET_ACTIVE(&va, va_access_time);
		vnode_setattr(vp, &va, vfs_context_current());
	}

	/* get the VM to do the work */
	kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);

done:
	if (fp != NULL) {
		/* release the file descriptor */
		fp_drop(p, fd, fp, 0);
	}
	if (vp != NULL) {
		(void)vnode_put(vp);
	}
	if (regions != NULL) {
		kfree_data(regions, region_count * sizeof(regions[0]));
	}
	/* link_info is NULL here if the pager took ownership of it (the success case) */
	if (link_info != NULL) {
		kfree_data(link_info, link_info_size);
	}

	switch (kr) {
	case KERN_SUCCESS:
		return 0;
	case KERN_RESOURCE_SHORTAGE:
		return ENOMEM;
	default:
		return EINVAL;
	}
}

#if DEBUG || DEVELOPMENT
SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
#endif /* DEBUG || DEVELOPMENT */

/* sysctl overflow room */

SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
    (int *) &page_size, 0, "vm page size");

/*
 * vm_page_free_target is provided as a makeshift solution for applications that want to
 * allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
 * reclaimed. It allows the app to calculate how much memory is free outside the free target.
 */
extern unsigned int vm_page_free_target;
SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_free_target, 0, "Pageout daemon free target");
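
/*
 * Illustrative userspace sketch (not part of the kernel build): combining
 * vm.vm_page_free_target with vm.page_free_count (exported later in this
 * file) to estimate how many pages are free beyond the pageout daemon's
 * target. This only uses plain sysctlbyname(3).
 *
 *	unsigned int free_count = 0, free_target = 0;
 *	size_t len = sizeof(free_count);
 *	sysctlbyname("vm.page_free_count", &free_count, &len, NULL, 0);
 *	len = sizeof(free_target);
 *	sysctlbyname("vm.vm_page_free_target", &free_target, &len, NULL, 0);
 *	// pages free beyond the target, i.e. allocatable without triggering reclaim
 *	unsigned int surplus = (free_count > free_target) ? free_count - free_target : 0;
 */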

SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");

static int
vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int page_free_wanted;

	page_free_wanted = mach_vm_ctl_page_free_wanted();
	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
}
SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, vm_ctl_page_free_wanted, "I", "");

extern unsigned int vm_page_purgeable_count;
SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_purgeable_count, 0, "Purgeable page count");

extern unsigned int vm_page_purgeable_wired_count;
SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");

extern unsigned int vm_page_kern_lpage_count;
SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_kern_lpage_count, 0, "kernel used large pages");

SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");

#if DEVELOPMENT || DEBUG
#if __ARM_MIXED_PAGE_SIZE__
static int vm_mixed_pagesize_supported = 1;
#else
static int vm_mixed_pagesize_supported = 0;
#endif /* __ARM_MIXED_PAGE_SIZE__ */
SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");

SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");

SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");

SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated"); /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated) */
SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, ""); /* sum of next two */
#endif /* DEVELOPMENT || DEBUG */

extern int madvise_free_debug;
SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
extern int madvise_free_debug_sometimes;
SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
    &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
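
/*
 * Illustrative userspace sketch (not part of the kernel build): what the
 * madvise_free_debug knob affects. As the description strings above
 * suggest, with vm.madvise_free_debug=1 pages handed to MADV_FREE are
 * zero-filled eagerly, making use-after-MADV_FREE bugs deterministic
 * instead of timing-dependent.
 *
 *	#include <sys/mman.h>
 *	// buf previously held data the process no longer needs
 *	if (madvise(buf, len, MADV_FREE) == 0) {
 *		// With the debug sysctl set, reading buf now returns zeroes
 *		// rather than stale contents that merely *may* be reclaimed.
 *	}
 */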

SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_shared, "");
SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.all_reusable_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.partial_reusable_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reuse_pages_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reuse_pages_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.all_reuse_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.partial_reuse_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.can_reuse_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.can_reuse_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_reclaimed, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_nonwritable, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_shared, "");
SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.free_shared, "");

extern unsigned int vm_page_free_count, vm_page_speculative_count;
SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");

extern unsigned int vm_page_cleaned_count;
SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");

extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");

/* pageout counts */
SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");

SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");

SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
extern unsigned int vm_page_realtime_count;
SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
extern int vm_pageout_protect_realtime;
SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");

/* counts of pages prefaulted when entering a memory object */
extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");

#if defined (__x86_64__)
extern unsigned int vm_clump_promote_threshold;
SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
#if DEVELOPMENT || DEBUG
extern unsigned long vm_clump_stats[];
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
#endif /* DEVELOPMENT || DEBUG */
#endif /* defined (__x86_64__) */

#if CONFIG_SECLUDED_MEMORY

SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
extern unsigned int vm_page_secluded_target;
extern unsigned int vm_page_secluded_count;
extern unsigned int vm_page_secluded_count_free;
extern unsigned int vm_page_secluded_count_inuse;
extern unsigned int vm_page_secluded_count_over_target;
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");

extern struct vm_page_secluded_data vm_page_secluded;
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");

#endif /* CONFIG_SECLUDED_MEMORY */

#if CONFIG_DEFERRED_RECLAIM
#pragma mark Deferred Reclaim
SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
#if DEVELOPMENT || DEBUG
/*
 * VM reclaim testing
 */
extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);

static int
sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
{
	int error = EINVAL, pid = 0;
	/*
	 * Only act on write
	 */
	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr) {
		return error;
	}
	if (pid <= 0) {
		return EINVAL;
	}
	proc_t p = proc_find(pid);
	if (p == PROC_NULL) {
		return ESRCH;
	}
	task_t t = proc_task(p);
	if (t == TASK_NULL) {
		proc_rele(p);
		return ESRCH;
	}
	task_reference(t);
	proc_rele(p);

	bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
	if (success) {
		error = 0;
	}
	task_deallocate(t);

	return error;
}

SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
    &sysctl_vm_reclaim_wait_for_pid, "I",
    "Block until the given pid has been drained by kernel GC");

static int
sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
{
	int error = EINVAL;
	kern_return_t kr;
	pid_t pid;
	error = sysctl_handle_int(oidp, &pid, 0, req);
	/* Only reclaim on write */
	if (error || !req->newptr) {
		return error;
	}
	if (pid <= 0) {
		return EINVAL;
	}
	proc_t p = proc_find(pid);
	if (p == PROC_NULL) {
		return ESRCH;
	}
	task_t t = proc_task(p);
	if (t == TASK_NULL) {
		proc_rele(p);
		return ESRCH;
	}
	task_reference(t);
	proc_rele(p);
	kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
	task_deallocate(t);
	return mach_to_bsd_errno(kr);
}

SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
    &sysctl_vm_reclaim_drain_pid, "I",
    "Drain the deferred reclamation buffer for a pid");

static int
proc_filter_reclaimable(proc_t p, __unused void *arg)
{
	task_t task = proc_task(p);
	return vm_deferred_reclamation_task_has_ring(task);
}

static int
proc_reclaim_drain(proc_t p, __unused void *arg)
{
	kern_return_t kr;
	task_t task = proc_task(p);
	kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
	return mach_to_bsd_errno(kr);
}

static int
sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
{
	int error;
	int val;
	if (!req->newptr) {
		return EINVAL;
	}
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || val == FALSE) {
		return error;
	}
	proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
	    proc_filter_reclaimable, NULL);
	return 0;
}

SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
    &sysctl_vm_reclaim_drain_all, "I",
    "Fully reclaim from every deferred reclamation buffer on the system");

extern uint32_t vm_reclaim_buffer_count;
extern uint64_t vm_reclaim_gc_epoch;
extern uint64_t vm_reclaim_gc_reclaim_count;
#if XNU_TARGET_OS_IOS
extern uint64_t vm_reclaim_max_threshold;
#else /* !XNU_TARGET_OS_IOS */
extern bool vm_reclaim_debug;
extern bool vm_reclaim_enabled;
extern uint64_t vm_reclaim_sampling_period_ns;
extern uint64_t vm_reclaim_sampling_period_abs;
extern uint32_t vm_reclaim_autotrim_pct_normal;
extern uint32_t vm_reclaim_autotrim_pct_pressure;
extern uint32_t vm_reclaim_autotrim_pct_critical;
extern uint32_t vm_reclaim_wma_weight_base;
extern uint32_t vm_reclaim_wma_weight_cur;
extern uint32_t vm_reclaim_wma_denom;
extern uint64_t vm_reclaim_abandonment_threshold;
#endif /* XNU_TARGET_OS_IOS */

SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
    "The number of deferred memory buffers currently alive");
SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
    "Number of times the global GC thread has run");
SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
    "Number of times the global GC thread has reclaimed from a buffer");
#if XNU_TARGET_OS_IOS
SYSCTL_QUAD(_vm_reclaim, OID_AUTO, max_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_max_threshold,
    "Maximum amount of virtual memory (in B) that may be deferred without "
    "synchronous reclamation");
#else /* !XNU_TARGET_OS_IOS */
SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
    "Whether deferred memory reclamation is enabled on this system");
SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
    "Whether vm.reclaim debug logs are enabled");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
    "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
    "to engage auto-trim when the system is operating normally");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
    "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
    "to engage auto-trim when the system is under memory pressure");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
    "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
    "to engage auto-trim when the system is under critical memory pressure");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
    "Weight applied to historical minimum buffer size samples");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
    "Weight applied to current sampled minimum buffer size");
SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
    "Denominator for weighted moving average calculation");
SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
    "The number of sampling periods between accounting updates that may elapse "
    "before the buffer is considered \"abandoned\"");

static int
sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
{
	uint64_t new_val_ns;
	uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
	int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
	    sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
	if (err || !req->newptr) {
		return err;
	}
	if (new_val_ns != old_val_ns) {
		vm_reclaim_sampling_period_ns = new_val_ns;
		nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
	}
	return 0;
}

SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
    CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "I",
    "Interval (nanoseconds) at which to sample the minimum buffer size and "
    "consider trimming excess");
#endif /* XNU_TARGET_OS_IOS */
#endif /* DEVELOPMENT || DEBUG */
#endif /* CONFIG_DEFERRED_RECLAIM */

#include <kern/thread.h>
#include <sys/user.h>

void vm_pageout_io_throttle(void);

void
vm_pageout_io_throttle(void)
{
	struct uthread *uthread = current_uthread();

	/*
	 * thread is marked as a low priority I/O type
	 * and the I/O we issued while in this cleaning operation
	 * collided with normal I/O operations... we'll
	 * delay in order to mitigate the impact of this
	 * task on the normal operation of the system
	 */

	if (uthread->uu_lowpri_window) {
		throttle_lowpri_io(1);
	}
}

int
vm_pressure_monitor(
	__unused struct proc *p,
	struct vm_pressure_monitor_args *uap,
	int *retval)
{
	kern_return_t kr;
	uint32_t pages_reclaimed;
	uint32_t pages_wanted;

	kr = mach_vm_pressure_monitor(
		(boolean_t) uap->wait_for_pressure,
		uap->nsecs_monitored,
		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
		&pages_wanted);

	switch (kr) {
	case KERN_SUCCESS:
		break;
	case KERN_ABORTED:
		return EINTR;
	default:
		return EINVAL;
	}

	if (uap->pages_reclaimed) {
		if (copyout((void *)&pages_reclaimed,
		    uap->pages_reclaimed,
		    sizeof(pages_reclaimed)) != 0) {
			return EFAULT;
		}
	}

	*retval = (int) pages_wanted;
	return 0;
}

int
kas_info(struct proc *p,
    struct kas_info_args *uap,
    int *retval __unused)
{
#ifndef CONFIG_KAS_INFO
	(void)p;
	(void)uap;
	return ENOTSUP;
#else /* CONFIG_KAS_INFO */
	int selector = uap->selector;
	user_addr_t valuep = uap->value;
	user_addr_t sizep = uap->size;
	user_size_t size, rsize;
	int error;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

#if CONFIG_MACF
	error = mac_system_check_kas_info(kauth_cred_get(), selector);
	if (error) {
		return error;
	}
#endif

	if (IS_64BIT_PROCESS(p)) {
		user64_size_t size64;
		error = copyin(sizep, &size64, sizeof(size64));
		size = (user_size_t)size64;
	} else {
		user32_size_t size32;
		error = copyin(sizep, &size32, sizeof(size32));
		size = (user_size_t)size32;
	}
	if (error) {
		return error;
	}

	switch (selector) {
	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
	{
		uint64_t slide = vm_kernel_slide;

		if (sizeof(slide) != size) {
			return EINVAL;
		}

		error = copyout(&slide, valuep, sizeof(slide));
		if (error) {
			return error;
		}
		rsize = size;
	}
	break;
	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
	{
		uint32_t i;
		kernel_mach_header_t *mh = &_mh_execute_header;
		struct load_command *cmd;
		cmd = (struct load_command*) &mh[1];
		uint64_t *bases;
		rsize = mh->ncmds * sizeof(uint64_t);

		/*
		 * Return the size if no data was passed
		 */
		if (valuep == 0) {
			break;
		}

		if (rsize > size) {
			return EINVAL;
		}

		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);

		for (i = 0; i < mh->ncmds; i++) {
			if (cmd->cmd == LC_SEGMENT_KERNEL) {
				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
				bases[i] = (uint64_t)sg->vmaddr;
			}
			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
		}

		error = copyout(bases, valuep, rsize);

		kfree_data(bases, rsize);

		if (error) {
			return error;
		}
	}
	break;
	case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
	case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
	{
#if CONFIG_SPTM
		const uint64_t slide =
		    (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
#else
		const uint64_t slide = 0;
#endif

		if (sizeof(slide) != size) {
			return EINVAL;
		}

		error = copyout(&slide, valuep, sizeof(slide));
		if (error) {
			return error;
		}
		rsize = size;
	}
	break;
	default:
		return EINVAL;
	}

	if (IS_64BIT_PROCESS(p)) {
		user64_size_t size64 = (user64_size_t)rsize;
		error = copyout(&size64, sizep, sizeof(size64));
	} else {
		user32_size_t size32 = (user32_size_t)rsize;
		error = copyout(&size32, sizep, sizeof(size32));
	}

	return error;
#endif /* CONFIG_KAS_INFO */
}
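
/*
 * Illustrative userspace sketch (not part of the kernel build): querying
 * the kernel text slide through the kas_info() syscall declared in
 * <sys/kas_info.h>. Root-only, as enforced above; the selector is the one
 * handled in the first switch case.
 *
 *	#include <sys/kas_info.h>
 *	uint64_t slide = 0;
 *	size_t size = sizeof(slide);
 *	if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) == 0) {
 *		// slide now holds vm_kernel_slide (non-root callers get EPERM)
 *	}
 */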

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
#pragma clang diagnostic ignored "-Wunused-function"

static void
asserts()
{
	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
}

SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
#pragma clang diagnostic pop

extern uint32_t vm_page_pages;
SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");

extern uint32_t vm_page_busy_absent_skipped;
SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");

extern uint32_t vm_page_upl_tainted;
SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");

extern uint32_t vm_page_iopl_tainted;
SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");

#if __arm64__ && (DEVELOPMENT || DEBUG)
extern int vm_footprint_suspend_allowed;
SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");

extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
static int
sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;
	int new_value;

	if (req->newptr == USER_ADDR_NULL) {
		return 0;
	}
	error = SYSCTL_IN(req, &new_value, sizeof(int));
	if (error) {
		return error;
	}
	if (!vm_footprint_suspend_allowed) {
		if (new_value != 0) {
			/* suspends are not allowed... */
			return 0;
		}
		/* ... but let resumes proceed */
	}
	DTRACE_VM2(footprint_suspend,
	    vm_map_t, current_map(),
	    int, new_value);

	pmap_footprint_suspend(current_map(), new_value);

	return 0;
}
SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_vm_footprint_suspend, "I", "");
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
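
/*
 * Illustrative usage (arm64 DEVELOPMENT || DEBUG kernels only): the OID
 * above is write-only, and suspends are gated by
 * vm.footprint_suspend_allowed, e.g.
 *
 *	sysctl -w vm.footprint_suspend_allowed=1
 *	sysctl -w vm.footprint_suspend=1   # suspend footprint accounting
 *	sysctl -w vm.footprint_suspend=0   # resume (always permitted)
 */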
3005
3006 extern uint64_t vm_map_corpse_footprint_count;
3007 extern uint64_t vm_map_corpse_footprint_size_avg;
3008 extern uint64_t vm_map_corpse_footprint_size_max;
3009 extern uint64_t vm_map_corpse_footprint_full;
3010 extern uint64_t vm_map_corpse_footprint_no_buf;
3011 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
3012 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
3013 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
3014 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
3015 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
3016 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
3017 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
3018 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
3019 SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
3020 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
3021
3022 #if CODE_SIGNING_MONITOR
3023 extern uint64_t vm_cs_defer_to_csm;
3024 extern uint64_t vm_cs_defer_to_csm_not;
3025 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
3026 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
3027 SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
3028 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
3029 #endif /* CODE_SIGNING_MONITOR */
3030
3031 extern uint64_t shared_region_pager_copied;
3032 extern uint64_t shared_region_pager_slid;
3033 extern uint64_t shared_region_pager_slid_error;
3034 extern uint64_t shared_region_pager_reclaimed;
3035 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
3036 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
3037 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
3038 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
3039 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
3040 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
3041 SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
3042 CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
3043 extern int shared_region_destroy_delay;
3044 SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
3045 CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
3046
3047 #if MACH_ASSERT
3048 extern int pmap_ledgers_panic_leeway;
3049 SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
3050 #endif /* MACH_ASSERT */
3051
3052
3053 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
3054 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
3055 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
3056 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
3057 extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
3058 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
3059 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
3060 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
3061 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
3062 extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
3063 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
3064 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
3065 extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
3066 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
3067 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
3068 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
3069 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
3070 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
3071 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
3072 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
3073 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
3074 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
3075 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
3076 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
3077 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
3078 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
3079 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
3080 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
3081 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
3082 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
3083 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
3084 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
3085 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
3086 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
3087 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
3088 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
3089 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
3090 SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
3091 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");
3092
3093 extern int vm_protect_privileged_from_untrusted;
3094 SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
3095 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
3096 extern uint64_t vm_copied_on_read;
3097 SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
3098 CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");

extern int vm_shared_region_count;
extern int vm_shared_region_peak;
SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
#if DEVELOPMENT || DEBUG
extern unsigned int shared_region_pagers_resident_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
extern unsigned int shared_region_pagers_resident_peak;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
extern int shared_region_pager_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
#if __has_feature(ptrauth_calls)
extern int shared_region_key_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
extern int vm_shared_region_reslide_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
#endif /* __has_feature(ptrauth_calls) */
#endif /* DEVELOPMENT || DEBUG */

#if MACH_ASSERT
extern int debug4k_filter;
SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
extern int debug4k_panic_on_terminate;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
extern int debug4k_panic_on_exception;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
extern int debug4k_panic_on_misaligned_sharing;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
#endif /* MACH_ASSERT */

extern uint64_t vm_map_set_size_limit_count;
extern uint64_t vm_map_set_data_limit_count;
extern uint64_t vm_map_enter_RLIMIT_AS_count;
extern uint64_t vm_map_enter_RLIMIT_DATA_count;
SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");

extern uint64_t vm_fault_resilient_media_initiate;
extern uint64_t vm_fault_resilient_media_retry;
extern uint64_t vm_fault_resilient_media_proceed;
extern uint64_t vm_fault_resilient_media_release;
extern uint64_t vm_fault_resilient_media_abort1;
extern uint64_t vm_fault_resilient_media_abort2;
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
#if MACH_ASSERT
extern int vm_fault_resilient_media_inject_error1_rate;
extern int vm_fault_resilient_media_inject_error1;
extern int vm_fault_resilient_media_inject_error2_rate;
extern int vm_fault_resilient_media_inject_error2;
extern int vm_fault_resilient_media_inject_error3_rate;
extern int vm_fault_resilient_media_inject_error3;
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
#endif /* MACH_ASSERT */

extern uint64_t pmap_query_page_info_retries;
SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");

/*
 * A sysctl which causes all existing shared regions to become stale. They
 * will no longer be used by anything new and will be torn down as soon as
 * the last existing user exits. A write of non-zero value causes that to happen.
 * This should only be used by launchd, so we check that this is initproc.
 */
static int
shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
    unsigned int value = 0;
    int changed = 0;
    int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
    if (error || !changed) {
        return error;
    }
    if (current_proc() != initproc) {
        return EPERM;
    }

    vm_shared_region_pivot();

    return 0;
}

SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, 0, shared_region_pivot, "I", "");
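/*
 * Usage sketch (user space, illustrative only): a write of any non-zero
 * value triggers the pivot; any caller other than initproc gets EPERM.
 *
 *   #include <sys/sysctl.h>
 *
 *   unsigned int one = 1;
 *   int rc = sysctlbyname("vm.shared_region_pivot", NULL, NULL, &one, sizeof(one));
 *   // rc == -1 with errno == EPERM unless the caller is initproc (launchd)
 */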

extern uint64_t vm_object_shadow_forced;
extern uint64_t vm_object_shadow_skipped;
SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_object_shadow_forced, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_object_shadow_skipped, "");


SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vmtc_total, 0, "total text page corruptions detected");


#if DEBUG || DEVELOPMENT
/*
 * A sysctl that can be used to corrupt a text page with an illegal instruction.
 * Used for testing text page self healing.
 */
extern kern_return_t vm_corrupt_text_addr(uintptr_t);
static int
corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
    uint64_t value = 0;
    int error = sysctl_handle_quad(oidp, &value, 0, req);
    if (error || !req->newptr) {
        return error;
    }

    if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
        return 0;
    } else {
        return EINVAL;
    }
}

SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
    CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, corrupt_text_addr, "-", "");
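/*
 * Usage sketch (user space, DEBUG/DEVELOPMENT kernels only, illustrative):
 * write the address of an instruction in the caller's own text segment,
 * then execute it to exercise the self-healing path. The target function
 * below is hypothetical.
 *
 *   #include <sys/sysctl.h>
 *   #include <stdint.h>
 *
 *   extern void victim_function(void);           // any text-page address
 *   uint64_t addr = (uint64_t)(uintptr_t)&victim_function;
 *   sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &addr, sizeof(addr));
 */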
#endif /* DEBUG || DEVELOPMENT */

#if CONFIG_MAP_RANGES
/*
 * vm.malloc_ranges
 *
 * space-separated list of <left:right> hexadecimal addresses.
 */
static int
vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
{
    vm_map_t map = current_map();
    struct mach_vm_range r1, r2;
    char str[20 * 4];
    int len;
    mach_vm_offset_t right_hole_max;

    if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
        return ENOENT;
    }
    if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
        return ENOENT;
    }

#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
    right_hole_max = MACH_VM_JUMBO_ADDRESS;
#else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
    right_hole_max = get_map_max(map);
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */

    len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
        r1.max_address, r2.min_address,
        r2.max_address, right_hole_max);

    return SYSCTL_OUT(req, str, len);
}

SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &vm_map_malloc_ranges, "A", "");
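/*
 * Usage sketch (user space, illustrative): the handler emits two
 * "left:right" hex pairs describing the holes bordering the default and
 * heap ranges. A hedged parse, assuming the format printed above:
 *
 *   #include <sys/sysctl.h>
 *   #include <stdio.h>
 *
 *   char str[80];
 *   size_t len = sizeof(str);
 *   unsigned long long l1, r1, l2, r2;
 *   if (sysctlbyname("vm.malloc_ranges", str, &len, NULL, 0) == 0 &&
 *       sscanf(str, "0x%llx:0x%llx 0x%llx:0x%llx", &l1, &r1, &l2, &r2) == 4) {
 *           printf("holes: [0x%llx, 0x%llx) [0x%llx, 0x%llx)\n", l1, r1, l2, r2);
 *   }
 */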

#if DEBUG || DEVELOPMENT
static int
vm_map_user_range_default SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
    struct mach_vm_range range;

    if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
        != KERN_SUCCESS) {
        return EINVAL;
    }

    return SYSCTL_OUT(req, &range, sizeof(range));
}

static int
vm_map_user_range_heap SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
    struct mach_vm_range range;

    if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
        != KERN_SUCCESS) {
        return EINVAL;
    }

    return SYSCTL_OUT(req, &range, sizeof(range));
}

static int
vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
    struct mach_vm_range range;

    if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
        != KERN_SUCCESS) {
        return EINVAL;
    }

    return SYSCTL_OUT(req, &range, sizeof(range));
}

/*
 * Sysctls that can be used to return ranges for the current VM map.
 * Used for testing VM ranges.
 */
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
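/*
 * Usage sketch (user space, DEBUG/DEVELOPMENT, illustrative): each of these
 * returns a struct mach_vm_range for the caller's own map. The local mirror
 * struct below is an assumption, sized to match two 64-bit addresses.
 *
 *   #include <sys/sysctl.h>
 *   #include <stdint.h>
 *   #include <stdio.h>
 *
 *   struct my_vm_range { uint64_t min_address; uint64_t max_address; };
 *   struct my_vm_range r;
 *   size_t len = sizeof(r);
 *   if (sysctlbyname("vm.vm_map_user_range_heap", &r, &len, NULL, 0) == 0) {
 *           printf("heap range: 0x%llx-0x%llx\n",
 *               (unsigned long long)r.min_address,
 *               (unsigned long long)r.max_address);
 *   }
 */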

#endif /* DEBUG || DEVELOPMENT */
#endif /* CONFIG_MAP_RANGES */

#if DEBUG || DEVELOPMENT
#endif /* DEBUG || DEVELOPMENT */

extern uint64_t vm_map_range_overflows_count;
SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
extern boolean_t vm_map_range_overflows_log;
SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");

extern uint64_t c_seg_filled_no_contention;
extern uint64_t c_seg_filled_contention;
extern clock_sec_t c_seg_filled_contention_sec_max;
extern clock_nsec_t c_seg_filled_contention_nsec_max;
SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
#if (XNU_TARGET_OS_OSX && __arm64__)
extern clock_nsec_t c_process_major_report_over_ms; /* report if over this many ms */
extern int c_process_major_yield_after; /* yield after moving this many segments */
extern uint64_t c_process_major_reports;
extern clock_sec_t c_process_major_max_sec;
extern clock_nsec_t c_process_major_max_nsec;
extern uint32_t c_process_major_peak_segcount;
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */

#if DEVELOPMENT || DEBUG
extern int panic_object_not_alive;
SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
#endif /* DEVELOPMENT || DEBUG */

#if FBDP_DEBUG_OBJECT_NO_PAGER
extern int fbdp_no_panic;
SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
#endif /* FBDP_DEBUG_OBJECT_NO_PAGER */

extern uint64_t cluster_direct_write_wired;
SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");


#if DEVELOPMENT || DEBUG

static uint32_t
sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
{
#pragma unused(with_data)
    return VM_C_SEGMENT_INFO_MAGIC;
}

/* The largest possible single segment + its slots is
 * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
#define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)

extern uint32_t c_segments_available;

struct sysctl_buf_header {
    uint32_t magic;
} __attribute__((packed));

/* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
 * Instead of doing everything here, it calls into vm_compressor.c to serialize each segment. */
static int
sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
{
    char *buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
    if (!buf) {
        return ENOMEM;
    }
    size_t offset = 0;
    int error = 0;
    uint32_t segno = 0;
    /* 4 byte header to identify the version of the formatting of the data.
     * This should be incremented if c_segment_info or c_slot_info are changed. */
    ((struct sysctl_buf_header *)buf)->magic = sysctl_compressor_seg_magic(with_data);
    offset += sizeof(uint32_t);

    while (segno < c_segments_available) {
        size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
        kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
        if (kr == KERN_NO_SPACE) {
            /* failed to add another segment; push the current buffer out and try again */
            if (offset == 0) {
                error = EINVAL; /* no space even in an empty buffer; shouldn't really happen */
                goto out;
            }
            /* write out chunk */
            error = SYSCTL_OUT(req, buf, offset);
            if (error) {
                goto out;
            }
            offset = 0;
            bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
            /* don't increment segno; we need to retry the current segment */
        } else if (kr != KERN_SUCCESS) {
            error = EINVAL;
            goto out;
        } else {
            offset += left_sz;
            ++segno;
            assert(offset <= SYSCTL_SEG_BUF_SIZE);
        }
    }

    if (offset > 0) { /* write last chunk */
        error = SYSCTL_OUT(req, buf, offset);
    }

out:
    kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
    return error;
}

static int
sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
    return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
}
SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
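/*
 * Usage sketch (user space, DEVELOPMENT/DEBUG, illustrative): the stream has
 * no fixed size, so a consumer passes a generously sized buffer and checks
 * the leading magic. The buffer size below is an arbitrary assumption.
 *
 *   #include <sys/sysctl.h>
 *   #include <stdint.h>
 *   #include <stdlib.h>
 *
 *   size_t len = 32 * 1024 * 1024;               // guess; grow on ENOMEM
 *   char *buf = malloc(len);
 *   if (buf && sysctlbyname("vm.compressor_segments", buf, &len, NULL, 0) == 0) {
 *           uint32_t magic;
 *           memcpy(&magic, buf, sizeof(magic));  // expect VM_C_SEGMENT_INFO_MAGIC
 *           // parse the per-segment records that follow; len bytes are valid
 *   }
 *   free(buf);
 */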


extern uint32_t vm_compressor_fragmentation_level(void);

static int
sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
    uint32_t value = vm_compressor_fragmentation_level();
    return SYSCTL_OUT(req, &value, sizeof(value));
}

SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");

extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);

static int
sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
    uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
    return SYSCTL_OUT(req, &value, sizeof(value));
}

SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");
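/*
 * Usage sketch (user space, DEVELOPMENT/DEBUG, illustrative): both
 * fragmentation sysctls return a single 32-bit value.
 *
 *   #include <sys/sysctl.h>
 *   #include <stdint.h>
 *   #include <stdio.h>
 *
 *   uint32_t level = 0, wasted = 0;
 *   size_t len = sizeof(level);
 *   sysctlbyname("vm.compressor_fragmentation_level", &level, &len, NULL, 0);
 *   len = sizeof(wasted);
 *   sysctlbyname("vm.compressor_incore_fragmentation_wasted_pages", &wasted, &len, NULL, 0);
 *   printf("fragmentation level %u, wasted pages %u\n", level, wasted);
 */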


#define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)


/* This sysctl iterates over all the entries of the vm_map of a given process and writes some info
 * about the vm_object pointed to by each entry.
 * It can be used to map out where a process's pages live in the compressor.
 */
static int
sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
{
    int error = 0;
    char *buf = NULL;
    proc_t p = PROC_NULL;
    task_t task = TASK_NULL;
    vm_map_t map = VM_MAP_NULL;
    __block size_t offset = 0;

    /* go from pid to proc to task to vm_map; see sysctl_procargsx() for another example of this progression */
    int *name = arg1;
    int namelen = arg2;
    if (namelen < 1) {
        return EINVAL;
    }
    int pid = name[0];
    p = proc_find(pid); /* this takes a reference on the proc */
    if (p == PROC_NULL) {
        return EINVAL;
    }
    task = proc_task(p);
    proc_rele(p); /* drop the proc reference */
    p = PROC_NULL;
    if (task == TASK_NULL) {
        return EINVAL;
    }
    /* convert the proc reference to a task reference */
    task_reference(task);
    /* task reference to map reference */
    map = get_task_map_reference(task);
    task_deallocate(task);

    if (map == VM_MAP_NULL) {
        return EINVAL; /* nothing allocated yet */
    }

    buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
    if (!buf) {
        error = ENOMEM;
        goto out;
    }

    /* 4 byte header to identify the version of the formatting of the data.
     * This should be incremented if the serialized vm_map entry format changes. */
    ((struct sysctl_buf_header *)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
    offset += sizeof(uint32_t);

    kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
        /* write the header; this happens only once, at the beginning, so we should have enough space */
        assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
        struct vm_map_info_hdr *out_hdr = (struct vm_map_info_hdr *)(buf + offset);
        out_hdr->vmi_nentries = nentries;
        offset += sizeof(struct vm_map_info_hdr);
        return KERN_SUCCESS;
    };

    kern_return_t (^write_entry)(void *) = ^kern_return_t (void *entry) {
        while (true) { /* try up to 2 times: first into the current buffer, then, after a flush, into an empty one */
            size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
            kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
            if (kr == KERN_NO_SPACE) {
                /* failed to write anything; flush the current buffer and try again */
                if (offset == 0) {
                    return KERN_FAILURE; /* no space even in an empty buffer; shouldn't really happen */
                }
                /* write out chunk */
                int out_error = SYSCTL_OUT(req, buf, offset);
                if (out_error) {
                    return KERN_FAILURE;
                }
                offset = 0;
                bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
                continue; /* retry the entry dump with the cleaned buffer */
            } else if (kr != KERN_SUCCESS) {
                return kr;
            }
            offset += left_sz;
            break;
        }
        return KERN_SUCCESS;
    };

    /* this foreach first calls the first callback with the number of entries, then calls the second for every entry;
     * when the buffer is exhausted, it is flushed to the sysctl and restarted */
    kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);

    if (kr != KERN_SUCCESS) {
        error = EINVAL; /* report the failure rather than silently returning partial data */
        goto out;
    }

    if (offset > 0) { /* last chunk */
        error = SYSCTL_OUT(req, buf, offset);
    }

out:
    if (buf != NULL) {
        kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
    }
    if (map != VM_MAP_NULL) {
        vm_map_deallocate(map);
    }
    return error;
}

SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
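/*
 * Usage sketch (user space, DEVELOPMENT/DEBUG, illustrative): this is a
 * CTLTYPE_NODE handler, so the target pid is appended to the MIB rather
 * than passed in the buffer. The buffer size below is an arbitrary
 * assumption; grow it on ENOMEM.
 *
 *   #include <sys/sysctl.h>
 *   #include <stdlib.h>
 *   #include <unistd.h>
 *
 *   int mib[CTL_MAXNAME];
 *   size_t miblen = CTL_MAXNAME;
 *   size_t len = 16 * 1024 * 1024;
 *   char *buf = malloc(len);
 *   if (buf && sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen) == 0) {
 *           mib[miblen] = getpid();              // or any target pid
 *           sysctl(mib, (u_int)(miblen + 1), buf, &len, NULL, 0);
 *   }
 *   free(buf);
 */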


#endif /* DEVELOPMENT || DEBUG */