1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Mach Operating System
30 * Copyright (c) 1987 Carnegie-Mellon University
31 * All rights reserved. The CMU software License Agreement specifies
32 * the terms and conditions for use and redistribution.
33 */
34 /*
35 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
36 * support for mandatory and extensible security protections. This notice
37 * is included in support of clause 2.2 (b) of the Apple Public License,
38 * Version 2.0.
39 */
40 #include <vm/vm_options.h>
41
42 #include <kern/ecc.h>
43 #include <kern/task.h>
44 #include <kern/thread.h>
45 #include <kern/debug.h>
46 #include <kern/extmod_statistics.h>
47 #include <mach/mach_traps.h>
48 #include <mach/port.h>
49 #include <mach/sdt.h>
50 #include <mach/task.h>
51 #include <mach/task_access.h>
52 #include <mach/task_special_ports.h>
53 #include <mach/time_value.h>
54 #include <mach/vm_map.h>
55 #include <mach/vm_param.h>
56 #include <mach/vm_prot.h>
57 #include <machine/machine_routines.h>
58
59 #include <sys/file_internal.h>
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/dir.h>
63 #include <sys/namei.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/vm.h>
67 #include <sys/file.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/mount.h>
70 #include <sys/xattr.h>
71 #include <sys/trace.h>
72 #include <sys/kernel.h>
73 #include <sys/ubc_internal.h>
74 #include <sys/user.h>
75 #include <sys/syslog.h>
76 #include <sys/stat.h>
77 #include <sys/sysproto.h>
78 #include <sys/mman.h>
79 #include <sys/sysctl.h>
80 #include <sys/cprotect.h>
81 #include <sys/kpi_socket.h>
82 #include <sys/kas_info.h>
83 #include <sys/socket.h>
84 #include <sys/socketvar.h>
85 #include <sys/random.h>
86 #include <sys/code_signing.h>
87 #if NECP
88 #include <net/necp.h>
89 #endif /* NECP */
90 #if SKYWALK
91 #include <skywalk/os_channel.h>
92 #endif /* SKYWALK */
93
94 #include <security/audit/audit.h>
95 #include <security/mac.h>
96 #include <bsm/audit_kevents.h>
97
98 #include <kern/kalloc.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_kern_xnu.h>
101 #include <vm/vm_pageout_xnu.h>
102
103 #include <mach/shared_region.h>
104 #include <vm/vm_shared_region_internal.h>
105
106 #include <vm/vm_dyld_pager_internal.h>
107 #include <vm/vm_protos_internal.h>
108 #include <vm/vm_compressor_info.h> /* for c_segment_info */
109 #include <vm/vm_compressor_xnu.h> /* for vm_compressor_serialize_segment_debug_info() */
110 #include <vm/vm_object_xnu.h> /* for vm_chead_select_t */
111 #include <vm/vm_memory_entry_xnu.h>
112 #include <vm/vm_iokit.h>
113 #include <vm/vm_reclaim_xnu.h>
114
115 #include <sys/kern_memorystatus.h>
116 #include <sys/kern_memorystatus_freeze.h>
117 #include <sys/proc_internal.h>
118
119 #include <mach-o/fixup-chains.h>
120
121 #if CONFIG_MACF
122 #include <security/mac_framework.h>
123 #endif
124
125 #include <kern/bits.h>
126
127 #if CONFIG_CSR
128 #include <sys/csr.h>
129 #endif /* CONFIG_CSR */
130 #include <sys/trust_caches.h>
131 #include <libkern/amfi/amfi.h>
132 #include <IOKit/IOBSD.h>
133
134 #if VM_MAP_DEBUG_APPLE_PROTECT
135 SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
136 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
137
138 #if DEVELOPMENT || DEBUG
139
140 extern int vm_object_cache_evict_all(void);
141 static int
142 sysctl_vm_object_cache_evict SYSCTL_HANDLER_ARGS
143 {
144 #pragma unused(arg1, arg2, req)
145 (void) vm_object_cache_evict_all();
146 return 0;
147 }
148
149 SYSCTL_PROC(_vm, OID_AUTO, object_cache_evict, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
150 0, 0, &sysctl_vm_object_cache_evict, "I", "");
151
152 static int
153 sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
154 {
155 #pragma unused(arg1, arg2)
156 vm_offset_t kaddr;
157 kern_return_t kr;
158 int error = 0;
159 int size = 0;
160
161 error = sysctl_handle_int(oidp, &size, 0, req);
162 if (error || !req->newptr) {
163 return error;
164 }
165
166 kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
167 0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);
168
169 if (kr == KERN_SUCCESS) {
170 kmem_free(kernel_map, kaddr, size);
171 }
172
173 return error;
174 }
175
176 SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
177 0, 0, &sysctl_kmem_alloc_contig, "I", "");
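/*
 * Usage sketch (illustrative, not compiled here): on a DEVELOPMENT || DEBUG
 * kernel the handler above can be driven from userspace with
 * sysctlbyname(3); the size below is a hypothetical test value.
 *
 *     int size = 4 * 16384;   // request a contiguous 4-page allocation
 *     if (sysctlbyname("vm.kmem_alloc_contig", NULL, NULL,
 *         &size, sizeof(size)) == -1) {
 *         perror("vm.kmem_alloc_contig");   // absent on RELEASE kernels
 *     }
 */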
178
179 extern int vm_region_footprint;
180 SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");
181
182 static int
183 sysctl_kmem_gobj_stats SYSCTL_HANDLER_ARGS
184 {
185 #pragma unused(arg1, arg2, oidp)
186 kmem_gobj_stats stats = kmem_get_gobj_stats();
187
188 return SYSCTL_OUT(req, &stats, sizeof(stats));
189 }
190
191 SYSCTL_PROC(_vm, OID_AUTO, kmem_gobj_stats,
192 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
193 0, 0, &sysctl_kmem_gobj_stats, "S,kmem_gobj_stats", "");
194
195 #endif /* DEVELOPMENT || DEBUG */
196
197 static int
198 sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
199 {
200 #pragma unused(arg1, arg2, oidp)
201 int error = 0;
202 int value;
203
204 value = task_self_region_footprint();
205 error = SYSCTL_OUT(req, &value, sizeof(int));
206 if (error) {
207 return error;
208 }
209
210 if (!req->newptr) {
211 return 0;
212 }
213
214 error = SYSCTL_IN(req, &value, sizeof(int));
215 if (error) {
216 return error;
217 }
218 task_self_region_footprint_set(value);
219 return 0;
220 }
221 SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");
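/*
 * Minimal userspace sketch (illustrative; the handler operates on the
 * calling task, so a test reads and writes its own footprint flag):
 *
 *     int value;
 *     size_t len = sizeof(value);
 *     sysctlbyname("vm.self_region_footprint", &value, &len, NULL, 0); // get
 *     value = 1;
 *     sysctlbyname("vm.self_region_footprint", NULL, NULL,
 *         &value, sizeof(value));                                      // set
 */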
222
223 static int
224 sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
225 {
226 #pragma unused(arg1, arg2, oidp)
227 int error = 0;
228 int value;
229
230 value = (1 << thread_self_region_page_shift());
231 error = SYSCTL_OUT(req, &value, sizeof(int));
232 if (error) {
233 return error;
234 }
235
236 if (!req->newptr) {
237 return 0;
238 }
239
240 error = SYSCTL_IN(req, &value, sizeof(int));
241 if (error) {
242 return error;
243 }
244
245 if (value != 0 && value != 4096 && value != 16384) {
246 return EINVAL;
247 }
248
249 #if !__ARM_MIXED_PAGE_SIZE__
250 if (value != vm_map_page_size(current_map())) {
251 return EINVAL;
252 }
253 #endif /* !__ARM_MIXED_PAGE_SIZE__ */
254
255 thread_self_region_page_shift_set(bit_first(value));
256 return 0;
257 }
258 SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
259
260 static int
261 sysctl_vm_self_region_info_flags SYSCTL_HANDLER_ARGS
262 {
263 #pragma unused(arg1, arg2, oidp)
264 int error = 0;
265 int value;
266 kern_return_t kr;
267
268 value = task_self_region_info_flags();
269 error = SYSCTL_OUT(req, &value, sizeof(int));
270 if (error) {
271 return error;
272 }
273
274 if (!req->newptr) {
275 return 0;
276 }
277
278 error = SYSCTL_IN(req, &value, sizeof(int));
279 if (error) {
280 return error;
281 }
282
283 kr = task_self_region_info_flags_set(value);
284 if (kr != KERN_SUCCESS) {
285 return EINVAL;
286 }
287
288 return 0;
289 }
290 SYSCTL_PROC(_vm, OID_AUTO, self_region_info_flags, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_info_flags, "I", "");
291
292
293 #if DEVELOPMENT || DEBUG
294 extern int panic_on_unsigned_execute;
295 SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");
296
297 extern int vm_log_xnu_user_debug;
298 SYSCTL_INT(_vm, OID_AUTO, log_xnu_user_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_xnu_user_debug, 0, "");
299 #endif /* DEVELOPMENT || DEBUG */
300
301 extern int vm_log_map_delete_permanent_prot_none;
302 SYSCTL_INT(_vm, OID_AUTO, log_map_delete_permanent_prot_none, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_log_map_delete_permanent_prot_none, 0, "");
303
304 extern int cs_executable_create_upl;
305 extern int cs_executable_wire;
306 SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
307 SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");
308
309 extern int apple_protect_pager_count;
310 extern int apple_protect_pager_count_mapped;
311 extern unsigned int apple_protect_pager_cache_limit;
312 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
313 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
314 SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");
315
316 #if DEVELOPMENT || DEBUG
317 extern int radar_20146450;
318 SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");
319
320 extern int macho_printf;
321 SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");
322
323 extern int apple_protect_pager_data_request_debug;
324 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");
325
326 extern unsigned int vm_object_copy_delayed_paging_wait_disable;
327 EXPERIMENT_FACTOR_LEGACY_UINT(_vm, vm_object_copy_delayed_paging_wait_disable, &vm_object_copy_delayed_paging_wait_disable, FALSE, TRUE, "");
328
329 __enum_closed_decl(vm_submap_test_op, uint32_t, {
330 vsto_make_submap = 1, /* make submap from entries in current_map()
331 * at start..end, offset ignored */
332 vsto_remap_submap = 2, /* map in current_map() at start..end,
333 * from parent address submap_base_address
334 * and submap address offset */
335 vsto_end
336 });
337
338 static int
339 sysctl_vm_submap_test_ctl SYSCTL_HANDLER_ARGS
340 {
341 int error;
342 struct {
343 vm_submap_test_op op;
344 mach_vm_address_t submap_base_address;
345 mach_vm_address_t start;
346 mach_vm_address_t end;
347 mach_vm_address_t offset;
348 } args;
349 if (req->newlen != sizeof(args)) {
350 return EINVAL;
351 }
352 error = SYSCTL_IN(req, &args, sizeof(args));
353 if (error) {
354 return error;
355 }
356
357 switch (args.op) {
358 case vsto_make_submap:
359 vm_map_testing_make_sealed_submap(current_map(), args.start, args.end);
360 break;
361 case vsto_remap_submap:
362 vm_map_testing_remap_submap(current_map(),
363 args.submap_base_address, args.start, args.end, args.offset);
364 break;
365 default:
366 return EINVAL;
367 }
368
369 return 0;
370 }
371 SYSCTL_PROC(_vm, OID_AUTO, submap_test_ctl, CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, &sysctl_vm_submap_test_ctl, "-", "");
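/*
 * Test-harness sketch (hypothetical addresses; DEVELOPMENT || DEBUG only):
 * the handler rejects any write whose length differs from sizeof(args), so
 * the whole struct -- including the padding after the 32-bit op field on
 * LP64 -- must be passed in a single write:
 *
 *     struct {
 *         uint32_t op;                  // vsto_make_submap == 1
 *         uint64_t submap_base_address;
 *         uint64_t start, end, offset;
 *     } args = { 1, 0, base, base + size, 0 };
 *     sysctlbyname("vm.submap_test_ctl", NULL, NULL, &args, sizeof(args));
 */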
372
373 #if __arm64__
374 /* These are meant to support the page table accounting unit test. */
375 extern unsigned int arm_hardware_page_size;
376 extern unsigned int arm_pt_desc_size;
377 extern unsigned int arm_pt_root_size;
378 extern unsigned int inuse_user_tteroot_count;
379 extern unsigned int inuse_kernel_tteroot_count;
380 extern unsigned int inuse_user_ttepages_count;
381 extern unsigned int inuse_kernel_ttepages_count;
382 extern unsigned int inuse_user_ptepages_count;
383 extern unsigned int inuse_kernel_ptepages_count;
384 SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
385 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
386 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
387 SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
388 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
389 SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
390 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
391 SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
392 SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
393 #if !CONFIG_SPTM
394 extern unsigned int free_page_size_tt_count;
395 extern unsigned int free_tt_count;
396 SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
397 SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
398 #endif
399 #if DEVELOPMENT || DEBUG
400 extern unsigned long pmap_asid_flushes;
401 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
402 extern unsigned long pmap_asid_hits;
403 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
404 extern unsigned long pmap_asid_misses;
405 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
406 extern unsigned long pmap_speculation_restrictions;
407 SYSCTL_ULONG(_vm, OID_AUTO, pmap_speculation_restrictions, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_speculation_restrictions, "");
408 #endif
409 #endif /* __arm64__ */
410 #endif /* DEVELOPMENT || DEBUG */
411
412 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
413 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
414 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
415 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
416 SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
417 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
418 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
419 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
420 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
421 #if VM_SCAN_FOR_SHADOW_CHAIN
422 static int vm_shadow_max_enabled = 0; /* Disabled by default */
423 extern int proc_shadow_max(void);
424 static int
425 vm_shadow_max SYSCTL_HANDLER_ARGS
426 {
427 #pragma unused(arg1, arg2, oidp)
428 int value = 0;
429
430 if (vm_shadow_max_enabled) {
431 value = proc_shadow_max();
432 }
433
434 return SYSCTL_OUT(req, &value, sizeof(value));
435 }
436 SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
437 0, 0, &vm_shadow_max, "I", "");
438
439 SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");
440
441 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
442
443 SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");
444
445 #if PAGE_SLEEP_WITH_INHERITOR
446 #if DEVELOPMENT || DEBUG
447 extern uint32_t page_worker_table_size;
448 SYSCTL_INT(_vm, OID_AUTO, page_worker_table_size, CTLFLAG_RD | CTLFLAG_LOCKED, &page_worker_table_size, 0, "");
449 SCALABLE_COUNTER_DECLARE(page_worker_hash_collisions);
450 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_hash_collisions, page_worker_hash_collisions, "");
451 SCALABLE_COUNTER_DECLARE(page_worker_inheritor_sleeps);
452 SYSCTL_SCALABLE_COUNTER(_vm, page_worker_inheritor_sleeps, page_worker_inheritor_sleeps, "");
453 #endif /* DEVELOPMENT || DEBUG */
454 #endif /* PAGE_SLEEP_WITH_INHERITOR */
455
456 #if COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1
457 extern uint32_t vm_cheads;
458 extern vm_chead_select_t vm_chead_select;
459 extern boolean_t vm_chead_rehint;
460 #if DEVELOPMENT || DEBUG
461 SYSCTL_UINT(_vm, OID_AUTO, compressor_heads, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cheads, 0, "");
462 SYSCTL_UINT(_vm, OID_AUTO, compressor_head_select, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_select, 0, "");
463 SYSCTL_INT(_vm, OID_AUTO, compressor_head_rehint, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_chead_rehint, 0, "");
464 #endif /* DEVELOPMENT || DEBUG */
465 EXPERIMENT_FACTOR_UINT(compressor_heads, &vm_cheads, 1, COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT, "");
466 EXPERIMENT_FACTOR_UINT(compressor_head_select, &vm_chead_select, CSEL_MIN, CSEL_MAX, "");
467 EXPERIMENT_FACTOR_INT(compressor_head_rehint, &vm_chead_rehint, 0, 1, "");
468 #endif /* COMPRESSOR_PAGEOUT_CHEADS_MAX_COUNT > 1 */
469
470 /*
 * Sysctls related to data/stack execution. See osfmk/vm/vm_map.c
472 */
473
474 #if DEVELOPMENT || DEBUG
475 extern int allow_stack_exec, allow_data_exec;
476
477 SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
478 SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");
479
480 #endif /* DEVELOPMENT || DEBUG */
481
482 static const char *prot_values[] = {
483 "none",
484 "read-only",
485 "write-only",
486 "read-write",
487 "execute-only",
488 "read-execute",
489 "write-execute",
490 "read-write-execute"
491 };
492
493 void
494 log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
495 {
496 printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
497 current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
498 }
499
500 /*
501 * shared_region_unnest_logging: level of logging of unnesting events
502 * 0 - no logging
503 * 1 - throttled logging of unexpected unnesting events (default)
504 * 2 - unthrottled logging of unexpected unnesting events
505 * 3+ - unthrottled logging of all unnesting events
506 */
507 int shared_region_unnest_logging = 1;
508
509 SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
510 &shared_region_unnest_logging, 0, "");
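/* For example, "sysctl vm.shared_region_unnest_logging=3" on a booted
 * system enables unthrottled logging of all unnesting events. */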
511
512 int vm_shared_region_unnest_log_interval = 10;
513 int shared_region_unnest_log_count_threshold = 5;
514
515
516 #if XNU_TARGET_OS_OSX
517
518 #if defined (__x86_64__)
519 static int scdir_enforce = 1;
520 #else /* defined (__x86_64__) */
521 static int scdir_enforce = 0; /* AOT caches live elsewhere */
522 #endif /* defined (__x86_64__) */
523
524 static char *scdir_path[] = {
525 "/System/Library/dyld/",
526 "/System/Volumes/Preboot/Cryptexes/OS/System/Library/dyld",
527 "/System/Cryptexes/OS/System/Library/dyld",
528 NULL
529 };
530
531 #else /* XNU_TARGET_OS_OSX */
532
533 static int scdir_enforce = 0;
534 static char *scdir_path[] = {
535 "/System/Library/Caches/com.apple.dyld/",
536 "/private/preboot/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
537 "/System/Cryptexes/OS/System/Library/Caches/com.apple.dyld",
538 NULL
539 };
540
541 #endif /* XNU_TARGET_OS_OSX */
542
543 static char *driverkit_scdir_path[] = {
544 "/System/DriverKit/System/Library/dyld/",
545 #if XNU_TARGET_OS_OSX
546 "/System/Volumes/Preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
547 #else
548 "/private/preboot/Cryptexes/OS/System/DriverKit/System/Library/dyld",
549 #endif /* XNU_TARGET_OS_OSX */
550 "/System/Cryptexes/OS/System/DriverKit/System/Library/dyld",
551 NULL
552 };
553
554 #ifndef SECURE_KERNEL
555 static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
556 {
557 #if CONFIG_CSR
558 if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
559 printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
560 return EPERM;
561 }
562 #endif /* CONFIG_CSR */
563 return sysctl_handle_int(oidp, arg1, arg2, req);
564 }
565
566 SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
567 #endif
568
569 /* These log rate throttling state variables aren't thread safe, but
570 * are sufficient unto the task.
571 */
572 static int64_t last_unnest_log_time = 0;
573 static int shared_region_unnest_log_count = 0;
574
575 void
576 log_unnest_badness(
577 vm_map_t m,
578 vm_map_offset_t s,
579 vm_map_offset_t e,
580 boolean_t is_nested_map,
581 vm_map_offset_t lowest_unnestable_addr)
582 {
583 struct timeval tv;
584
585 if (shared_region_unnest_logging == 0) {
586 return;
587 }
588
589 if (shared_region_unnest_logging <= 2 &&
590 is_nested_map &&
591 s >= lowest_unnestable_addr) {
592 /*
593 * Unnesting of writable map entries is fine.
594 */
595 return;
596 }
597
598 if (shared_region_unnest_logging <= 1) {
599 microtime(&tv);
600 if ((tv.tv_sec - last_unnest_log_time) <
601 vm_shared_region_unnest_log_interval) {
602 if (shared_region_unnest_log_count++ >
603 shared_region_unnest_log_count_threshold) {
604 return;
605 }
606 } else {
607 last_unnest_log_time = tv.tv_sec;
608 shared_region_unnest_log_count = 0;
609 }
610 }
611
612 DTRACE_VM4(log_unnest_badness,
613 vm_map_t, m,
614 vm_map_offset_t, s,
615 vm_map_offset_t, e,
616 vm_map_offset_t, lowest_unnestable_addr);
617 printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
618 }
619
620 uint64_t
621 vm_purge_filebacked_pagers(void)
622 {
623 uint64_t pages_purged;
624
625 pages_purged = 0;
626 pages_purged += apple_protect_pager_purge_all();
627 pages_purged += shared_region_pager_purge_all();
628 pages_purged += dyld_pager_purge_all();
629 #if DEVELOPMENT || DEBUG
630 printf("%s:%d pages purged: %llu\n", __FUNCTION__, __LINE__, pages_purged);
631 #endif /* DEVELOPMENT || DEBUG */
632 return pages_purged;
633 }
634
635 int
636 useracc(
637 user_addr_ut addr_u,
638 user_size_ut len_u,
639 int prot)
640 {
641 vm_map_t map;
642 vm_prot_t vm_prot = VM_PROT_WRITE;
643
644 map = current_map();
645
646 if (prot == B_READ) {
647 vm_prot = VM_PROT_READ;
648 }
649
650 return vm_map_check_protection(map, addr_u,
651 vm_sanitize_compute_ut_end(addr_u, len_u), vm_prot,
652 VM_SANITIZE_CALLER_USERACC);
653 }
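/*
 * Caller sketch (illustrative): useracc() keeps the historical BSD
 * contract of returning non-zero iff the whole range carries the
 * requested protection, so a typical caller gates direct access to
 * user memory on it:
 *
 *     if (!useracc(uaddr, len, B_READ)) {
 *         return EFAULT;   // some byte of [uaddr, uaddr+len) not readable
 *     }
 */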
654
655 #if XNU_PLATFORM_MacOSX
656 static __attribute__((always_inline, warn_unused_result))
657 kern_return_t
658 vslock_sanitize(
659 vm_map_t map,
660 user_addr_ut addr_u,
661 user_size_ut len_u,
662 vm_sanitize_caller_t vm_sanitize_caller,
663 vm_map_offset_t *start,
664 vm_map_offset_t *end,
665 vm_map_size_t *size)
666 {
667 return vm_sanitize_addr_size(addr_u, len_u, vm_sanitize_caller,
668 map,
669 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
670 size);
671 }
672 #endif /* XNU_PLATFORM_MacOSX */
673
674 int
675 vslock(user_addr_ut addr, user_size_ut len)
676 {
677 kern_return_t kret;
678
679 #if XNU_PLATFORM_MacOSX
680 /*
681 * Preserve previous behavior on macOS for overflows, for binary
682 * compatibility: i.e. return success for overflows without doing
683 * anything. For error compatibility, overflow errors return
684 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr converts
685 * to KERN_SUCCESS.
686 */
687 vm_map_offset_t start, end;
688 vm_map_size_t size;
689
690 kret = vslock_sanitize(current_map(),
691 addr,
692 len,
693 VM_SANITIZE_CALLER_VSLOCK,
694 &start,
695 &end,
696 &size);
697 if (__improbable(kret != KERN_SUCCESS)) {
698 switch (vm_sanitize_get_kr(kret)) {
699 case KERN_SUCCESS:
700 return 0;
701 case KERN_INVALID_ADDRESS:
702 case KERN_NO_SPACE:
703 return ENOMEM;
704 case KERN_PROTECTION_FAILURE:
705 return EACCES;
706 default:
707 return EINVAL;
708 }
709 }
710 #endif /* XNU_PLATFORM_MacOSX */
711
712 kret = vm_map_wire_kernel(current_map(), addr,
713 vm_sanitize_compute_ut_end(addr, len),
714 vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
715 VM_KERN_MEMORY_BSD,
716 FALSE);
717
718 switch (kret) {
719 case KERN_SUCCESS:
720 return 0;
721 case KERN_INVALID_ADDRESS:
722 case KERN_NO_SPACE:
723 return ENOMEM;
724 case KERN_PROTECTION_FAILURE:
725 return EACCES;
726 default:
727 return EINVAL;
728 }
729 }
730
731 int
732 vsunlock(user_addr_ut addr, user_size_ut len, __unused int dirtied)
733 {
734 #if FIXME /* [ */
735 pmap_t pmap;
736 vm_page_t pg;
737 vm_map_offset_t vaddr;
738 ppnum_t paddr;
739 #endif /* FIXME ] */
740 kern_return_t kret;
741 vm_map_t map;
742
743 map = current_map();
744
745 #if FIXME /* [ */
746 if (dirtied) {
747 pmap = get_task_pmap(current_task());
748 for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
749 vaddr < vm_map_round_page(addr + len, PAGE_MASK);
750 vaddr += PAGE_SIZE) {
751 paddr = pmap_find_phys(pmap, vaddr);
752 pg = PHYS_TO_VM_PAGE(paddr);
753 vm_page_set_modified(pg);
754 }
755 }
756 #endif /* FIXME ] */
757 #ifdef lint
758 dirtied++;
759 #endif /* lint */
760
761 #if XNU_PLATFORM_MacOSX
762 /*
763 * Preserve previous behavior on macOS for overflows, for binary
764 * compatibility: i.e. return success for overflows without doing
765 * anything. For error compatibility, overflow errors return
766 * VM_ERR_RETURN_NOW (on macOS), which vm_sanitize_get_kr converts
767 * to KERN_SUCCESS.
768 */
769 vm_map_offset_t start, end;
770 vm_map_size_t size;
771
772 kret = vslock_sanitize(map,
773 addr,
774 len,
775 VM_SANITIZE_CALLER_VSUNLOCK,
776 &start,
777 &end,
778 &size);
779 if (__improbable(kret != KERN_SUCCESS)) {
780 switch (vm_sanitize_get_kr(kret)) {
781 case KERN_SUCCESS:
782 return 0;
783 case KERN_INVALID_ADDRESS:
784 case KERN_NO_SPACE:
785 return ENOMEM;
786 case KERN_PROTECTION_FAILURE:
787 return EACCES;
788 default:
789 return EINVAL;
790 }
791 }
792 #endif /* XNU_PLATFORM_MacOSX */
793
794 kret = vm_map_unwire(map, addr,
795 vm_sanitize_compute_ut_end(addr, len), false);
796 switch (kret) {
797 case KERN_SUCCESS:
798 return 0;
799 case KERN_INVALID_ADDRESS:
800 case KERN_NO_SPACE:
801 return ENOMEM;
802 case KERN_PROTECTION_FAILURE:
803 return EACCES;
804 default:
805 return EINVAL;
806 }
807 }
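/*
 * Pairing sketch (illustrative; uaddr/len stand in for the caller's
 * user_addr_ut/user_size_ut values): physio-style callers wire a user
 * buffer for the duration of an I/O and unwire it afterwards, passing
 * whether the buffer was written:
 *
 *     if ((error = vslock(uaddr, len)) != 0) {
 *         return error;    // ENOMEM / EACCES / EINVAL, as mapped above
 *     }
 *     // ... perform I/O directly on the wired pages ...
 *     error = vsunlock(uaddr, len, did_write);
 */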
808
809 int
810 subyte(
811 user_addr_t addr,
812 int byte)
813 {
814 char character;
815
816 character = (char)byte;
817 return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
818 }
819
820 int
821 suibyte(
822 user_addr_t addr,
823 int byte)
824 {
825 char character;
826
827 character = (char)byte;
828 return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
829 }
830
831 int
832 fubyte(user_addr_t addr)
833 {
834 unsigned char byte;
835
836 if (copyin(addr, (void *) &byte, sizeof(char))) {
837 return -1;
838 }
839 return byte;
840 }
841
842 int
843 fuibyte(user_addr_t addr)
844 {
845 unsigned char byte;
846
847 if (copyin(addr, (void *) &(byte), sizeof(char))) {
848 return -1;
849 }
850 return byte;
851 }
852
853 int
854 suword(
855 user_addr_t addr,
856 long word)
857 {
858 return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
859 }
860
861 long
862 fuword(user_addr_t addr)
863 {
864 long word = 0;
865
866 if (copyin(addr, (void *) &word, sizeof(int))) {
867 return -1;
868 }
869 return word;
870 }
871
872 /* suiword and fuiword are the same as suword and fuword, respectively */
873
874 int
875 suiword(
876 user_addr_t addr,
877 long word)
878 {
879 return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
880 }
881
882 long
883 fuiword(user_addr_t addr)
884 {
885 long word = 0;
886
887 if (copyin(addr, (void *) &word, sizeof(int))) {
888 return -1;
889 }
890 return word;
891 }
892
893 /*
894 * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
895 * fetching and setting of process-sized size_t and pointer values.
896 */
897 int
898 sulong(user_addr_t addr, int64_t word)
899 {
900 if (IS_64BIT_PROCESS(current_proc())) {
901 return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
902 } else {
903 return suiword(addr, (long)word);
904 }
905 }
906
907 int64_t
908 fulong(user_addr_t addr)
909 {
910 int64_t longword;
911
912 if (IS_64BIT_PROCESS(current_proc())) {
913 if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
914 return -1;
915 }
916 return longword;
917 } else {
918 return (int64_t)fuiword(addr);
919 }
920 }
921
922 int
923 suulong(user_addr_t addr, uint64_t uword)
924 {
925 if (IS_64BIT_PROCESS(current_proc())) {
926 return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
927 } else {
928 return suiword(addr, (uint32_t)uword);
929 }
930 }
931
932 uint64_t
933 fuulong(user_addr_t addr)
934 {
935 uint64_t ulongword;
936
937 if (IS_64BIT_PROCESS(current_proc())) {
938 if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
939 return -1ULL;
940 }
941 return ulongword;
942 } else {
943 return (uint64_t)fuiword(addr);
944 }
945 }
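/*
 * Worked example (illustrative, little-endian): these helpers pick the
 * copy size from the current process's ABI, so the same store behaves
 * differently per process:
 *
 *     suulong(uaddr, 0x1122334455667788ULL);
 *     // 64-bit process: all 8 bytes stored; fuulong() returns the value.
 *     // 32-bit process: suiword() stores the low 4 bytes (0x55667788)
 *     //                 and fuulong() returns that value widened to 64 bits.
 */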
946
947 int
948 swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
949 {
950 return ENOTSUP;
951 }
952
953 #if defined(SECURE_KERNEL)
954 static int kern_secure_kernel = 1;
955 #else
956 static int kern_secure_kernel = 0;
957 #endif
958
959 SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
960 SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
961 &shared_region_trace_level, 0, "");
962 SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
963 &shared_region_version, 0, "");
964 SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
965 &shared_region_persistence, 0, "");
966
967 /*
968 * shared_region_check_np:
969 *
970 * This system call is intended for dyld.
971 *
972 * dyld calls this when any process starts to see if the process's shared
973 * region is already set up and ready to use.
974 * This call returns the base address of the first mapping in the
975 * process's shared region.
976 * dyld will then check what's mapped at that address.
977 *
978 * If the shared region is empty, dyld will then attempt to map the shared
979 * cache file in the shared region via the shared_region_map_and_slide_2_np()
980 * system call.
981 *
982 * If something's already mapped in the shared region, dyld will check if it
983 * matches the shared cache it would like to use for that process.
984 * If it matches, everything's ready and the process can proceed and use the
985 * shared region.
986 * If it doesn't match, dyld will unmap the shared region and map the shared
987 * cache into the process's address space via mmap().
988 *
989 * A NULL pointer argument can be used by dyld to indicate it has unmapped
990 * the shared region. We will remove the shared_region reference from the task.
991 *
992 * ERROR VALUES
993 * EINVAL no shared region
994 * ENOMEM shared region is empty
995 * EFAULT bad address for "start_address"
996 */
997 int
998 shared_region_check_np(
999 __unused struct proc *p,
1000 struct shared_region_check_np_args *uap,
1001 __unused int *retvalp)
1002 {
1003 vm_shared_region_t shared_region;
1004 mach_vm_offset_t start_address = 0;
1005 int error = 0;
1006 kern_return_t kr = KERN_FAILURE;
1007 task_t task = current_task();
1008
1009 SHARED_REGION_TRACE_DEBUG(
1010 ("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
1011 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1012 proc_getpid(p), p->p_comm,
1013 (uint64_t)uap->start_address));
1014
1015 /*
1016 * Special value of start_address used to indicate that map_with_linking() should
1017 * no longer be allowed in this process
1018 */
1019 if (uap->start_address == (task_get_64bit_addr(task) ? DYLD_VM_END_MWL : (uint32_t)DYLD_VM_END_MWL)) {
1020 p->p_disallow_map_with_linking = TRUE;
1021 return 0;
1022 }
1023
1024 /* retrieve the current task's shared region */
1025 shared_region = vm_shared_region_get(task);
1026 if (shared_region != NULL) {
1027 /*
1028 * A NULL argument is used by dyld to indicate the task
1029 * has unmapped its shared region.
1030 */
1031 if (uap->start_address == 0) {
1032 /* unmap it first */
1033 vm_shared_region_remove(task, shared_region);
1034 vm_shared_region_set(task, NULL);
1035 } else {
1036 /* retrieve address of its first mapping... */
1037 kr = vm_shared_region_start_address(shared_region, &start_address);
1038 if (kr != KERN_SUCCESS) {
1039 SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1040 "check_np(0x%llx) "
1041 "vm_shared_region_start_address() returned 0x%x\n",
1042 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1043 proc_getpid(p), p->p_comm,
1044 (uint64_t)uap->start_address, kr));
1045 error = ENOMEM;
1046 }
1047 if (error == 0) {
1048 /* Insert the shared region submap and various bits of debug info into the task. */
1049 kr = vm_shared_region_update_task(task, shared_region, start_address);
1050 if (kr != KERN_SUCCESS) {
1051 SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1052 "check_np(0x%llx) "
1053 "vm_shared_update_task() returned 0x%x\n",
1054 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1055 proc_getpid(p), p->p_comm,
1056 (uint64_t)uap->start_address, kr));
1057
1058 error = ENOMEM;
1059 }
1060 }
1061 #if __has_feature(ptrauth_calls)
1062 /*
1063 * Remap any section of the shared library that
1064 * has authenticated pointers into private memory.
1065 */
1066 if ((error == 0) && (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS)) {
1067 SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
1068 "check_np(0x%llx) "
1069 "vm_shared_region_auth_remap() failed\n",
1070 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1071 proc_getpid(p), p->p_comm,
1072 (uint64_t)uap->start_address));
1073 error = ENOMEM;
1074 }
1075 #endif /* __has_feature(ptrauth_calls) */
1076 /* Give the start address to the caller */
1077 if (error == 0) {
1078 error = copyout(&start_address,
1079 (user_addr_t) uap->start_address,
1080 sizeof(start_address));
1081 if (error != 0) {
1082 SHARED_REGION_TRACE_ERROR(
1083 ("shared_region: %p [%d(%s)] "
1084 "check_np(0x%llx) "
1085 "copyout(0x%llx) error %d\n",
1086 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1087 proc_getpid(p), p->p_comm,
1088 (uint64_t)uap->start_address, (uint64_t)start_address,
1089 error));
1090 }
1091 }
1092 }
1093 vm_shared_region_deallocate(shared_region);
1094 } else {
1095 /* no shared region ! */
1096 error = EINVAL;
1097 }
1098
1099 SHARED_REGION_TRACE_DEBUG(
1100 ("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
1101 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1102 proc_getpid(p), p->p_comm,
1103 (uint64_t)uap->start_address, (uint64_t)start_address, error));
1104
1105 return error;
1106 }
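/*
 * Caller sketch (illustrative; this call is private to dyld, and the
 * userspace symbol name is an assumption): dyld passes the address of a
 * slot to be filled with the region's base address and falls back to
 * mapping the shared cache itself on error:
 *
 *     uint64_t base = 0;
 *     if (__shared_region_check_np(&base) == 0) {
 *         // inspect what's mapped at "base"; reuse it if it matches
 *     } else {
 *         // no usable region: map the cache via
 *         // shared_region_map_and_slide_2_np() or plain mmap()
 *     }
 */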
1107
1108
1109 static int
1110 shared_region_copyin(
1111 struct proc *p,
1112 user_addr_t user_addr,
1113 unsigned int count,
1114 unsigned int element_size,
1115 void *kernel_data)
1116 {
1117 int error = 0;
1118 vm_size_t size = count * element_size;
1119
1120 error = copyin(user_addr, kernel_data, size);
1121 if (error) {
1122 SHARED_REGION_TRACE_ERROR(
1123 ("shared_region: %p [%d(%s)] map(): "
1124 "copyin(0x%llx, %ld) failed (error=%d)\n",
1125 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1126 proc_getpid(p), p->p_comm,
1127 (uint64_t)user_addr, (long)size, error));
1128 }
1129 return error;
1130 }
1131
1132 /*
1133 * A reasonable upper limit to prevent overflow of allocation/copyin.
1134 */
1135 #define _SR_FILE_MAPPINGS_MAX_FILES 256
1136
1137 /* forward declaration */
1138 __attribute__((noinline))
1139 static void shared_region_map_and_slide_cleanup(
1140 struct proc *p,
1141 uint32_t files_count,
1142 struct _sr_file_mappings *sr_file_mappings,
1143 struct vm_shared_region *shared_region);
1144
1145 /*
1146 * Setup part of _shared_region_map_and_slide().
1147 * It had to be broken out of _shared_region_map_and_slide() to
1148 * prevent compiler inlining from blowing out the stack.
1149 */
1150 __attribute__((noinline))
1151 static int
1152 shared_region_map_and_slide_setup(
1153 struct proc *p,
1154 uint32_t files_count,
1155 struct shared_file_np *files,
1156 uint32_t mappings_count,
1157 struct shared_file_mapping_slide_np *mappings,
1158 struct _sr_file_mappings **sr_file_mappings,
1159 struct vm_shared_region **shared_region_ptr,
1160 struct vnode *rdir_vp)
1161 {
1162 int error = 0;
1163 struct _sr_file_mappings *srfmp;
1164 uint32_t mappings_next;
1165 struct vnode_attr va;
1166 off_t fs;
1167 #if CONFIG_MACF
1168 vm_prot_t maxprot = VM_PROT_ALL;
1169 #endif
1170 uint32_t i;
1171 struct vm_shared_region *shared_region = NULL;
1172 boolean_t is_driverkit = task_is_driver(current_task());
1173
1174 SHARED_REGION_TRACE_DEBUG(
1175 ("shared_region: %p [%d(%s)] -> map_and_slide_setup\n",
1176 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1177 proc_getpid(p), p->p_comm));
1178
1179 if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
1180 error = E2BIG;
1181 goto done;
1182 }
1183 if (files_count == 0) {
1184 error = EINVAL;
1185 goto done;
1186 }
1187 *sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
1188 Z_WAITOK | Z_ZERO);
1189 if (*sr_file_mappings == NULL) {
1190 error = ENOMEM;
1191 goto done;
1192 }
1193 mappings_next = 0;
1194 for (i = 0; i < files_count; i++) {
1195 srfmp = &(*sr_file_mappings)[i];
1196 srfmp->fd = files[i].sf_fd;
1197 srfmp->mappings_count = files[i].sf_mappings_count;
1198 srfmp->mappings = &mappings[mappings_next];
1199 mappings_next += srfmp->mappings_count;
1200 if (mappings_next > mappings_count) {
1201 error = EINVAL;
1202 goto done;
1203 }
1204 srfmp->slide = files[i].sf_slide;
1205 }
1206
1207 /* get the process's shared region (setup in vm_map_exec()) */
1208 shared_region = vm_shared_region_get(current_task());
1209 *shared_region_ptr = shared_region;
1210 if (shared_region == NULL) {
1211 SHARED_REGION_TRACE_ERROR(
1212 ("shared_region: %p [%d(%s)] map(): "
1213 "no shared region\n",
1214 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1215 proc_getpid(p), p->p_comm));
1216 error = EINVAL;
1217 goto done;
1218 }
1219
1220 /*
1221 * Check that the shared region matches the current root
1222 * directory of this process. If not, deny the mapping to
1223 * avoid tainting the shared region with something that
1224 * doesn't belong in it.
1225 */
1226 struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
1227 if (sr_vnode != NULL ? rdir_vp != sr_vnode : rdir_vp != rootvnode) {
1228 SHARED_REGION_TRACE_ERROR(
1229 ("shared_region: map(%p) root_dir mismatch\n",
1230 (void *)VM_KERNEL_ADDRPERM(current_thread())));
1231 error = EPERM;
1232 goto done;
1233 }
1234
1235
1236 for (srfmp = &(*sr_file_mappings)[0];
1237 srfmp < &(*sr_file_mappings)[files_count];
1238 srfmp++) {
1239 if (srfmp->mappings_count == 0) {
1240 /* no mappings here... */
1241 continue;
1242 }
1243
1244 /*
1245 * A file descriptor of -1 is used to indicate that the data
1246 * to be put in the shared region for this mapping comes directly
1247 * from the process's address space. Ensure the mapping is properly aligned.
1248 */
1249 if (srfmp->fd == -1) {
1250 /* only allow one mapping per fd */
1251 if (srfmp->mappings_count > 1) {
1252 SHARED_REGION_TRACE_ERROR(
1253 ("shared_region: %p [%d(%s)] map data >1 mapping\n",
1254 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1255 proc_getpid(p), p->p_comm));
1256 error = EINVAL;
1257 goto done;
1258 }
1259
1260 /*
1261 * The destination address and size must be page aligned.
1262 */
1263 struct shared_file_mapping_slide_np *mapping = &srfmp->mappings[0];
1264 mach_vm_address_t dest_addr = mapping->sms_address;
1265 mach_vm_size_t map_size = mapping->sms_size;
1266 if (!vm_map_page_aligned(dest_addr, vm_map_page_mask(current_map()))) {
1267 SHARED_REGION_TRACE_ERROR(
1268 ("shared_region: %p [%d(%s)] map data destination 0x%llx not aligned\n",
1269 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1270 proc_getpid(p), p->p_comm, dest_addr));
1271 error = EINVAL;
1272 goto done;
1273 }
1274 if (!vm_map_page_aligned(map_size, vm_map_page_mask(current_map()))) {
1275 SHARED_REGION_TRACE_ERROR(
1276 ("shared_region: %p [%d(%s)] map data size 0x%llx not aligned\n",
1277 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1278 proc_getpid(p), p->p_comm, map_size));
1279 error = EINVAL;
1280 goto done;
1281 }
1282 continue;
1283 }
1284
1285 /* get file structure from file descriptor */
1286 error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
1287 if (error) {
1288 SHARED_REGION_TRACE_ERROR(
1289 ("shared_region: %p [%d(%s)] map: "
1290 "fd=%d lookup failed (error=%d)\n",
1291 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1292 proc_getpid(p), p->p_comm, srfmp->fd, error));
1293 goto done;
1294 }
1295
1296 /* we need at least read permission on the file */
1297 if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
1298 SHARED_REGION_TRACE_ERROR(
1299 ("shared_region: %p [%d(%s)] map: "
1300 "fd=%d not readable\n",
1301 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1302 proc_getpid(p), p->p_comm, srfmp->fd));
1303 error = EPERM;
1304 goto done;
1305 }
1306
1307 /* get vnode from file structure */
1308 error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
1309 if (error) {
1310 SHARED_REGION_TRACE_ERROR(
1311 ("shared_region: %p [%d(%s)] map: "
1312 "fd=%d getwithref failed (error=%d)\n",
1313 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1314 proc_getpid(p), p->p_comm, srfmp->fd, error));
1315 goto done;
1316 }
1317 srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);
1318
1319 /* make sure the vnode is a regular file */
1320 if (srfmp->vp->v_type != VREG) {
1321 SHARED_REGION_TRACE_ERROR(
1322 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1323 "not a file (type=%d)\n",
1324 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1325 proc_getpid(p), p->p_comm,
1326 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1327 srfmp->vp->v_name, srfmp->vp->v_type));
1328 error = EINVAL;
1329 goto done;
1330 }
1331
1332 #if CONFIG_MACF
1333 /* pass in 0 for the offset argument because AMFI does not need the offset
1334 * of the shared cache */
1335 error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
1336 srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE | MAP_PRIVATE | MAP_FIXED, 0, &maxprot);
1337 if (error) {
1338 goto done;
1339 }
1340 #endif /* CONFIG_MACF */
1341
1342 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1343 /*
1344 * Check if the shared cache is in the trust cache;
1345 * if so, we can skip the root ownership check.
1346 */
1347 #if DEVELOPMENT || DEBUG
1348 /*
1349 * Skip both root ownership and trust cache check if
1350 * enforcement is disabled.
1351 */
1352 if (!cs_system_enforcement()) {
1353 goto after_root_check;
1354 }
1355 #endif /* DEVELOPMENT || DEBUG */
1356 struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
1357 if (blob == NULL) {
1358 SHARED_REGION_TRACE_ERROR(
1359 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1360 "missing CS blob\n",
1361 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1362 proc_getpid(p), p->p_comm,
1363 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1364 srfmp->vp->v_name));
1365 goto root_check;
1366 }
1367 const uint8_t *cdhash = csblob_get_cdhash(blob);
1368 if (cdhash == NULL) {
1369 SHARED_REGION_TRACE_ERROR(
1370 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1371 "missing cdhash\n",
1372 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1373 proc_getpid(p), p->p_comm,
1374 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1375 srfmp->vp->v_name));
1376 goto root_check;
1377 }
1378
1379 bool in_trust_cache = false;
1380 TrustCacheQueryToken_t qt;
1381 if (query_trust_cache(kTCQueryTypeAll, cdhash, &qt) == KERN_SUCCESS) {
1382 TCType_t tc_type = kTCTypeInvalid;
1383 TCReturn_t tc_ret = amfi->TrustCache.queryGetTCType(&qt, &tc_type);
1384 in_trust_cache = (tc_ret.error == kTCReturnSuccess &&
1385 (tc_type == kTCTypeCryptex1BootOS ||
1386 tc_type == kTCTypeStatic ||
1387 tc_type == kTCTypeEngineering));
1388 }
1389 if (!in_trust_cache) {
1390 SHARED_REGION_TRACE_ERROR(
1391 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1392 "not in trust cache\n",
1393 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1394 proc_getpid(p), p->p_comm,
1395 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1396 srfmp->vp->v_name));
1397 goto root_check;
1398 }
1399 goto after_root_check;
1400 root_check:
1401 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1402
1403 /* The shared cache file must be owned by root */
1404 VATTR_INIT(&va);
1405 VATTR_WANTED(&va, va_uid);
1406 error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1407 if (error) {
1408 SHARED_REGION_TRACE_ERROR(
1409 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1410 "vnode_getattr(%p) failed (error=%d)\n",
1411 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1412 proc_getpid(p), p->p_comm,
1413 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1414 srfmp->vp->v_name,
1415 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1416 error));
1417 goto done;
1418 }
1419 if (va.va_uid != 0) {
1420 SHARED_REGION_TRACE_ERROR(
1421 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1422 "owned by uid=%d instead of 0\n",
1423 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1424 proc_getpid(p), p->p_comm,
1425 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1426 srfmp->vp->v_name, va.va_uid));
1427 error = EPERM;
1428 goto done;
1429 }
1430
1431 #if XNU_TARGET_OS_OSX && defined(__arm64__)
1432 after_root_check:
1433 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
1434
1435 #if CONFIG_CSR
1436 if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
1437 VATTR_INIT(&va);
1438 VATTR_WANTED(&va, va_flags);
1439 error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
1440 if (error) {
1441 SHARED_REGION_TRACE_ERROR(
1442 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1443 "vnode_getattr(%p) failed (error=%d)\n",
1444 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1445 proc_getpid(p), p->p_comm,
1446 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1447 srfmp->vp->v_name,
1448 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1449 error));
1450 goto done;
1451 }
1452
1453 if (!(va.va_flags & SF_RESTRICTED)) {
1454 /*
1455 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
1456 * the shared cache file is NOT SIP-protected, so reject the
1457 * mapping request
1458 */
1459 SHARED_REGION_TRACE_ERROR(
1460 ("shared_region: %p [%d(%s)] map(%p:'%s'), "
1461 "vnode is not SIP-protected. \n",
1462 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1463 proc_getpid(p), p->p_comm,
1464 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1465 srfmp->vp->v_name));
1466 error = EPERM;
1467 goto done;
1468 }
1469 }
1470 #else /* CONFIG_CSR */
1471
1472 /*
1473 * Devices without SIP/ROSP need to make sure that the shared cache
1474 * is either on the root volume or in the preboot cryptex volume.
1475 */
1476 assert(rdir_vp != NULL);
1477 if (srfmp->vp->v_mount != rdir_vp->v_mount) {
1478 vnode_t preboot_vp = NULL;
1479 #if XNU_TARGET_OS_OSX
1480 #define PREBOOT_CRYPTEX_PATH "/System/Volumes/Preboot/Cryptexes"
1481 #else
1482 #define PREBOOT_CRYPTEX_PATH "/private/preboot/Cryptexes"
1483 #endif
1484 error = vnode_lookup(PREBOOT_CRYPTEX_PATH, 0, &preboot_vp, vfs_context_current());
1485 if (error || srfmp->vp->v_mount != preboot_vp->v_mount) {
1486 SHARED_REGION_TRACE_ERROR(
1487 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1488 "not on process' root volume nor preboot volume\n",
1489 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1490 proc_getpid(p), p->p_comm,
1491 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1492 srfmp->vp->v_name));
1493 error = EPERM;
1494 if (preboot_vp) {
1495 (void)vnode_put(preboot_vp);
1496 }
1497 goto done;
1498 } else if (preboot_vp) {
1499 (void)vnode_put(preboot_vp);
1500 }
1501 }
1502 #endif /* CONFIG_CSR */
1503
1504 if (scdir_enforce) {
1505 char **expected_scdir_path;
1506 struct vnode *scdir_vp = NULL;
1507 for (expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
1508 *expected_scdir_path != NULL;
1509 expected_scdir_path++) {
1510 /* get vnode for expected_scdir_path */
1511 error = vnode_lookup(*expected_scdir_path, 0, &scdir_vp, vfs_context_current());
1512 if (error) {
1513 SHARED_REGION_TRACE_ERROR(
1514 ("shared_region: %p [%d(%s)]: "
1515 "vnode_lookup(%s) failed (error=%d)\n",
1516 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1517 proc_getpid(p), p->p_comm,
1518 *expected_scdir_path, error));
1519 continue;
1520 }
1521
1522 /* check if parent is scdir_vp */
1523 assert(scdir_vp != NULL);
1524 if (vnode_parent(srfmp->vp) == scdir_vp) {
1525 (void)vnode_put(scdir_vp);
1526 scdir_vp = NULL;
1527 goto scdir_ok;
1528 }
1529 (void)vnode_put(scdir_vp);
1530 scdir_vp = NULL;
1531 }
1532 /* nothing matches */
1533 SHARED_REGION_TRACE_ERROR(
1534 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1535 "shared cache file not in expected directory\n",
1536 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1537 proc_getpid(p), p->p_comm,
1538 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1539 srfmp->vp->v_name));
1540 error = EPERM;
1541 goto done;
1542 }
1543 scdir_ok:
1544
1545 /* get vnode size */
1546 error = vnode_size(srfmp->vp, &fs, vfs_context_current());
1547 if (error) {
1548 SHARED_REGION_TRACE_ERROR(
1549 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1550 "vnode_size(%p) failed (error=%d)\n",
1551 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1552 proc_getpid(p), p->p_comm,
1553 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1554 srfmp->vp->v_name,
1555 (void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
1556 goto done;
1557 }
1558 srfmp->file_size = fs;
1559
1560 /* get the file's memory object handle */
1561 srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
1562 if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
1563 SHARED_REGION_TRACE_ERROR(
1564 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1565 "no memory object\n",
1566 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1567 proc_getpid(p), p->p_comm,
1568 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1569 srfmp->vp->v_name));
1570 error = EINVAL;
1571 goto done;
1572 }
1573
1574 /* check that the mappings are properly covered by code signatures */
1575 if (!cs_system_enforcement()) {
1576 /* code signing is not enforced: no need to check */
1577 } else {
1578 for (i = 0; i < srfmp->mappings_count; i++) {
1579 if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
1580 /* zero-filled mapping: not backed by the file */
1581 continue;
1582 }
1583 if (ubc_cs_is_range_codesigned(srfmp->vp,
1584 srfmp->mappings[i].sms_file_offset,
1585 srfmp->mappings[i].sms_size)) {
1586 /* this mapping is fully covered by code signatures */
1587 continue;
1588 }
1589 SHARED_REGION_TRACE_ERROR(
1590 ("shared_region: %p [%d(%s)] map(%p:'%s'): "
1591 "mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
1592 "is not code-signed\n",
1593 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1594 proc_getpid(p), p->p_comm,
1595 (void *)VM_KERNEL_ADDRPERM(srfmp->vp),
1596 srfmp->vp->v_name,
1597 i, srfmp->mappings_count,
1598 srfmp->mappings[i].sms_address,
1599 srfmp->mappings[i].sms_size,
1600 srfmp->mappings[i].sms_file_offset,
1601 srfmp->mappings[i].sms_max_prot,
1602 srfmp->mappings[i].sms_init_prot));
1603 error = EINVAL;
1604 goto done;
1605 }
1606 }
1607 }
1608 done:
1609 if (error != 0) {
1610 shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region);
1611 *sr_file_mappings = NULL;
1612 *shared_region_ptr = NULL;
1613 }
1614 SHARED_REGION_TRACE_DEBUG(
1615 ("shared_region: %p [%d(%s)] map_and_slide_setup <- %d\n",
1616 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1617 proc_getpid(p), p->p_comm, error));
1618 return error;
1619 }
1620
1621 /*
1622 * shared_region_map_np()
1623 *
1624 * This system call is intended for dyld.
1625 *
1626 * dyld uses this to map a shared cache file into a shared region.
1627 * This is usually done only the first time a shared cache is needed.
1628 * Subsequent processes will just use the populated shared region without
1629 * requiring any further setup.
1630 */
1631 static int
1632 _shared_region_map_and_slide(
1633 struct proc *p,
1634 uint32_t files_count,
1635 struct shared_file_np *files,
1636 uint32_t mappings_count,
1637 struct shared_file_mapping_slide_np *mappings)
1638 {
1639 int error = 0;
1640 kern_return_t kr = KERN_SUCCESS;
1641 struct _sr_file_mappings *sr_file_mappings = NULL;
1642 struct vnode *rdir_vp = NULL;
1643 struct vm_shared_region *shared_region = NULL;
1644
1645 /*
1646 * Get a reference to the current proc's root dir.
1647 * Need this to prevent racing with chroot.
1648 */
1649 proc_fdlock(p);
1650 rdir_vp = p->p_fd.fd_rdir;
1651 if (rdir_vp == NULL) {
1652 rdir_vp = rootvnode;
1653 }
1654 assert(rdir_vp != NULL);
1655 vnode_get(rdir_vp);
1656 proc_fdunlock(p);
1657
1658 /*
1659 * Turn files, mappings into sr_file_mappings and other setup.
1660 */
1661 error = shared_region_map_and_slide_setup(p, files_count,
1662 files, mappings_count, mappings,
1663 &sr_file_mappings, &shared_region, rdir_vp);
1664 if (error != 0) {
1665 vnode_put(rdir_vp);
1666 return error;
1667 }
1668
1669 /* map the file(s) into that shared region's submap */
1670 kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
1671 if (kr != KERN_SUCCESS) {
1672 SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
1673 "vm_shared_region_map_file() failed kr=0x%x\n",
1674 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1675 proc_getpid(p), p->p_comm, kr));
1676 }
1677
1678 /* convert kern_return_t to errno */
1679 switch (kr) {
1680 case KERN_SUCCESS:
1681 error = 0;
1682 break;
1683 case KERN_INVALID_ADDRESS:
1684 error = EFAULT;
1685 break;
1686 case KERN_PROTECTION_FAILURE:
1687 error = EPERM;
1688 break;
1689 case KERN_NO_SPACE:
1690 error = ENOMEM;
1691 break;
1692 case KERN_FAILURE:
1693 case KERN_INVALID_ARGUMENT:
1694 default:
1695 error = EINVAL;
1696 break;
1697 }
1698
1699 /*
1700 * Mark that this process is now using split libraries.
1701 */
1702 if (error == 0 && (p->p_flag & P_NOSHLIB)) {
1703 OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
1704 }
1705
1706 vnode_put(rdir_vp);
1707 shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region);
1708
1709 SHARED_REGION_TRACE_DEBUG(
1710 ("shared_region: %p [%d(%s)] <- map\n",
1711 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1712 proc_getpid(p), p->p_comm));
1713
1714 return error;
1715 }
1716
1717 /*
1718 * Clean up part of _shared_region_map_and_slide()
1719 * It had to be broken out of _shared_region_map_and_slide() to
1720 * prevent compiler inlining from blowing out the stack.
1721 */
1722 __attribute__((noinline))
1723 static void
1724 shared_region_map_and_slide_cleanup(
1725 struct proc *p,
1726 uint32_t files_count,
1727 struct _sr_file_mappings *sr_file_mappings,
1728 struct vm_shared_region *shared_region)
1729 {
1730 struct _sr_file_mappings *srfmp;
1731 struct vnode_attr va;
1732
1733 if (sr_file_mappings != NULL) {
1734 for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
1735 if (srfmp->vp != NULL) {
1736 vnode_lock_spin(srfmp->vp);
1737 srfmp->vp->v_flag |= VSHARED_DYLD;
1738 vnode_unlock(srfmp->vp);
1739
1740 /* update the vnode's access time */
1741 if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
1742 VATTR_INIT(&va);
1743 nanotime(&va.va_access_time);
1744 VATTR_SET_ACTIVE(&va, va_access_time);
1745 vnode_setattr(srfmp->vp, &va, vfs_context_current());
1746 }
1747
1748 #if NAMEDSTREAMS
1749 /*
1750 * If the shared cache is compressed, it may
1751 * have a namedstream vnode instantiated
1752 * for it. That namedstream vnode will also
1753 * have to be marked with VSHARED_DYLD.
1754 */
1755 if (vnode_hasnamedstreams(srfmp->vp)) {
1756 vnode_t svp;
1757 if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
1758 NS_OPEN, 0, vfs_context_kernel()) == 0) {
1759 vnode_lock_spin(svp);
1760 svp->v_flag |= VSHARED_DYLD;
1761 vnode_unlock(svp);
1762 vnode_put(svp);
1763 }
1764 }
1765 #endif /* NAMEDSTREAMS */
1766 /*
1767 * release the vnode...
1768 * ubc_map() still holds it for us in the non-error case
1769 */
1770 (void) vnode_put(srfmp->vp);
1771 srfmp->vp = NULL;
1772 }
1773 if (srfmp->fp != NULL) {
1774 /* release the file descriptor */
1775 fp_drop(p, srfmp->fd, srfmp->fp, 0);
1776 srfmp->fp = NULL;
1777 }
1778 }
1779 kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
1780 }
1781
1782 if (shared_region != NULL) {
1783 vm_shared_region_deallocate(shared_region);
1784 }
1785 }
1786
1787 /*
1788 * For each file mapped, we may have mappings for:
1789 * TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
1790 * so let's round up to 8 mappings per file.
1791 */
1792 #define SFM_MAX (_SR_FILE_MAPPINGS_MAX_FILES * 8) /* max mapping structs allowed to pass in */
1793
1794 /*
1795 * This is the new interface for setting up shared region mappings.
1796 *
1797 * The slide used for shared regions setup using this interface is done differently
1798 * from the old interface. The slide value passed in the shared_files_np represents
1799 * a max value. The kernel will choose a random value based on that, then use it
1800 * for all shared regions.
1801 */
1802 #if defined (__x86_64__)
1803 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
1804 #else
1805 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
1806 #endif
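/*
 * Worked example (hypothetical numbers): on a 16K-page arm64 system,
 * SLIDE_AMOUNT_MASK == ~0x3FFF. With a requested max slide of 0x1000000
 * and random_val == 0x12345678, the computation below yields:
 *
 *	random_val % max_slide        == 0x345678
 *	0x345678 & SLIDE_AMOUNT_MASK  == 0x344000
 *
 * i.e. a page-aligned slide of 0x344000 bytes applied to every mapping.
 */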
1807
1808 static inline __result_use_check kern_return_t
1809 shared_region_map_and_slide_2_np_sanitize(
1810 struct proc *p,
1811 user_addr_t mappings_userspace_addr,
1812 unsigned int count,
1813 shared_file_mapping_slide_np_t *mappings)
1814 {
1815 kern_return_t kr;
1816 vm_map_t map = current_map();
1817 mach_vm_address_t addr, end;
1818 mach_vm_offset_t offset, offset_end;
1819 mach_vm_size_t size, offset_size;
1820 user_addr_t slide_start, slide_end, slide_size;
1821 vm_prot_t cur;
1822 vm_prot_t max;
1823
1824 user_addr_t user_addr = mappings_userspace_addr;
1825
1826 for (size_t i = 0; i < count; i++) {
1827 shared_file_mapping_slide_np_ut mapping_u;
1828 /*
1829 * First we bring each mapping struct into our kernel stack to
1830 * avoid TOCTOU.
1831 */
1832 kr = shared_region_copyin(
1833 p,
1834 user_addr,
1835 1, // copy 1 element at a time
1836 sizeof(shared_file_mapping_slide_np_ut),
1837 &mapping_u);
1838 if (__improbable(kr != KERN_SUCCESS)) {
1839 return kr;
1840 }
1841
1842 /*
1843 * Then, we sanitize the data on the kernel stack.
1844 */
1845 kr = vm_sanitize_addr_size(
1846 mapping_u.sms_address_u,
1847 mapping_u.sms_size_u,
1848 VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1849 map,
1850 (VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1851 | VM_SANITIZE_FLAGS_CHECK_ALIGNED_START
1852 | VM_SANITIZE_FLAGS_CHECK_ALIGNED_SIZE),
1853 &addr,
1854 &end,
1855 &size);
1856 if (__improbable(kr != KERN_SUCCESS)) {
1857 return kr;
1858 }
1859
1860 kr = vm_sanitize_addr_size(
1861 mapping_u.sms_file_offset_u,
1862 mapping_u.sms_size_u,
1863 VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1864 PAGE_MASK,
1865 (VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1866 | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1867 &offset,
1868 &offset_end,
1869 &offset_size);
1870 if (__improbable(kr != KERN_SUCCESS)) {
1871 return kr;
1872 }
1873 if (__improbable(0 != (offset & vm_map_page_mask(map)))) {
1874 return KERN_INVALID_ARGUMENT;
1875 }
1876
1877 /*
1878 * Unsafe access is immediately followed by wrap to
1879 * convert from addr to size.
1880 */
1881 mach_vm_size_ut sms_slide_size_u =
1882 vm_sanitize_wrap_size(
1883 VM_SANITIZE_UNSAFE_UNWRAP(
1884 mapping_u.sms_slide_size_u));
1885
1886 kr = vm_sanitize_addr_size(
1887 mapping_u.sms_slide_start_u,
1888 sms_slide_size_u,
1889 VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1890 map,
1891 (VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH
1892 | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES),
1893 &slide_start,
1894 &slide_end,
1895 &slide_size);
1896 if (__improbable(kr != KERN_SUCCESS)) {
1897 return kr;
1898 }
1899
1900 kr = vm_sanitize_cur_and_max_prots(
1901 mapping_u.sms_init_prot_u,
1902 mapping_u.sms_max_prot_u,
1903 VM_SANITIZE_CALLER_SHARED_REGION_MAP_AND_SLIDE_2_NP,
1904 map,
1905 VM_PROT_SFM_EXTENSIONS_MASK | VM_PROT_TPRO,
1906 &cur,
1907 &max);
1908 if (__improbable(kr != KERN_SUCCESS)) {
1909 return kr;
1910 }
1911
1912 /*
1913 * Finally, we move the data from the kernel stack to our
1914 * caller-allocated kernel heap buffer.
1915 */
1916 mappings[i].sms_address = addr;
1917 mappings[i].sms_size = size;
1918 mappings[i].sms_file_offset = offset;
1919 mappings[i].sms_slide_size = slide_size;
1920 mappings[i].sms_slide_start = slide_start;
1921 mappings[i].sms_max_prot = max;
1922 mappings[i].sms_init_prot = cur;
1923
1924 if (__improbable(os_add_overflow(
1925 user_addr,
1926 sizeof(shared_file_mapping_slide_np_ut),
1927 &user_addr))) {
1928 return KERN_INVALID_ARGUMENT;
1929 }
1930 }
1931
1932 return KERN_SUCCESS;
1933 }
1934
1935 int
1936 shared_region_map_and_slide_2_np(
1937 struct proc *p,
1938 struct shared_region_map_and_slide_2_np_args *uap,
1939 __unused int *retvalp)
1940 {
1941 unsigned int files_count;
1942 struct shared_file_np *shared_files = NULL;
1943 unsigned int mappings_count;
1944 struct shared_file_mapping_slide_np *mappings = NULL;
1945 kern_return_t kr = KERN_SUCCESS;
1946
1947 files_count = uap->files_count;
1948 mappings_count = uap->mappings_count;
1949
1950 SHARED_REGION_TRACE_DEBUG(
1951 ("shared_region: %p [%d(%s)] -> map_and_slide(0x%llx)\n",
1952 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1953 proc_getpid(p), p->p_comm,
1954 (uint64_t)uap->mappings_u));
1955
1956 if (files_count == 0) {
1957 SHARED_REGION_TRACE_INFO(
1958 ("shared_region: %p [%d(%s)] map(): "
1959 "no files\n",
1960 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1961 proc_getpid(p), p->p_comm));
1962 kr = 0; /* no files to map: we're done! */
1963 goto done;
1964 } else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
1965 shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
1966 if (shared_files == NULL) {
1967 kr = KERN_RESOURCE_SHORTAGE;
1968 goto done;
1969 }
1970 } else {
1971 SHARED_REGION_TRACE_ERROR(
1972 ("shared_region: %p [%d(%s)] map(): "
1973 "too many files (%d) max %d\n",
1974 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1975 proc_getpid(p), p->p_comm,
1976 files_count, _SR_FILE_MAPPINGS_MAX_FILES));
1977 kr = KERN_FAILURE;
1978 goto done;
1979 }
1980
1981 if (mappings_count == 0) {
1982 SHARED_REGION_TRACE_INFO(
1983 ("shared_region: %p [%d(%s)] map(): "
1984 "no mappings\n",
1985 (void *)VM_KERNEL_ADDRPERM(current_thread()),
1986 proc_getpid(p), p->p_comm));
1987 kr = 0; /* no mappings: we're done! */
1988 goto done;
1989 } else if (mappings_count <= SFM_MAX) {
1990 mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
1991 if (mappings == NULL) {
1992 kr = KERN_RESOURCE_SHORTAGE;
1993 goto done;
1994 }
1995 } else {
1996 SHARED_REGION_TRACE_ERROR(
1997 ("shared_region: %p [%d(%s)] map(): "
1998 "too many mappings (%d) max %d\n",
1999 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2000 proc_getpid(p), p->p_comm,
2001 mappings_count, SFM_MAX));
2002 kr = KERN_FAILURE;
2003 goto done;
2004 }
2005
2006 /*
2007 * struct shared_file_np does not have fields that are subject to
2008 * sanitization, so it is copied from userspace as-is.
2009 */
2010 kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
2011 if (kr != KERN_SUCCESS) {
2012 SHARED_REGION_TRACE_ERROR(
2013 ("shared_region: %p [%d(%s)] copyin() returned 0x%x\n",
2014 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2015 proc_getpid(p), p->p_comm, kr));
2016 goto done;
2017 }
2018
2019 kr = shared_region_map_and_slide_2_np_sanitize(
2020 p,
2021 uap->mappings_u,
2022 mappings_count,
2023 mappings);
2024 if (__improbable(kr != KERN_SUCCESS)) {
2025 SHARED_REGION_TRACE_ERROR(
2026 ("shared_region: %p [%d(%s)] sanitize() returned 0x%x\n",
2027 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2028 proc_getpid(p), p->p_comm, kr));
2029 kr = vm_sanitize_get_kr(kr);
2030 goto done;
2031 }
2032
2033 uint32_t max_slide = shared_files[0].sf_slide;
2034 uint32_t random_val;
2035 uint32_t slide_amount;
2036
2037 if (max_slide != 0) {
2038 read_random(&random_val, sizeof random_val);
2039 slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
2040 } else {
2041 slide_amount = 0;
2042 }
2043 #if DEVELOPMENT || DEBUG
2044 extern bool bootarg_disable_aslr;
2045 if (bootarg_disable_aslr) {
2046 slide_amount = 0;
2047 }
2048 #endif /* DEVELOPMENT || DEBUG */
2049
2050 /*
2051 * Fix up the mappings to reflect the desired slide.
2052 */
2053 unsigned int f;
2054 unsigned int m = 0;
2055 unsigned int i;
2056 for (f = 0; f < files_count; ++f) {
2057 shared_files[f].sf_slide = slide_amount;
2058 for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
2059 if (m >= mappings_count) {
2060 SHARED_REGION_TRACE_ERROR(
2061 ("shared_region: %p [%d(%s)] map(): "
2062 "mapping count argument was too small\n",
2063 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2064 proc_getpid(p), p->p_comm));
2065 kr = KERN_FAILURE;
2066 goto done;
2067 }
2068 if (__improbable(
2069 os_add_overflow(
2070 mappings[m].sms_address,
2071 slide_amount,
2072 &mappings[m].sms_address))) {
2073 kr = KERN_INVALID_ARGUMENT;
2074 goto done;
2075 }
2076 if (mappings[m].sms_slide_size != 0) {
2077 mach_vm_address_t discard;
2078 /* Slide and check that new start/size pairs do not overflow. */
2079 if (__improbable(
2080 os_add_overflow(
2081 mappings[m].sms_slide_start,
2082 slide_amount,
2083 &mappings[m].sms_slide_start) ||
2084 os_add_overflow(
2085 mappings[m].sms_slide_start,
2086 mappings[m].sms_slide_size,
2087 &discard))) {
2088 kr = KERN_INVALID_ARGUMENT;
2089 goto done;
2090 }
2091 }
2092 }
2093 }
2094
2095 kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
2096 done:
2097 kfree_data(shared_files, files_count * sizeof(shared_files[0]));
2098 kfree_data(mappings, mappings_count * sizeof(mappings[0]));
2099
2100 SHARED_REGION_TRACE_DEBUG(
2101 ("shared_region: %p [%d(%s)] map_and_slide(0x%llx) <- 0x%x\n",
2102 (void *)VM_KERNEL_ADDRPERM(current_thread()),
2103 proc_getpid(p), p->p_comm,
2104 (uint64_t)uap->mappings_u, kr));
2105
2106 return kr;
2107 }
2108
2109 /*
2110 * A syscall for dyld to use to map data pages that need load time relocation fixups.
2111 * The fixups are performed by a custom pager during page-in, so the pages still appear
2112 * "clean" and hence are easily discarded under memory pressure. They can be re-paged-in
2113 * on demand later, all w/o using the compressor.
2114 *
2115 * Note that these pages are treated as MAP_PRIVATE, so if the application dirties any pages while
2116 * running, they are COW'd as normal.
2117 */
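/*
 * Illustrative sketch (assumed call shape, not dyld's actual code):
 * the caller describes each data region needing fixups and passes the
 * packed link info blob; all regions must reference the same fd. Only
 * the fields validated below are shown, the rest are omitted.
 *
 *	struct mwl_region region = {
 *		.mwlr_fd = fd,
 *		.mwlr_protections = VM_PROT_READ | VM_PROT_WRITE,
 *		.mwlr_file_offset = data_file_offset,
 *		.mwlr_size = data_size,
 *	};
 *	// link_info: a struct mwl_info_hdr followed by bind targets at
 *	// mwli_binds_offset and chained fixup starts at mwli_chains_offset
 *	int err = map_with_linking_np(&region, 1, link_info, link_info_size);
 */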
2118 int
2119 map_with_linking_np(
2120 struct proc *p,
2121 struct map_with_linking_np_args *uap,
2122 __unused int *retvalp)
2123 {
2124 uint32_t region_count;
2125 uint32_t r;
2126 struct mwl_region *regions = NULL;
2127 struct mwl_region *rp;
2128 uint32_t link_info_size;
2129 void *link_info = NULL; /* starts with a struct mwl_info_hdr */
2130 struct mwl_info_hdr *info_hdr = NULL;
2131 uint64_t binds_size;
2132 int fd;
2133 struct fileproc *fp = NULL;
2134 struct vnode *vp = NULL;
2135 size_t file_size;
2136 off_t fs;
2137 struct vnode_attr va;
2138 memory_object_control_t file_control = NULL;
2139 int error;
2140 kern_return_t kr = KERN_SUCCESS;
2141
2142 /*
2143 * Check if dyld has told us it finished with this call.
2144 */
2145 if (p->p_disallow_map_with_linking) {
2146 printf("%s: [%d(%s)]: map_with_linking() was disabled\n",
2147 __func__, proc_getpid(p), p->p_comm);
2148 kr = KERN_FAILURE;
2149 goto done;
2150 }
2151
2152 /*
2153 * First we do some sanity checking on what dyld has passed us.
2154 */
2155 region_count = uap->region_count;
2156 link_info_size = uap->link_info_size;
2157 if (region_count == 0) {
2158 printf("%s: [%d(%s)]: region_count == 0\n",
2159 __func__, proc_getpid(p), p->p_comm);
2160 kr = KERN_FAILURE;
2161 goto done;
2162 }
2163 if (region_count > MWL_MAX_REGION_COUNT) {
2164 printf("%s: [%d(%s)]: region_count too big %d\n",
2165 __func__, proc_getpid(p), p->p_comm, region_count);
2166 kr = KERN_FAILURE;
2167 goto done;
2168 }
2169
2170 if (link_info_size <= MWL_MIN_LINK_INFO_SIZE) {
2171 printf("%s: [%d(%s)]: link_info_size too small\n",
2172 __func__, proc_getpid(p), p->p_comm);
2173 kr = KERN_FAILURE;
2174 goto done;
2175 }
2176 if (link_info_size >= MWL_MAX_LINK_INFO_SIZE) {
2177 printf("%s: [%d(%s)]: link_info_size too big %d\n",
2178 __func__, proc_getpid(p), p->p_comm, link_info_size);
2179 kr = KERN_FAILURE;
2180 goto done;
2181 }
2182
2183 /*
2184 * Allocate and copyin the regions and link info
2185 */
2186 regions = kalloc_data(region_count * sizeof(regions[0]), Z_WAITOK);
2187 if (regions == NULL) {
2188 printf("%s: [%d(%s)]: failed to allocate regions\n",
2189 __func__, proc_getpid(p), p->p_comm);
2190 kr = KERN_RESOURCE_SHORTAGE;
2191 goto done;
2192 }
2193 kr = shared_region_copyin(p, uap->regions, region_count, sizeof(regions[0]), regions);
2194 if (kr != KERN_SUCCESS) {
2195 printf("%s: [%d(%s)]: failed to copyin regions kr=%d\n",
2196 __func__, proc_getpid(p), p->p_comm, kr);
2197 goto done;
2198 }
2199
2200 link_info = kalloc_data(link_info_size, Z_WAITOK);
2201 if (link_info == NULL) {
2202 printf("%s: [%d(%s)]: failed to allocate link_info\n",
2203 __func__, proc_getpid(p), p->p_comm);
2204 kr = KERN_RESOURCE_SHORTAGE;
2205 goto done;
2206 }
2207 kr = shared_region_copyin(p, uap->link_info, 1, link_info_size, link_info);
2208 if (kr != KERN_SUCCESS) {
2209 printf("%s: [%d(%s)]: failed to copyin link_info kr=%d\n",
2210 __func__, proc_getpid(p), p->p_comm, kr);
2211 goto done;
2212 }
2213
2214 /*
2215 * Do some verification of the data structures.
2216 */
2217 info_hdr = (struct mwl_info_hdr *)link_info;
2218 if (info_hdr->mwli_version != MWL_INFO_VERS) {
2219 printf("%s: [%d(%s)]: unrecognized mwli_version=%d\n",
2220 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_version);
2221 kr = KERN_FAILURE;
2222 goto done;
2223 }
2224
2225 if (info_hdr->mwli_binds_offset > link_info_size) {
2226 printf("%s: [%d(%s)]: mwli_binds_offset too large %d\n",
2227 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_offset);
2228 kr = KERN_FAILURE;
2229 goto done;
2230 }
2231
2232 /* some older devices have s/w page size > h/w page size; no need to support them */
2233 if (info_hdr->mwli_page_size != PAGE_SIZE) {
2234 /* no printf, since this is expected on some devices */
2235 kr = KERN_INVALID_ARGUMENT;
2236 goto done;
2237 }
2238
2239 binds_size = (uint64_t)info_hdr->mwli_binds_count *
2240 ((info_hdr->mwli_pointer_format == DYLD_CHAINED_PTR_32) ? 4 : 8);
2241 if (binds_size > link_info_size - info_hdr->mwli_binds_offset) {
2242 printf("%s: [%d(%s)]: mwli_binds_count too large %d\n",
2243 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_binds_count);
2244 kr = KERN_FAILURE;
2245 goto done;
2246 }
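/*
 * e.g. (hypothetical numbers): mwli_binds_count == 1000 with a 64-bit
 * pointer format gives binds_size == 8000 bytes, which must fit in the
 * link info after mwli_binds_offset. Subtracting the offset on the
 * right-hand side is safe because it was bounds-checked above.
 */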
2247
2248 if (info_hdr->mwli_chains_offset > link_info_size) {
2249 printf("%s: [%d(%s)]: mwli_chains_offset too large %d\n",
2250 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_offset);
2251 kr = KERN_FAILURE;
2252 goto done;
2253 }
2254
2255
2256 /*
2257 * Ensure the chained-starts info fits within the link info and make
2258 * sure the segment info offsets are within bounds.
2259 */
2260 if (info_hdr->mwli_chains_size < sizeof(struct dyld_chained_starts_in_image)) {
2261 printf("%s: [%d(%s)]: mwli_chains_size too small %d\n",
2262 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2263 kr = KERN_FAILURE;
2264 goto done;
2265 }
2266 if (info_hdr->mwli_chains_size > link_info_size - info_hdr->mwli_chains_offset) {
2267 printf("%s: [%d(%s)]: mwli_chains_size too large %d\n",
2268 __func__, proc_getpid(p), p->p_comm, info_hdr->mwli_chains_size);
2269 kr = KERN_FAILURE;
2270 goto done;
2271 }
2272
2273 /* Note that more verification of offsets is done in the pager itself */
2274
2275 /*
2276 * Ensure we've only been given one FD and verify valid protections.
2277 */
2278 fd = regions[0].mwlr_fd;
2279 for (r = 0; r < region_count; ++r) {
2280 if (regions[r].mwlr_fd != fd) {
2281 printf("%s: [%d(%s)]: mwlr_fd mismatch %d and %d\n",
2282 __func__, proc_getpid(p), p->p_comm, fd, regions[r].mwlr_fd);
2283 kr = KERN_FAILURE;
2284 goto done;
2285 }
2286
2287 /*
2288 * Only allow data mappings and not zero fill. Permit TPRO
2289 * mappings only when combined with VM_PROT_WRITE.
2290 */
2291 if (regions[r].mwlr_protections & VM_PROT_EXECUTE) {
2292 printf("%s: [%d(%s)]: mwlr_protections EXECUTE not allowed\n",
2293 __func__, proc_getpid(p), p->p_comm);
2294 kr = KERN_FAILURE;
2295 goto done;
2296 }
2297 if (regions[r].mwlr_protections & VM_PROT_ZF) {
2298 printf("%s: [%d(%s)]: region %d, found VM_PROT_ZF not allowed\n",
2299 __func__, proc_getpid(p), p->p_comm, r);
2300 kr = KERN_FAILURE;
2301 goto done;
2302 }
2303 if ((regions[r].mwlr_protections & VM_PROT_TPRO) &&
2304 !(regions[r].mwlr_protections & VM_PROT_WRITE)) {
2305 printf("%s: [%d(%s)]: region %d, found VM_PROT_TPRO without VM_PROT_WRITE\n",
2306 __func__, proc_getpid(p), p->p_comm, r);
2307 kr = KERN_FAILURE;
2308 goto done;
2309 }
2310 }
2311
2312
2313 /* get file structure from file descriptor */
2314 error = fp_get_ftype(p, fd, DTYPE_VNODE, EINVAL, &fp);
2315 if (error) {
2316 printf("%s: [%d(%s)]: fp_get_ftype() failed, error %d\n",
2317 __func__, proc_getpid(p), p->p_comm, error);
2318 kr = KERN_FAILURE;
2319 goto done;
2320 }
2321
2322 /* We need at least read permission on the file */
2323 if (!(fp->fp_glob->fg_flag & FREAD)) {
2324 printf("%s: [%d(%s)]: not readable\n",
2325 __func__, proc_getpid(p), p->p_comm);
2326 kr = KERN_FAILURE;
2327 goto done;
2328 }
2329
2330 /* Get the vnode from file structure */
2331 vp = (struct vnode *)fp_get_data(fp);
2332 error = vnode_getwithref(vp);
2333 if (error) {
2334 printf("%s: [%d(%s)]: failed to get vnode, error %d\n",
2335 __func__, proc_getpid(p), p->p_comm, error);
2336 kr = KERN_FAILURE;
2337 vp = NULL; /* just to be sure */
2338 goto done;
2339 }
2340
2341 /* Make sure the vnode is a regular file */
2342 if (vp->v_type != VREG) {
2343 printf("%s: [%d(%s)]: vnode not VREG\n",
2344 __func__, proc_getpid(p), p->p_comm);
2345 kr = KERN_FAILURE;
2346 goto done;
2347 }
2348
2349 /* get vnode size */
2350 error = vnode_size(vp, &fs, vfs_context_current());
2351 if (error) {
2352 goto done;
2353 }
2354 file_size = fs;
2355
2356 /* get the file's memory object handle */
2357 file_control = ubc_getobject(vp, UBC_HOLDOBJECT);
2358 if (file_control == MEMORY_OBJECT_CONTROL_NULL) {
2359 printf("%s: [%d(%s)]: no memory object\n",
2360 __func__, proc_getpid(p), p->p_comm);
2361 kr = KERN_FAILURE;
2362 goto done;
2363 }
2364
2365 for (r = 0; r < region_count; ++r) {
2366 rp = &regions[r];
2367
2368 #if CONFIG_MACF
2369 vm_prot_t prot = (rp->mwlr_protections & VM_PROT_ALL);
2370 error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2371 fp->fp_glob, prot, MAP_FILE | MAP_PRIVATE | MAP_FIXED, rp->mwlr_file_offset, &prot);
2372 if (error) {
2373 printf("%s: [%d(%s)]: mac_file_check_mmap() failed, region %d, error %d\n",
2374 __func__, proc_getpid(p), p->p_comm, r, error);
2375 kr = KERN_FAILURE;
2376 goto done;
2377 }
2378 #endif /* MAC */
2379
2380 /* check that the mappings are properly covered by code signatures */
2381 if (cs_system_enforcement()) {
2382 if (!ubc_cs_is_range_codesigned(vp, rp->mwlr_file_offset, rp->mwlr_size)) {
2383 printf("%s: [%d(%s)]: region %d, not code signed\n",
2384 __func__, proc_getpid(p), p->p_comm, r);
2385 kr = KERN_FAILURE;
2386 goto done;
2387 }
2388 }
2389 }
2390
2391 /* update the vnode's access time */
2392 if (!(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
2393 VATTR_INIT(&va);
2394 nanotime(&va.va_access_time);
2395 VATTR_SET_ACTIVE(&va, va_access_time);
2396 vnode_setattr(vp, &va, vfs_context_current());
2397 }
2398
2399 /* get the VM to do the work */
2400 kr = vm_map_with_linking(proc_task(p), regions, region_count, &link_info, link_info_size, file_control);
2401
2402 done:
2403 if (fp != NULL) {
2404 /* release the file descriptor */
2405 fp_drop(p, fd, fp, 0);
2406 }
2407 if (vp != NULL) {
2408 (void)vnode_put(vp);
2409 }
2410 if (regions != NULL) {
2411 kfree_data(regions, region_count * sizeof(regions[0]));
2412 }
2413 /* link_info was consumed by the pager (and set to NULL) if things worked */
2414 if (link_info != NULL) {
2415 kfree_data(link_info, link_info_size);
2416 }
2417
2418 switch (kr) {
2419 case KERN_SUCCESS:
2420 return 0;
2421 case KERN_RESOURCE_SHORTAGE:
2422 return ENOMEM;
2423 default:
2424 return EINVAL;
2425 }
2426 }
2427
2428 #if DEBUG || DEVELOPMENT
2429 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count,
2430 CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count, 0, "");
2431 SYSCTL_INT(_vm, OID_AUTO, dyld_pager_count_max,
2432 CTLFLAG_RD | CTLFLAG_LOCKED, &dyld_pager_count_max, 0, "");
2433 #endif /* DEBUG || DEVELOPMENT */
2434
2435 /* sysctl overflow room */
2436
2437 SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
2438 (int *) &page_size, 0, "vm page size");
2439
2440 /* vm_page_free_target is provided as a makeshift solution for applications that want to
2441 * allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
2442 * reclaimed. It allows the app to calculate how much memory is free outside the free target. */
2443 extern unsigned int vm_page_free_target;
2444 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
2445 &vm_page_free_target, 0, "Pageout daemon free target");
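/*
 * A minimal user-space sketch of that calculation (assumed usage):
 *
 *	#include <sys/sysctl.h>
 *
 *	unsigned int free_count, free_target;
 *	size_t len = sizeof(free_count);
 *	sysctlbyname("vm.page_free_count", &free_count, &len, NULL, 0);
 *	len = sizeof(free_target);
 *	sysctlbyname("vm.vm_page_free_target", &free_target, &len, NULL, 0);
 *	// pages available before cutting into the pageout daemon's target:
 *	long headroom = (long)free_count - (long)free_target;
 */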
2446
2447 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
2448 &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2449
2450 static int
2451 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2452 {
2453 #pragma unused(oidp, arg1, arg2)
2454 unsigned int page_free_wanted;
2455
2456 page_free_wanted = mach_vm_ctl_page_free_wanted();
2457 return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2458 }
2459 SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
2460 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
2461 0, 0, vm_ctl_page_free_wanted, "I", "");
2462
2463 extern unsigned int vm_page_purgeable_count;
2464 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2465 &vm_page_purgeable_count, 0, "Purgeable page count");
2466
2467 extern unsigned int vm_page_purgeable_wired_count;
2468 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2469 &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
2470
2471 extern unsigned int vm_page_kern_lpage_count;
2472 SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2473 &vm_page_kern_lpage_count, 0, "kernel used large pages");
2474
2475 SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
2476 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
2477 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_kern);
2478 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_kern, vm_page_grab_count_kern, "Total pages grabbed (kernel)");
2479 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_iopl);
2480 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_iopl, vm_page_grab_count_iopl, "Total pages grabbed (iopl)");
2481 SCALABLE_COUNTER_DECLARE(vm_page_grab_count_upl);
2482 SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed_upl, vm_page_grab_count_upl, "Total pages grabbed (upl)");
2483
2484
2485 #if DEVELOPMENT || DEBUG
2486 SCALABLE_COUNTER_DECLARE(vm_page_deactivate_behind_count);
2487 SYSCTL_SCALABLE_COUNTER(_vm, pages_deactivated_behind, vm_page_deactivate_behind_count,
2488 "Number of pages deactivated behind");
2489 #endif
2490
2491 #if DEVELOPMENT || DEBUG
2492 #if __ARM_MIXED_PAGE_SIZE__
2493 static int vm_mixed_pagesize_supported = 1;
2494 #else
2495 static int vm_mixed_pagesize_supported = 0;
2496 #endif /*__ARM_MIXED_PAGE_SIZE__ */
2497 SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
2498 &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");
2499
2500 SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
2501 &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
2502
2503 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
2504 &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
2505 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
2506 &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
2507 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
2508 &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
2509
2510 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2511 &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
2512 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2513 &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
2514 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2515 &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated"); /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated) */
2516 SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2517 &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
2518 SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
2519 &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
2520 SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
2521 &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, ""); /* sum of next two */
2522 #endif /* DEVELOPMENT || DEBUG */
2523
2524 extern int madvise_free_debug;
2525 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
2526 &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");
2527 extern int madvise_free_debug_sometimes;
2528 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug_sometimes, CTLFLAG_RW | CTLFLAG_LOCKED,
2529 &madvise_free_debug_sometimes, 0, "sometimes zero-fill on madvise(MADV_FREE*)");
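/*
 * Debugging sketch: forcing zero-fill makes use-after-MADV_FREE bugs
 * deterministic instead of timing-dependent, e.g. (assumed invocation):
 *
 *	sysctl vm.madvise_free_debug=1
 */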
2530
2531 SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
2532 &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
2533 SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2534 &vm_page_stats_reusable.reusable_pages_success, "");
2535 SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2536 &vm_page_stats_reusable.reusable_pages_failure, "");
2537 SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2538 &vm_page_stats_reusable.reusable_pages_shared, "");
2539 SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2540 &vm_page_stats_reusable.all_reusable_calls, "");
2541 SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2542 &vm_page_stats_reusable.partial_reusable_calls, "");
2543 SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2544 &vm_page_stats_reusable.reuse_pages_success, "");
2545 SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2546 &vm_page_stats_reusable.reuse_pages_failure, "");
2547 SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2548 &vm_page_stats_reusable.all_reuse_calls, "");
2549 SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
2550 &vm_page_stats_reusable.partial_reuse_calls, "");
2551 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
2552 &vm_page_stats_reusable.can_reuse_success, "");
2553 SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
2554 &vm_page_stats_reusable.can_reuse_failure, "");
2555 SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
2556 &vm_page_stats_reusable.reusable_reclaimed, "");
2557 SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
2558 &vm_page_stats_reusable.reusable_nonwritable, "");
2559 SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2560 &vm_page_stats_reusable.reusable_shared, "");
2561 SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
2562 &vm_page_stats_reusable.free_shared, "");
2563
2564
2565 extern unsigned int vm_page_free_count, vm_page_speculative_count;
2566 SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
2567 SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");
2568
2569 extern unsigned int vm_page_cleaned_count;
2570 SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");
2571
2572 extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
2573 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
2574 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
2575
2576 /* pageout counts */
2577 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
2578 SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
2579
2580 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
2581 SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
2582 SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2583 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
2584 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
2585 SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
2586
2587 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_sharedcache, "");
2588 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_sharedcache, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache, "");
2589 SYSCTL_ULONG(_vm, OID_AUTO, pageout_protected_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_protected_realtime, "");
2590 SYSCTL_ULONG(_vm, OID_AUTO, pageout_forcereclaimed_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime, "");
2591 extern unsigned int vm_page_realtime_count;
2592 SYSCTL_UINT(_vm, OID_AUTO, page_realtime_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_realtime_count, 0, "");
2593 extern int vm_pageout_protect_realtime;
2594 SYSCTL_INT(_vm, OID_AUTO, pageout_protect_realtime, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_protect_realtime, 0, "");
2595
2596 /* counts of pages prefaulted when entering a memory object */
2597 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
2598 extern int64_t vm_prefault_nb_no_page, vm_prefault_nb_wrong_page;
2599 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
2600 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
2601 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_no_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_no_page, "");
2602 SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_wrong_page, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_wrong_page, "");
2603
2604 #if defined (__x86_64__)
2605 extern unsigned int vm_clump_promote_threshold;
2606 SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
2607 #if DEVELOPMENT || DEBUG
2608 extern unsigned long vm_clump_stats[];
2609 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
2610 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
2611 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
2612 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
2613 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
2614 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
2615 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
2616 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
2617 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
2618 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
2619 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
2620 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
2621 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
2622 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
2623 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
2624 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
2625 extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
2626 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
2627 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
2628 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
2629 SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
2630 #endif /* if DEVELOPMENT || DEBUG */
2631 #endif /* #if defined (__x86_64__) */
2632
2633 #if CONFIG_SECLUDED_MEMORY
2634
2635 SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
2636 extern unsigned int vm_page_secluded_target;
2637 extern unsigned int vm_page_secluded_count;
2638 extern unsigned int vm_page_secluded_count_free;
2639 extern unsigned int vm_page_secluded_count_inuse;
2640 extern unsigned int vm_page_secluded_count_over_target;
2641 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
2642 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
2643 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
2644 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
2645 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");
2646
2647 extern struct vm_page_secluded_data vm_page_secluded;
2648 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
2649 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
2650 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
2651 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
2652 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
2653 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_realtime, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_realtime, 0, "");
2654 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
2655 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
2656 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
2657
2658 #endif /* CONFIG_SECLUDED_MEMORY */
2659
2660 #if CONFIG_DEFERRED_RECLAIM
2661 #pragma mark Deferred Reclaim
2662 SYSCTL_NODE(_vm, OID_AUTO, reclaim, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Deferred Memory Reclamation");
2663 #if DEVELOPMENT || DEBUG
2664 /*
2665 * VM reclaim testing
2666 */
2667 extern bool vm_deferred_reclamation_block_until_task_has_been_reclaimed(task_t task);
2668
2669 static int
2670 sysctl_vm_reclaim_wait_for_pid SYSCTL_HANDLER_ARGS
2671 {
2672 int error = EINVAL, pid = 0;
2673 /*
2674 * Only wait on write
2675 */
2676 error = sysctl_handle_int(oidp, &pid, 0, req);
2677 if (error || !req->newptr) {
2678 return error;
2679 }
2680 if (pid <= 0) {
2681 return EINVAL;
2682 }
2683 proc_t p = proc_find(pid);
2684 if (p == PROC_NULL) {
2685 return ESRCH;
2686 }
2687 task_t t = proc_task(p);
2688 if (t == TASK_NULL) {
2689 proc_rele(p);
2690 return ESRCH;
2691 }
2692 task_reference(t);
2693 proc_rele(p);
2694
2695 bool success = vm_deferred_reclamation_block_until_task_has_been_reclaimed(t);
2696 if (success) {
2697 error = 0;
2698 }
2699 task_deallocate(t);
2700
2701 return error;
2702 }
2703
2704 SYSCTL_PROC(_vm_reclaim, OID_AUTO, wait_for_pid,
2705 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2706 &sysctl_vm_reclaim_wait_for_pid, "I",
2707 "Block until the given pid has been drained by kernel GC");
2708
2709 static int
2710 sysctl_vm_reclaim_drain_pid SYSCTL_HANDLER_ARGS
2711 {
2712 int error = EINVAL;
2713 kern_return_t kr;
2714 pid_t pid;
2715 error = sysctl_handle_int(oidp, &pid, 0, req);
2716 /* Only reclaim on write */
2717 if (error || !req->newptr) {
2718 return error;
2719 }
2720 if (pid <= 0) {
2721 return EINVAL;
2722 }
2723 proc_t p = proc_find(pid);
2724 if (p == PROC_NULL) {
2725 return ESRCH;
2726 }
2727 task_t t = proc_task(p);
2728 if (t == TASK_NULL) {
2729 proc_rele(p);
2730 return ESRCH;
2731 }
2732 task_reference(t);
2733 proc_rele(p);
2734 kr = vm_deferred_reclamation_task_drain(t, RECLAIM_OPTIONS_NONE);
2735 task_deallocate(t);
2736 return mach_to_bsd_errno(kr);
2737 }
2738
2739 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_pid,
2740 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2741 &sysctl_vm_reclaim_drain_pid, "I",
2742 "Drain the deferred reclamation buffer for a pid");
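/*
 * Example invocations (DEVELOPMENT || DEBUG kernels only; exact shell
 * syntax assumed):
 *
 *	sysctl vm.reclaim.drain_pid=1234     # synchronously drain pid 1234
 *	sysctl vm.reclaim.wait_for_pid=1234  # block until GC reclaims it
 */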
2743
2744 static int
2745 proc_filter_reclaimable(proc_t p, __unused void *arg)
2746 {
2747 task_t task = proc_task(p);
2748 return vm_deferred_reclamation_task_has_ring(task);
2749 }
2750
2751 static int
2752 proc_reclaim_drain(proc_t p, __unused void *arg)
2753 {
2754 kern_return_t kr;
2755 task_t task = proc_task(p);
2756 kr = vm_deferred_reclamation_task_drain(task, RECLAIM_OPTIONS_NONE);
2757 return mach_to_bsd_errno(kr);
2758 }
2759
2760 static int
2761 sysctl_vm_reclaim_drain_all SYSCTL_HANDLER_ARGS
2762 {
2763 int error;
2764 int val;
2765 if (!req->newptr) {
2766 return EINVAL;
2767 }
2768 error = sysctl_handle_int(oidp, &val, 0, req);
2769 if (error || val == FALSE) {
2770 return error;
2771 }
2772 proc_iterate(PROC_ALLPROCLIST, proc_reclaim_drain, NULL,
2773 proc_filter_reclaimable, NULL);
2774 return 0;
2775 }
2776
2777 SYSCTL_PROC(_vm_reclaim, OID_AUTO, drain_all,
2778 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0,
2779 &sysctl_vm_reclaim_drain_all, "I",
2780 "Fully reclaim from every deferred reclamation buffer on the system");
2781
2782 extern uint32_t vm_reclaim_buffer_count;
2783 extern uint64_t vm_reclaim_gc_epoch;
2784 extern uint64_t vm_reclaim_gc_reclaim_count;
2785 extern uint64_t vm_reclaim_sampling_period_abs;
2786 extern uint64_t vm_reclaim_sampling_period_ns;
2787 extern bool vm_reclaim_debug;
2788 #if XNU_TARGET_OS_IOS
2789 extern uint64_t vm_reclaim_max_threshold;
2790 #else /* !XNU_TARGET_OS_IOS */
2791 extern bool vm_reclaim_enabled;
2792 extern uint32_t vm_reclaim_autotrim_pct_normal;
2793 extern uint32_t vm_reclaim_autotrim_pct_pressure;
2794 extern uint32_t vm_reclaim_autotrim_pct_critical;
2795 extern uint32_t vm_reclaim_wma_weight_base;
2796 extern uint32_t vm_reclaim_wma_weight_cur;
2797 extern uint32_t vm_reclaim_wma_denom;
2798 extern uint64_t vm_reclaim_abandonment_threshold;
2799 #endif /* XNU_TARGET_OS_IOS */
2800
2801 SYSCTL_UINT(_vm_reclaim, OID_AUTO, reclaim_buffer_count,
2802 CTLFLAG_RD | CTLFLAG_LOCKED, (uint32_t *)&vm_reclaim_buffer_count, 0,
2803 "The number of deferred memory buffers currently alive");
2804 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_epoch,
2805 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_epoch,
2806 "Number of times the global GC thread has run");
2807 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, reclaim_gc_reclaim_count,
2808 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_gc_reclaim_count,
2809 "Number of times the global GC thread has reclaimed from a buffer");
2810 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, debug,
2811 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_debug, 0,
2812 "Debug logs for vm.reclaim");
2813 #if XNU_TARGET_OS_IOS
2814 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, max_threshold,
2815 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_max_threshold,
2816 "Maximum amount of virtual memory (in B) that may be deferred without "
2817 "synchronous reclamation");
2818 #else /* !XNU_TARGET_OS_IOS */
2819 SYSCTL_COMPAT_UINT(_vm_reclaim, OID_AUTO, enabled,
2820 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_enabled, 0,
2821 "Whether deferred memory reclamation is enabled on this system");
2822 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_normal,
2823 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_normal, 0,
2824 "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2825 "to engage auto-trim when the system is operating normally");
2826 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_pressure,
2827 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_pressure, 0,
2828 "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2829 "to engage auto-trim when the system is under memory pressure");
2830 SYSCTL_UINT(_vm_reclaim, OID_AUTO, autotrim_pct_critical,
2831 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_autotrim_pct_critical, 0,
2832 "Percentage of a task's lifetime max phys_footprint that must be reclaimable "
2833 "to engage auto-trim when the system is under critical memory pressure");
2834 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_base,
2835 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_base, 0,
2836 "Weight applied to historical minimum buffer size samples");
2837 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_weight_cur,
2838 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_weight_cur, 0,
2839 "Weight applied to current sampled minimum buffer size");
2840 SYSCTL_UINT(_vm_reclaim, OID_AUTO, wma_denom,
2841 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_wma_denom, 0,
2842 "Denominator for weighted moving average calculation");
2843 SYSCTL_QUAD(_vm_reclaim, OID_AUTO, abandonment_threshold,
2844 CTLFLAG_RW | CTLFLAG_LOCKED, &vm_reclaim_abandonment_threshold,
2845 "The number of sampling periods between accounting updates that may elapse "
2846 "before the buffer is considered \"abandoned\"");
2847 #endif /* XNU_TARGET_OS_IOS */
2848
2849 static int
2850 sysctl_vm_reclaim_sampling_period SYSCTL_HANDLER_ARGS
2851 {
2852 uint64_t new_val_ns;
2853 uint64_t old_val_ns = vm_reclaim_sampling_period_ns;
2854 int err = sysctl_io_number(req, vm_reclaim_sampling_period_ns,
2855 sizeof(vm_reclaim_sampling_period_ns), &new_val_ns, NULL);
2856 if (err || !req->newptr) {
2857 return err;
2858 }
2859 if (new_val_ns != old_val_ns) {
2860 vm_reclaim_sampling_period_ns = new_val_ns;
2861 nanoseconds_to_absolutetime(vm_reclaim_sampling_period_ns, &vm_reclaim_sampling_period_abs);
2862 }
2863 return 0;
2864 }
2865
2866 SYSCTL_PROC(_vm_reclaim, OID_AUTO, sampling_period_ns,
2867 CTLFLAG_RW | CTLTYPE_QUAD | CTLFLAG_LOCKED, NULL, 0, sysctl_vm_reclaim_sampling_period, "QU",
2868 "Interval (nanoseconds) at which to sample the minimum buffer size and "
2869 "consider trimming excess");
2870 #endif /* DEVELOPMENT || DEBUG */
2871 #endif /* CONFIG_DEFERRED_RECLAIM */
2872
2873 #include <kern/thread.h>
2874 #include <sys/user.h>
2875
2876 void vm_pageout_io_throttle(void);
2877
2878 void
2879 vm_pageout_io_throttle(void)
2880 {
2881 struct uthread *uthread = current_uthread();
2882
2883 /*
2884 * If the thread is marked as a low priority I/O type
2885 * and the I/O we issued while in this cleaning operation
2886 * collided with normal I/O operations, we'll
2887 * delay in order to mitigate the impact of this
2888 * task on the normal operation of the system.
2889 */
2890
2891 if (uthread->uu_lowpri_window) {
2892 throttle_lowpri_io(1);
2893 }
2894 }
2895
2896 int
2897 vm_pressure_monitor(
2898 __unused struct proc *p,
2899 struct vm_pressure_monitor_args *uap,
2900 int *retval)
2901 {
2902 kern_return_t kr;
2903 uint32_t pages_reclaimed;
2904 uint32_t pages_wanted;
2905
2906 kr = mach_vm_pressure_monitor(
2907 (boolean_t) uap->wait_for_pressure,
2908 uap->nsecs_monitored,
2909 (uap->pages_reclaimed) ? &pages_reclaimed : NULL,
2910 &pages_wanted);
2911
2912 switch (kr) {
2913 case KERN_SUCCESS:
2914 break;
2915 case KERN_ABORTED:
2916 return EINTR;
2917 default:
2918 return EINVAL;
2919 }
2920
2921 if (uap->pages_reclaimed) {
2922 if (copyout((void *)&pages_reclaimed,
2923 uap->pages_reclaimed,
2924 sizeof(pages_reclaimed)) != 0) {
2925 return EFAULT;
2926 }
2927 }
2928
2929 *retval = (int) pages_wanted;
2930 return 0;
2931 }
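/*
 * Hypothetical user-space sketch (stub name and argument semantics
 * assumed from the syscall definition):
 *
 *	uint32_t pages_reclaimed = 0;
 *	int pages_wanted = vm_pressure_monitor(
 *		1,                   // wait_for_pressure
 *		1000000000,          // nsecs_monitored
 *		&pages_reclaimed);   // out: pages reclaimed while monitoring
 *	// negative return => error; otherwise the pageout daemon's demand
 */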
2932
2933 int
2934 kas_info(struct proc *p,
2935 struct kas_info_args *uap,
2936 int *retval __unused)
2937 {
2938 #ifndef CONFIG_KAS_INFO
2939 (void)p;
2940 (void)uap;
2941 return ENOTSUP;
2942 #else /* CONFIG_KAS_INFO */
2943 int selector = uap->selector;
2944 user_addr_t valuep = uap->value;
2945 user_addr_t sizep = uap->size;
2946 user_size_t size, rsize;
2947 int error;
2948
2949 if (!kauth_cred_issuser(kauth_cred_get())) {
2950 return EPERM;
2951 }
2952
2953 #if CONFIG_MACF
2954 error = mac_system_check_kas_info(kauth_cred_get(), selector);
2955 if (error) {
2956 return error;
2957 }
2958 #endif
2959
2960 if (IS_64BIT_PROCESS(p)) {
2961 user64_size_t size64;
2962 error = copyin(sizep, &size64, sizeof(size64));
2963 size = (user_size_t)size64;
2964 } else {
2965 user32_size_t size32;
2966 error = copyin(sizep, &size32, sizeof(size32));
2967 size = (user_size_t)size32;
2968 }
2969 if (error) {
2970 return error;
2971 }
2972
2973 switch (selector) {
2974 case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
2975 {
2976 uint64_t slide = vm_kernel_slide;
2977
2978 if (sizeof(slide) != size) {
2979 return EINVAL;
2980 }
2981
2982 error = copyout(&slide, valuep, sizeof(slide));
2983 if (error) {
2984 return error;
2985 }
2986 rsize = size;
2987 }
2988 break;
2989 case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
2990 {
2991 uint32_t i;
2992 kernel_mach_header_t *mh = &_mh_execute_header;
2993 struct load_command *cmd;
2994 cmd = (struct load_command*) &mh[1];
2995 uint64_t *bases;
2996 rsize = mh->ncmds * sizeof(uint64_t);
2997
2998 /*
2999 * Return the size if no data was passed
3000 */
3001 if (valuep == 0) {
3002 break;
3003 }
3004
3005 if (rsize > size) {
3006 return EINVAL;
3007 }
3008
3009 bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
3010
3011 for (i = 0; i < mh->ncmds; i++) {
3012 if (cmd->cmd == LC_SEGMENT_KERNEL) {
3013 __IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
3014 bases[i] = (uint64_t)sg->vmaddr;
3015 }
3016 cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
3017 }
3018
3019 error = copyout(bases, valuep, rsize);
3020
3021 kfree_data(bases, rsize);
3022
3023 if (error) {
3024 return error;
3025 }
3026 }
3027 break;
3028 case KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR:
3029 case KAS_INFO_TXM_TEXT_SLIDE_SELECTOR:
3030 {
3031 #if CONFIG_SPTM
3032 const uint64_t slide =
3033 (selector == KAS_INFO_SPTM_TEXT_SLIDE_SELECTOR) ? vm_sptm_offsets.slide : vm_txm_offsets.slide;
3034 #else
3035 const uint64_t slide = 0;
3036 #endif
3037
3038 if (sizeof(slide) != size) {
3039 return EINVAL;
3040 }
3041
3042 error = copyout(&slide, valuep, sizeof(slide));
3043 if (error) {
3044 return error;
3045 }
3046 rsize = size;
3047 }
3048 break;
3049 default:
3050 return EINVAL;
3051 }
3052
3053 if (IS_64BIT_PROCESS(p)) {
3054 user64_size_t size64 = (user64_size_t)rsize;
3055 error = copyout(&size64, sizep, sizeof(size64));
3056 } else {
3057 user32_size_t size32 = (user32_size_t)rsize;
3058 error = copyout(&size32, sizep, sizeof(size32));
3059 }
3060
3061 return error;
3062 #endif /* CONFIG_KAS_INFO */
3063 }
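/*
 * Root-only usage sketch (declaration in <sys/kas_info.h>):
 *
 *	uint64_t slide = 0;
 *	size_t size = sizeof(slide);
 *	if (kas_info(KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR, &slide, &size) == 0) {
 *		// kernel text runs at its static link address plus "slide"
 *	}
 */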
3064
3065 #pragma clang diagnostic push
3066 #pragma clang diagnostic ignored "-Wcast-qual"
3067 #pragma clang diagnostic ignored "-Wunused-function"
3068
3069 static void
3070 asserts()
3071 {
3072 static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
3073 static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
3074 }
3075
3076 SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
3077 SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
3078 #pragma clang diagnostic pop
3079
3080 extern uint32_t vm_page_pages;
3081 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
3082
3083 extern uint32_t vm_page_busy_absent_skipped;
3084 SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
3085
3086 extern uint32_t vm_page_upl_tainted;
3087 SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");
3088
3089 extern uint32_t vm_page_iopl_tainted;
3090 SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");
3091
3092 #if __arm64__ && (DEVELOPMENT || DEBUG)
3093 extern int vm_footprint_suspend_allowed;
3094 SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
3095
3096 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
3097 static int
3098 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
3099 {
3100 #pragma unused(oidp, arg1, arg2)
3101 int error = 0;
3102 int new_value;
3103
3104 if (req->newptr == USER_ADDR_NULL) {
3105 return 0;
3106 }
3107 error = SYSCTL_IN(req, &new_value, sizeof(int));
3108 if (error) {
3109 return error;
3110 }
3111 if (!vm_footprint_suspend_allowed) {
3112 if (new_value != 0) {
3113 /* suspends are not allowed... */
3114 return 0;
3115 }
3116 /* ... but let resumes proceed */
3117 }
3118 DTRACE_VM2(footprint_suspend,
3119 vm_map_t, current_map(),
3120 int, new_value);
3121
3122 pmap_footprint_suspend(current_map(), new_value);
3123
3124 return 0;
3125 }
3126 SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
3127 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
3128 0, 0, &sysctl_vm_footprint_suspend, "I", "");
3129 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */

extern uint64_t vm_map_corpse_footprint_count;
extern uint64_t vm_map_corpse_footprint_size_avg;
extern uint64_t vm_map_corpse_footprint_size_max;
extern uint64_t vm_map_corpse_footprint_full;
extern uint64_t vm_map_corpse_footprint_no_buf;
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");

#if CODE_SIGNING_MONITOR
extern uint64_t vm_cs_defer_to_csm;
extern uint64_t vm_cs_defer_to_csm_not;
SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm, "");
SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_csm_not,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_csm_not, "");
#endif /* CODE_SIGNING_MONITOR */

extern uint64_t shared_region_pager_copied;
extern uint64_t shared_region_pager_slid;
extern uint64_t shared_region_pager_slid_error;
extern uint64_t shared_region_pager_reclaimed;
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
extern int shared_region_destroy_delay;
SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");

#if MACH_ASSERT
extern int pmap_ledgers_panic_leeway;
SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
#endif /* MACH_ASSERT */


extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_count;
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_size;
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_max;
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart;
extern uint64_t vm_map_lookup_and_lock_object_copy_slowly_error;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_count;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_size;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_max;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart;
extern uint64_t vm_map_lookup_and_lock_object_copy_strategically_error;
extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_count;
extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_size;
extern uint64_t vm_map_lookup_and_lock_object_copy_shadow_max;
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_restart, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_slowly_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_restart, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_strategically_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_and_lock_object_copy_shadow_max, "");

extern int vm_protect_privileged_from_untrusted;
SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
extern uint64_t vm_copied_on_read;
extern uint64_t vm_copied_on_read_kernel_map;
extern uint64_t vm_copied_on_read_platform_map;
SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");
SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_kernel_map,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_kernel_map, "");
SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read_platform_map,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read_platform_map, "");

extern int vm_shared_region_count;
extern int vm_shared_region_peak;
SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
#if DEVELOPMENT || DEBUG
extern unsigned int shared_region_pagers_resident_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
extern unsigned int shared_region_pagers_resident_peak;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
extern int shared_region_pager_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
#if __has_feature(ptrauth_calls)
extern int shared_region_key_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
extern int vm_shared_region_reslide_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
#endif /* __has_feature(ptrauth_calls) */
#endif /* DEVELOPMENT || DEBUG */

#if MACH_ASSERT
extern int debug4k_filter;
SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
extern int debug4k_panic_on_terminate;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
extern int debug4k_panic_on_exception;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
extern int debug4k_panic_on_misaligned_sharing;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
#endif /* MACH_ASSERT */

extern uint64_t vm_map_set_size_limit_count;
extern uint64_t vm_map_set_data_limit_count;
extern uint64_t vm_map_enter_RLIMIT_AS_count;
extern uint64_t vm_map_enter_RLIMIT_DATA_count;
SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");

extern uint64_t vm_fault_resilient_media_initiate;
extern uint64_t vm_fault_resilient_media_retry;
extern uint64_t vm_fault_resilient_media_proceed;
extern uint64_t vm_fault_resilient_media_release;
extern uint64_t vm_fault_resilient_media_abort1;
extern uint64_t vm_fault_resilient_media_abort2;
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
#if MACH_ASSERT
extern int vm_fault_resilient_media_inject_error1_rate;
extern int vm_fault_resilient_media_inject_error1;
extern int vm_fault_resilient_media_inject_error2_rate;
extern int vm_fault_resilient_media_inject_error2;
extern int vm_fault_resilient_media_inject_error3_rate;
extern int vm_fault_resilient_media_inject_error3;
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
#endif /* MACH_ASSERT */

extern uint64_t pmap_query_page_info_retries;
SYSCTL_QUAD(_vm, OID_AUTO, pmap_query_page_info_retries, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_query_page_info_retries, "");

/*
 * A sysctl that causes all existing shared regions to become stale. They
 * will no longer be used by anything new and will be torn down as soon as
 * the last existing user exits. Writing a non-zero value triggers this.
 * This should only be used by launchd, so we check that the caller is initproc.
 */
static int
shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	unsigned int value = 0;
	int changed = 0;
	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
	if (error || !changed) {
		return error;
	}
	if (current_proc() != initproc) {
		return EPERM;
	}

	vm_shared_region_pivot();

	return 0;
}

SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, 0, shared_region_pivot, "I", "");
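
/*
 * Editorial note: since only initproc may write this OID, the pivot is
 * effectively a launchd-internal operation. A minimal sketch of the write
 * (not part of this file's build; any other caller gets EPERM):
 *
 *	#include <sys/sysctl.h>
 *
 *	int one = 1;
 *	int rc = sysctlbyname("vm.shared_region_pivot", NULL, NULL, &one, sizeof(one));
 */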

extern uint64_t vm_object_shadow_forced;
extern uint64_t vm_object_shadow_skipped;
SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_object_shadow_forced, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_shadow_skipped, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_object_shadow_skipped, "");

extern uint64_t vm_object_upl_throttle_cnt;
SYSCTL_QUAD(_vm, OID_AUTO, object_upl_throttle_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_object_upl_throttle_cnt,
    "Number of times a UPL write was throttled due to pageout starvation");


SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vmtc_total, 0, "total text page corruptions detected");


#if DEBUG || DEVELOPMENT
/*
 * A sysctl that can be used to corrupt a text page with an illegal instruction.
 * Used for testing text page self healing.
 */
extern kern_return_t vm_corrupt_text_addr(uintptr_t);
static int
corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	uint64_t value = 0;
	int error = sysctl_handle_quad(oidp, &value, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	if (vm_corrupt_text_addr((uintptr_t)value) == KERN_SUCCESS) {
		return 0;
	} else {
		return EINVAL;
	}
}

SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
    CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, corrupt_text_addr, "-", "");
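
/*
 * Editorial note: a self-healing test can exercise the OID above by handing
 * it the address of a page in its own text segment. A minimal sketch (not
 * part of this file's build; DEBUG/DEVELOPMENT kernels only):
 *
 *	#include <stdint.h>
 *	#include <sys/sysctl.h>
 *
 *	int main(void);
 *	uint64_t addr = (uint64_t)(uintptr_t)&main;   // any mapped text address
 *	sysctlbyname("vm.corrupt_text_addr", NULL, NULL, &addr, sizeof(addr));
 */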
#endif /* DEBUG || DEVELOPMENT */

#if CONFIG_MAP_RANGES
/*
 * vm.malloc_ranges
 *
 * space-separated list of <left:right> hexadecimal addresses.
 */
static int
vm_map_malloc_ranges SYSCTL_HANDLER_ARGS
{
	vm_map_t map = current_map();
	struct mach_vm_range r1, r2;
	char str[20 * 4];
	int len;
	mach_vm_offset_t right_hole_max;

	if (vm_map_get_user_range(map, UMEM_RANGE_ID_DEFAULT, &r1)) {
		return ENOENT;
	}
	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r2)) {
		return ENOENT;
	}

#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	right_hole_max = MACH_VM_JUMBO_ADDRESS;
#else /* !XNU_TARGET_OS_IOS || !EXTENDED_USER_VA_SUPPORT */
	right_hole_max = get_map_max(map);
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */

	len = scnprintf(str, sizeof(str), "0x%llx:0x%llx 0x%llx:0x%llx",
	    r1.max_address, r2.min_address,
	    r2.max_address, right_hole_max);

	return SYSCTL_OUT(req, str, len);
}

SYSCTL_PROC(_vm, OID_AUTO, malloc_ranges,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &vm_map_malloc_ranges, "A", "");
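
/*
 * Editorial note: the string produced above describes the two VA holes that
 * bracket the heap range of the calling process. A minimal user-space sketch
 * of reading it (not part of this file's build):
 *
 *	#include <stdio.h>
 *	#include <sys/sysctl.h>
 *
 *	char buf[80];
 *	size_t len = sizeof(buf);
 *	if (sysctlbyname("vm.malloc_ranges", buf, &len, NULL, 0) == 0) {
 *		printf("holes: %.*s\n", (int)len, buf);
 *	}
 */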

#if DEBUG || DEVELOPMENT
static int
vm_map_user_range_default SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct mach_vm_range range;

	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_DEFAULT, &range)
	    != KERN_SUCCESS) {
		return EINVAL;
	}

	return SYSCTL_OUT(req, &range, sizeof(range));
}

static int
vm_map_user_range_heap SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct mach_vm_range range;

	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_HEAP, &range)
	    != KERN_SUCCESS) {
		return EINVAL;
	}

	return SYSCTL_OUT(req, &range, sizeof(range));
}

static int
vm_map_user_range_large_file SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct mach_vm_range range;

	if (vm_map_get_user_range(current_map(), UMEM_RANGE_ID_LARGE_FILE, &range)
	    != KERN_SUCCESS) {
		return EINVAL;
	}

	return SYSCTL_OUT(req, &range, sizeof(range));
}

/*
 * Sysctls that can be used to return ranges for the current VM map.
 * Used for testing VM ranges.
 */
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_default, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_default, "S,mach_vm_range", "");
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_heap, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_heap, "S,mach_vm_range", "");
SYSCTL_PROC(_vm, OID_AUTO, vm_map_user_range_large_file, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &vm_map_user_range_large_file, "S,mach_vm_range", "");
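
/*
 * Editorial note: each OID above emits a raw struct mach_vm_range for the
 * calling process. A minimal sketch of reading one (not part of this file's
 * build; DEBUG/DEVELOPMENT kernels only, and the header providing
 * struct mach_vm_range to user space is an editorial assumption):
 *
 *	#include <mach/vm_types.h>   // assumed to declare struct mach_vm_range
 *	#include <sys/sysctl.h>
 *
 *	struct mach_vm_range heap;
 *	size_t len = sizeof(heap);
 *	sysctlbyname("vm.vm_map_user_range_heap", &heap, &len, NULL, 0);
 */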

#endif /* DEBUG || DEVELOPMENT */
#endif /* CONFIG_MAP_RANGES */

extern uint64_t vm_map_range_overflows_count;
SYSCTL_QUAD(_vm, OID_AUTO, map_range_overflows_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_range_overflows_count, "");
extern boolean_t vm_map_range_overflows_log;
SYSCTL_INT(_vm, OID_AUTO, map_range_overflows_log, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_range_overflows_log, 0, "");

extern uint64_t c_seg_filled_no_contention;
extern uint64_t c_seg_filled_contention;
extern clock_sec_t c_seg_filled_contention_sec_max;
extern clock_nsec_t c_seg_filled_contention_nsec_max;
SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
#if (XNU_TARGET_OS_OSX && __arm64__)
extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
extern int c_process_major_yield_after; /* yield after moving ? segments */
extern uint64_t c_process_major_reports;
extern clock_sec_t c_process_major_max_sec;
extern clock_nsec_t c_process_major_max_nsec;
extern uint32_t c_process_major_peak_segcount;
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */

#if DEVELOPMENT || DEBUG
extern int panic_object_not_alive;
SYSCTL_INT(_vm, OID_AUTO, panic_object_not_alive, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &panic_object_not_alive, 0, "");
#endif /* DEVELOPMENT || DEBUG */

#if FBDP_DEBUG_OBJECT_NO_PAGER
extern int fbdp_no_panic;
SYSCTL_INT(_vm, OID_AUTO, fbdp_no_panic, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, &fbdp_no_panic, 0, "");
#endif /* FBDP_DEBUG_OBJECT_NO_PAGER */

extern uint64_t cluster_direct_write_wired;
SYSCTL_QUAD(_vm, OID_AUTO, cluster_direct_write_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &cluster_direct_write_wired, "");

extern uint64_t vm_object_pageout_not_on_queue;
extern uint64_t vm_object_pageout_not_pageable;
extern uint64_t vm_object_pageout_pageable;
extern uint64_t vm_object_pageout_active_local;
SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_on_queue, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_on_queue, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_not_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_not_pageable, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_pageable, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_pageable, "");
SYSCTL_QUAD(_vm, OID_AUTO, object_pageout_active_local, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_object_pageout_active_local, "");


#if DEVELOPMENT || DEBUG

static uint32_t
sysctl_compressor_seg_magic(vm_c_serialize_add_data_t with_data)
{
#pragma unused(with_data)
	return VM_C_SEGMENT_INFO_MAGIC;
}

/* The largest possible single segment + its slots is
 * (sizeof(c_segment_info) + C_SLOT_MAX_INDEX * sizeof(c_slot_info)) + (data of a single segment) */
#define SYSCTL_SEG_BUF_SIZE (8 * 1024 + 64 * 1024)

extern uint32_t c_segments_available;

struct sysctl_buf_header {
	uint32_t magic;
} __attribute__((packed));

/* This sysctl iterates over the populated c_segments and writes some info about each one and its slots.
 * Instead of doing everything here, the per-segment serialization is delegated to vm_compressor.c. */
static int
sysctl_compressor_segments_stream(struct sysctl_req *req, vm_c_serialize_add_data_t with_data)
{
	char* buf = kalloc_data(SYSCTL_SEG_BUF_SIZE, Z_WAITOK | Z_ZERO);
	if (!buf) {
		return ENOMEM;
	}
	size_t offset = 0;
	int error = 0;
	uint32_t segno = 0;
	/* 4 byte header to identify the version of the formatting of the data.
	 * This should be incremented if c_segment_info or c_slot_info are changed */
	((struct sysctl_buf_header*)buf)->magic = sysctl_compressor_seg_magic(with_data);
	offset += sizeof(uint32_t);

	while (segno < c_segments_available) {
		size_t left_sz = SYSCTL_SEG_BUF_SIZE - offset;
		kern_return_t kr = vm_compressor_serialize_segment_debug_info(segno, buf + offset, &left_sz, with_data);
		if (kr == KERN_NO_SPACE) {
			/* failed to add another segment, push the current buffer out and try again */
			if (offset == 0) {
				error = EINVAL; /* the buffer is empty yet the segment still doesn't fit; shouldn't really happen */
				goto out;
			}
			/* write out chunk */
			error = SYSCTL_OUT(req, buf, offset);
			if (error) {
				goto out;
			}
			offset = 0;
			bzero(buf, SYSCTL_SEG_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
			/* don't increment segno, need to try again saving the current one */
		} else if (kr != KERN_SUCCESS) {
			error = EINVAL;
			goto out;
		} else {
			offset += left_sz;
			++segno;
			assert(offset <= SYSCTL_SEG_BUF_SIZE);
		}
	}

	if (offset > 0) { /* write last chunk */
		error = SYSCTL_OUT(req, buf, offset);
	}

out:
	kfree_data(buf, SYSCTL_SEG_BUF_SIZE);
	return error;
}

static int
sysctl_compressor_segments(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	return sysctl_compressor_segments_stream(req, VM_C_SERIALIZE_DATA_NONE);
}
SYSCTL_PROC(_vm, OID_AUTO, compressor_segments, CTLTYPE_STRUCT | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_compressor_segments, "S", "");
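
/*
 * Editorial note: the stream above is a 4-byte magic followed by serialized
 * segments, flushed in SYSCTL_SEG_BUF_SIZE chunks. A minimal user-space
 * sketch of capturing it (not part of this file's build; the consumer must
 * tolerate zero padding at the tail of each chunk):
 *
 *	#include <stdlib.h>
 *	#include <sys/sysctl.h>
 *
 *	size_t len = 0;
 *	sysctlbyname("vm.compressor_segments", NULL, &len, NULL, 0);
 *	char *dump = malloc(len);
 *	if (dump != NULL &&
 *	    sysctlbyname("vm.compressor_segments", dump, &len, NULL, 0) == 0) {
 *		// first 4 bytes: VM_C_SEGMENT_INFO_MAGIC version stamp
 *	}
 */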


extern uint32_t vm_compressor_fragmentation_level(void);

static int
sysctl_compressor_fragmentation_level(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	uint32_t value = vm_compressor_fragmentation_level();
	return SYSCTL_OUT(req, &value, sizeof(value));
}

SYSCTL_PROC(_vm, OID_AUTO, compressor_fragmentation_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_fragmentation_level, "IU", "");

extern uint32_t vm_compressor_incore_fragmentation_wasted_pages(void);

static int
sysctl_compressor_incore_fragmentation_wasted_pages(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	uint32_t value = vm_compressor_incore_fragmentation_wasted_pages();
	return SYSCTL_OUT(req, &value, sizeof(value));
}

SYSCTL_PROC(_vm, OID_AUTO, compressor_incore_fragmentation_wasted_pages, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_compressor_incore_fragmentation_wasted_pages, "IU", "");


#define SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE (8 * 1024)


/* This sysctl iterates over all the entries of the vm_map of a given process and writes some info
 * about the vm_object pointed to by each entry. This can be used to map out where all of a
 * process's pages live in the compressor.
 */
static int
sysctl_task_vm_objects_slotmap(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
{
	int error = 0;
	char *buf = NULL;
	proc_t p = PROC_NULL;
	task_t task = TASK_NULL;
	vm_map_t map = VM_MAP_NULL;
	__block size_t offset = 0;

	/* go from pid to proc to task to vm_map. see sysctl_procargsx() for another example of this procession */
	int *name = arg1;
	int namelen = arg2;
	if (namelen < 1) {
		return EINVAL;
	}
	int pid = name[0];
	p = proc_find(pid); /* this takes a reference on the proc */
	if (p == PROC_NULL) {
		return EINVAL;
	}
	task = proc_task(p);
	if (task == TASK_NULL) {
		proc_rele(p);
		return EINVAL;
	}
	/* take a task reference while the proc reference still pins the task */
	task_reference(task);
	proc_rele(p); /* drop the proc reference */
	p = PROC_NULL;
	/* task reference to map reference */
	map = get_task_map_reference(task);
	task_deallocate(task);

	if (map == VM_MAP_NULL) {
		return EINVAL; /* nothing allocated yet */
	}

	buf = kalloc_data(SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE, Z_WAITOK | Z_ZERO);
	if (!buf) {
		error = ENOMEM;
		goto out;
	}

	/* 4 byte header to identify the version of the formatting of the data.
	 * This should be incremented if c_segment_info or c_slot_info are changed */
	((struct sysctl_buf_header*)buf)->magic = VM_MAP_ENTRY_INFO_MAGIC;
	offset += sizeof(uint32_t);

	kern_return_t (^write_header)(int) = ^kern_return_t (int nentries) {
		/* write the header; happens only once at the beginning so we should have enough space */
		assert(offset + sizeof(struct vm_map_info_hdr) < SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
		struct vm_map_info_hdr* out_hdr = (struct vm_map_info_hdr*)(buf + offset);
		out_hdr->vmi_nentries = nentries;
		offset += sizeof(struct vm_map_info_hdr);
		return KERN_SUCCESS;
	};

	kern_return_t (^write_entry)(void*) = ^kern_return_t (void* entry) {
		while (true) { /* try up to 2 times: first against the current buffer, then against a freshly flushed one */
			size_t left_sz = SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE - offset;
			kern_return_t kr = vm_map_dump_entry_and_compressor_pager(entry, buf + offset, &left_sz);
			if (kr == KERN_NO_SPACE) {
				/* failed to write anything, flush the current buffer and try again */
				if (offset == 0) {
					return KERN_FAILURE; /* the buffer is empty yet the entry still doesn't fit; shouldn't really happen */
				}
				/* write out chunk */
				int out_error = SYSCTL_OUT(req, buf, offset);
				if (out_error) {
					return KERN_FAILURE;
				}
				offset = 0;
				bzero(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE); /* zero any reserved bits that are not going to be filled */
				continue; /* need to retry the entry dump again with the cleaned buffer */
			} else if (kr != KERN_SUCCESS) {
				return kr;
			}
			offset += left_sz;
			break;
		}
		return KERN_SUCCESS;
	};

	/* this foreach first calls the first callback with the number of entries, then calls the second for every entry.
	 * when the buffer is exhausted, it is flushed to the sysctl and restarted */
	kern_return_t kr = vm_map_entries_foreach(map, write_header, write_entry);

	if (kr != KERN_SUCCESS) {
		goto out;
	}

	if (offset > 0) { /* last chunk */
		error = SYSCTL_OUT(req, buf, offset);
	}

out:
	if (buf != NULL) {
		kfree_data(buf, SYSCTL_VM_OBJECTS_SLOTMAP_BUF_SIZE);
	}
	if (map != VM_MAP_NULL) {
		vm_map_deallocate(map);
	}
	return error;
}

SYSCTL_PROC(_vm, OID_AUTO, task_vm_objects_slotmap, CTLTYPE_NODE | CTLFLAG_LOCKED | CTLFLAG_RD, 0, 0, sysctl_task_vm_objects_slotmap, "S", "");
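
/*
 * Editorial note: this OID is a CTLTYPE_NODE, so the target pid travels as
 * an extra MIB component rather than in the value payload. A minimal sketch
 * (not part of this file's build):
 *
 *	#include <stddef.h>
 *	#include <sys/sysctl.h>
 *	#include <unistd.h>
 *
 *	int mib[CTL_MAXNAME];
 *	size_t miblen = CTL_MAXNAME;
 *	if (sysctlnametomib("vm.task_vm_objects_slotmap", mib, &miblen) == 0) {
 *		mib[miblen] = getpid();     // becomes name[0] in the handler
 *		size_t len = 0;
 *		sysctl(mib, (u_int)(miblen + 1), NULL, &len, NULL, 0);
 *	}
 */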

static int
sysctl_vm_reset_tag SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error;
	int tag;
	kern_return_t kr;

	/* Need to be root */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	error = SYSCTL_IN(req, &tag, sizeof(tag));
	if (error) {
		return error;
	}

	if (tag < 0 || tag > VM_MAX_TAG_VALUE) {
		return EINVAL;
	}

	kr = vm_tag_reset_peak((vm_tag_t)tag);

	return mach_to_bsd_errno(kr);
}

SYSCTL_PROC(_vm, OID_AUTO, reset_tag,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
    0, 0, &sysctl_vm_reset_tag, "I", "");

static int
sysctl_vm_reset_all_tags SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	/* Only reset the values if the sysctl is a write */
	if (!req->newptr) {
		return EINVAL;
	}

	/* Need to be root */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	vm_tag_reset_all_peaks();

	return 0;
}

SYSCTL_PROC(_vm, OID_AUTO, reset_all_tags,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_LOCKED,
    0, 0, &sysctl_vm_reset_all_tags, "I", "");
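
/*
 * Editorial note: both reset OIDs above are root-only, write-only controls
 * for profiling peak usage per VM allocation tag. A minimal sketch of
 * resetting every peak (not part of this file's build; must run as root):
 *
 *	#include <sys/sysctl.h>
 *
 *	int one = 1;
 *	sysctlbyname("vm.reset_all_tags", NULL, NULL, &one, sizeof(one));
 */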

#endif /* DEVELOPMENT || DEBUG */
