xref: /xnu-8020.140.41/bsd/vm/vm_unix.c (revision 27b03b360a988dfd3dfdf34262bb0042026747cc) !
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * Mach Operating System
30  * Copyright (c) 1987 Carnegie-Mellon University
31  * All rights reserved.  The CMU software License Agreement specifies
32  * the terms and conditions for use and redistribution.
33  */
34 /*
35  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
36  * support for mandatory and extensible security protections.  This notice
37  * is included in support of clause 2.2 (b) of the Apple Public License,
38  * Version 2.0.
39  */
40 #include <vm/vm_options.h>
41 
42 #include <kern/task.h>
43 #include <kern/thread.h>
44 #include <kern/debug.h>
45 #include <kern/extmod_statistics.h>
46 #include <mach/mach_traps.h>
47 #include <mach/port.h>
48 #include <mach/sdt.h>
49 #include <mach/task.h>
50 #include <mach/task_access.h>
51 #include <mach/task_special_ports.h>
52 #include <mach/time_value.h>
53 #include <mach/vm_map.h>
54 #include <mach/vm_param.h>
55 #include <mach/vm_prot.h>
56 #include <machine/machine_routines.h>
57 
58 #include <sys/file_internal.h>
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/dir.h>
62 #include <sys/namei.h>
63 #include <sys/proc_internal.h>
64 #include <sys/kauth.h>
65 #include <sys/vm.h>
66 #include <sys/file.h>
67 #include <sys/vnode_internal.h>
68 #include <sys/mount.h>
69 #include <sys/xattr.h>
70 #include <sys/trace.h>
71 #include <sys/kernel.h>
72 #include <sys/ubc_internal.h>
73 #include <sys/user.h>
74 #include <sys/syslog.h>
75 #include <sys/stat.h>
76 #include <sys/sysproto.h>
77 #include <sys/mman.h>
78 #include <sys/sysctl.h>
79 #include <sys/cprotect.h>
80 #include <sys/kpi_socket.h>
81 #include <sys/kas_info.h>
82 #include <sys/socket.h>
83 #include <sys/socketvar.h>
84 #include <sys/random.h>
85 #if NECP
86 #include <net/necp.h>
87 #endif /* NECP */
88 #if SKYWALK
89 #include <skywalk/os_channel.h>
90 #endif /* SKYWALK */
91 
92 #include <security/audit/audit.h>
93 #include <security/mac.h>
94 #include <bsm/audit_kevents.h>
95 
96 #include <kern/kalloc.h>
97 #include <vm/vm_map.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_pageout.h>
100 
101 #include <mach/shared_region.h>
102 #include <vm/vm_shared_region.h>
103 
104 #include <vm/vm_protos.h>
105 
106 #include <sys/kern_memorystatus.h>
107 #include <sys/kern_memorystatus_freeze.h>
108 #include <sys/proc_internal.h>
109 
110 #if CONFIG_MACF
111 #include <security/mac_framework.h>
112 #endif
113 
114 #include <kern/bits.h>
115 
116 #if CONFIG_CSR
117 #include <sys/csr.h>
118 #endif /* CONFIG_CSR */
119 #include <IOKit/IOBSD.h>
120 
121 #if VM_MAP_DEBUG_APPLE_PROTECT
122 SYSCTL_INT(_vm, OID_AUTO, map_debug_apple_protect, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_apple_protect, 0, "");
123 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
124 
125 #if VM_MAP_DEBUG_FOURK
126 SYSCTL_INT(_vm, OID_AUTO, map_debug_fourk, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_map_debug_fourk, 0, "");
127 #endif /* VM_MAP_DEBUG_FOURK */
128 
129 #if DEVELOPMENT || DEBUG
130 
131 static int
132 sysctl_kmem_alloc_contig SYSCTL_HANDLER_ARGS
133 {
134 #pragma unused(arg1, arg2)
135 	vm_offset_t     kaddr;
136 	kern_return_t   kr;
137 	int     error = 0;
138 	int     size = 0;
139 
140 	error = sysctl_handle_int(oidp, &size, 0, req);
141 	if (error || !req->newptr) {
142 		return error;
143 	}
144 
145 	kr = kmem_alloc_contig(kernel_map, &kaddr, (vm_size_t)size,
146 	    0, 0, 0, KMA_DATA, VM_KERN_MEMORY_IOKIT);
147 
148 	if (kr == KERN_SUCCESS) {
149 		kmem_free(kernel_map, kaddr, size);
150 	}
151 
152 	return error;
153 }
154 
155 SYSCTL_PROC(_vm, OID_AUTO, kmem_alloc_contig, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
156     0, 0, &sysctl_kmem_alloc_contig, "I", "");
157 
158 extern int vm_region_footprint;
159 SYSCTL_INT(_vm, OID_AUTO, region_footprint, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, &vm_region_footprint, 0, "");
160 
161 #endif /* DEVELOPMENT || DEBUG */
162 
163 static int
164 sysctl_vm_self_region_footprint SYSCTL_HANDLER_ARGS
165 {
166 #pragma unused(arg1, arg2, oidp)
167 	int     error = 0;
168 	int     value;
169 
170 	value = task_self_region_footprint();
171 	error = SYSCTL_OUT(req, &value, sizeof(int));
172 	if (error) {
173 		return error;
174 	}
175 
176 	if (!req->newptr) {
177 		return 0;
178 	}
179 
180 	error = SYSCTL_IN(req, &value, sizeof(int));
181 	if (error) {
182 		return error;
183 	}
184 	task_self_region_footprint_set(value);
185 	return 0;
186 }
187 SYSCTL_PROC(_vm, OID_AUTO, self_region_footprint, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_footprint, "I", "");
188 
189 static int
190 sysctl_vm_self_region_page_size SYSCTL_HANDLER_ARGS
191 {
192 #pragma unused(arg1, arg2, oidp)
193 	int     error = 0;
194 	int     value;
195 
196 	value = (1 << thread_self_region_page_shift());
197 	error = SYSCTL_OUT(req, &value, sizeof(int));
198 	if (error) {
199 		return error;
200 	}
201 
202 	if (!req->newptr) {
203 		return 0;
204 	}
205 
206 	error = SYSCTL_IN(req, &value, sizeof(int));
207 	if (error) {
208 		return error;
209 	}
210 
211 	if (value != 0 && value != 4096 && value != 16384) {
212 		return EINVAL;
213 	}
214 
215 #if !__ARM_MIXED_PAGE_SIZE__
216 	if (value != vm_map_page_size(current_map())) {
217 		return EINVAL;
218 	}
219 #endif /* !__ARM_MIXED_PAGE_SIZE__ */
220 
221 	thread_self_region_page_shift_set(bit_first(value));
222 	return 0;
223 }
224 SYSCTL_PROC(_vm, OID_AUTO, self_region_page_size, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED, 0, 0, &sysctl_vm_self_region_page_size, "I", "");
225 
226 
227 #if DEVELOPMENT || DEBUG
228 extern int panic_on_unsigned_execute;
229 SYSCTL_INT(_vm, OID_AUTO, panic_on_unsigned_execute, CTLFLAG_RW | CTLFLAG_LOCKED, &panic_on_unsigned_execute, 0, "");
230 #endif /* DEVELOPMENT || DEBUG */
231 
232 extern int cs_executable_create_upl;
233 extern int cs_executable_wire;
234 SYSCTL_INT(_vm, OID_AUTO, cs_executable_create_upl, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_create_upl, 0, "");
235 SYSCTL_INT(_vm, OID_AUTO, cs_executable_wire, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_executable_wire, 0, "");
236 
237 extern int apple_protect_pager_count;
238 extern int apple_protect_pager_count_mapped;
239 extern unsigned int apple_protect_pager_cache_limit;
240 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count, 0, "");
241 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_count_mapped, CTLFLAG_RD | CTLFLAG_LOCKED, &apple_protect_pager_count_mapped, 0, "");
242 SYSCTL_UINT(_vm, OID_AUTO, apple_protect_pager_cache_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_cache_limit, 0, "");
243 
244 #if DEVELOPMENT || DEBUG
245 extern int radar_20146450;
246 SYSCTL_INT(_vm, OID_AUTO, radar_20146450, CTLFLAG_RW | CTLFLAG_LOCKED, &radar_20146450, 0, "");
247 
248 extern int macho_printf;
249 SYSCTL_INT(_vm, OID_AUTO, macho_printf, CTLFLAG_RW | CTLFLAG_LOCKED, &macho_printf, 0, "");
250 
251 extern int apple_protect_pager_data_request_debug;
252 SYSCTL_INT(_vm, OID_AUTO, apple_protect_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apple_protect_pager_data_request_debug, 0, "");
253 
254 #if __arm__ || __arm64__
255 /* These are meant to support the page table accounting unit test. */
256 extern unsigned int arm_hardware_page_size;
257 extern unsigned int arm_pt_desc_size;
258 extern unsigned int arm_pt_root_size;
259 extern unsigned int free_page_size_tt_count;
260 extern unsigned int free_two_page_size_tt_count;
261 extern unsigned int free_tt_count;
262 extern unsigned int inuse_user_tteroot_count;
263 extern unsigned int inuse_kernel_tteroot_count;
264 extern unsigned int inuse_user_ttepages_count;
265 extern unsigned int inuse_kernel_ttepages_count;
266 extern unsigned int inuse_user_ptepages_count;
267 extern unsigned int inuse_kernel_ptepages_count;
268 SYSCTL_UINT(_vm, OID_AUTO, native_hw_pagesize, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_hardware_page_size, 0, "");
269 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_desc_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_desc_size, 0, "");
270 SYSCTL_UINT(_vm, OID_AUTO, arm_pt_root_size, CTLFLAG_RD | CTLFLAG_LOCKED, &arm_pt_root_size, 0, "");
271 SYSCTL_UINT(_vm, OID_AUTO, free_1page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_page_size_tt_count, 0, "");
272 SYSCTL_UINT(_vm, OID_AUTO, free_2page_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_two_page_size_tt_count, 0, "");
273 SYSCTL_UINT(_vm, OID_AUTO, free_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &free_tt_count, 0, "");
274 SYSCTL_UINT(_vm, OID_AUTO, user_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_tteroot_count, 0, "");
275 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_root, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_tteroot_count, 0, "");
276 SYSCTL_UINT(_vm, OID_AUTO, user_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ttepages_count, 0, "");
277 SYSCTL_UINT(_vm, OID_AUTO, kernel_tte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ttepages_count, 0, "");
278 SYSCTL_UINT(_vm, OID_AUTO, user_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_user_ptepages_count, 0, "");
279 SYSCTL_UINT(_vm, OID_AUTO, kernel_pte_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &inuse_kernel_ptepages_count, 0, "");
280 #if DEVELOPMENT || DEBUG
281 extern unsigned long pmap_asid_flushes;
282 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_flushes, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_flushes, "");
283 extern unsigned long pmap_asid_hits;
284 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_hits, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_hits, "");
285 extern unsigned long pmap_asid_misses;
286 SYSCTL_ULONG(_vm, OID_AUTO, pmap_asid_misses, CTLFLAG_RD | CTLFLAG_LOCKED, &pmap_asid_misses, "");
287 #endif
288 #endif /* __arm__ || __arm64__ */
289 
290 #if __arm64__
291 extern int fourk_pager_data_request_debug;
292 SYSCTL_INT(_vm, OID_AUTO, fourk_pager_data_request_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &fourk_pager_data_request_debug, 0, "");
293 #endif /* __arm64__ */
294 #endif /* DEVELOPMENT || DEBUG */
295 
296 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor, 0, "");
297 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_compressor_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_compressor_pages, 0, "");
298 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate, 0, "");
299 SYSCTL_INT(_vm, OID_AUTO, vm_do_collapse_terminate_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.do_collapse_terminate_failure, 0, "");
300 SYSCTL_INT(_vm, OID_AUTO, vm_should_cow_but_wired, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.should_cow_but_wired, 0, "");
301 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow, 0, "");
302 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_extra_cow_pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_extra_cow_pages, 0, "");
303 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_write, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_write, 0, "");
304 SYSCTL_INT(_vm, OID_AUTO, vm_create_upl_lookup_failure_copy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_counters.create_upl_lookup_failure_copy, 0, "");
305 #if VM_SCAN_FOR_SHADOW_CHAIN
306 static int vm_shadow_max_enabled = 0;    /* Disabled by default */
307 extern int proc_shadow_max(void);
308 static int
309 vm_shadow_max SYSCTL_HANDLER_ARGS
310 {
311 #pragma unused(arg1, arg2, oidp)
312 	int value = 0;
313 
314 	if (vm_shadow_max_enabled) {
315 		value = proc_shadow_max();
316 	}
317 
318 	return SYSCTL_OUT(req, &value, sizeof(value));
319 }
320 SYSCTL_PROC(_vm, OID_AUTO, vm_shadow_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
321     0, 0, &vm_shadow_max, "I", "");
322 
323 SYSCTL_INT(_vm, OID_AUTO, vm_shadow_max_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_shadow_max_enabled, 0, "");
324 
325 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
326 
327 SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, "");
328 
329 __attribute__((noinline)) int __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(
330 	mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid, mach_task_flavor_t flavor);
331 /*
332  * Sysctl's related to data/stack execution.  See osfmk/vm/vm_map.c
333  */
334 
335 #if DEVELOPMENT || DEBUG
336 extern int allow_stack_exec, allow_data_exec;
337 
338 SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
339 SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");
340 
341 #endif /* DEVELOPMENT || DEBUG */
342 
343 static const char *prot_values[] = {
344 	"none",
345 	"read-only",
346 	"write-only",
347 	"read-write",
348 	"execute-only",
349 	"read-execute",
350 	"write-execute",
351 	"read-write-execute"
352 };
353 
354 void
log_stack_execution_failure(addr64_t vaddr,vm_prot_t prot)355 log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot)
356 {
357 	printf("Data/Stack execution not permitted: %s[pid %d] at virtual address 0x%qx, protections were %s\n",
358 	    current_proc()->p_comm, proc_getpid(current_proc()), vaddr, prot_values[prot & VM_PROT_ALL]);
359 }
360 
361 /*
362  * shared_region_unnest_logging: level of logging of unnesting events
363  * 0	- no logging
364  * 1	- throttled logging of unexpected unnesting events (default)
365  * 2	- unthrottled logging of unexpected unnesting events
366  * 3+	- unthrottled logging of all unnesting events
367  */
368 int shared_region_unnest_logging = 1;
369 
370 SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED,
371     &shared_region_unnest_logging, 0, "");
372 
373 int vm_shared_region_unnest_log_interval = 10;
374 int shared_region_unnest_log_count_threshold = 5;
375 
376 /*
377  * Shared cache path enforcement.
378  */
379 
380 #if XNU_TARGET_OS_OSX
381 
382 #if defined (__x86_64__)
383 static int scdir_enforce = 1;
384 #else /* defined (__x86_64__) */
385 static int scdir_enforce = 0;   /* AOT caches live elsewhere */
386 #endif /* defined (__x86_64__) */
387 
388 static char scdir_path[] = "/System/Library/dyld/";
389 
390 #else /* XNU_TARGET_OS_OSX */
391 
392 static int scdir_enforce = 0;
393 static char scdir_path[] = "/System/Library/Caches/com.apple.dyld/";
394 
395 #endif /* XNU_TARGET_OS_OSX */
396 
397 static char driverkit_scdir_path[] = "/System/DriverKit/System/Library/dyld/";
398 
399 #ifndef SECURE_KERNEL
/*
 * Handler for vm.enforce_shared_cache_dir: toggles shared-cache directory
 * path enforcement (scdir_enforce).  On CONFIG_CSR kernels the setting can
 * only be changed when SIP's filesystem restrictions are disabled.
 */
static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
{
#if CONFIG_CSR
	/* SIP active: refuse (and log) any attempt to flip the knob. */
	if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
		printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
		return EPERM;
	}
#endif /* CONFIG_CSR */
	return sysctl_handle_int(oidp, arg1, arg2, req);
}
410 
411 SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
412 #endif
413 
414 /* These log rate throttling state variables aren't thread safe, but
415  * are sufficient unto the task.
416  */
417 static int64_t last_unnest_log_time = 0;
418 static int shared_region_unnest_log_count = 0;
419 
420 void
log_unnest_badness(vm_map_t m,vm_map_offset_t s,vm_map_offset_t e,boolean_t is_nested_map,vm_map_offset_t lowest_unnestable_addr)421 log_unnest_badness(
422 	vm_map_t        m,
423 	vm_map_offset_t s,
424 	vm_map_offset_t e,
425 	boolean_t       is_nested_map,
426 	vm_map_offset_t lowest_unnestable_addr)
427 {
428 	struct timeval  tv;
429 
430 	if (shared_region_unnest_logging == 0) {
431 		return;
432 	}
433 
434 	if (shared_region_unnest_logging <= 2 &&
435 	    is_nested_map &&
436 	    s >= lowest_unnestable_addr) {
437 		/*
438 		 * Unnesting of writable map entries is fine.
439 		 */
440 		return;
441 	}
442 
443 	if (shared_region_unnest_logging <= 1) {
444 		microtime(&tv);
445 		if ((tv.tv_sec - last_unnest_log_time) <
446 		    vm_shared_region_unnest_log_interval) {
447 			if (shared_region_unnest_log_count++ >
448 			    shared_region_unnest_log_count_threshold) {
449 				return;
450 			}
451 		} else {
452 			last_unnest_log_time = tv.tv_sec;
453 			shared_region_unnest_log_count = 0;
454 		}
455 	}
456 
457 	DTRACE_VM4(log_unnest_badness,
458 	    vm_map_t, m,
459 	    vm_map_offset_t, s,
460 	    vm_map_offset_t, e,
461 	    vm_map_offset_t, lowest_unnestable_addr);
462 	printf("%s[%d] triggered unnest of range 0x%qx->0x%qx of DYLD shared region in VM map %p. While not abnormal for debuggers, this increases system memory footprint until the target exits.\n", current_proc()->p_comm, proc_getpid(current_proc()), (uint64_t)s, (uint64_t)e, (void *) VM_KERNEL_ADDRPERM(m));
463 }
464 
465 int
useracc(user_addr_t addr,user_size_t len,int prot)466 useracc(
467 	user_addr_t     addr,
468 	user_size_t     len,
469 	int     prot)
470 {
471 	vm_map_t        map;
472 
473 	map = current_map();
474 	return vm_map_check_protection(
475 		map,
476 		vm_map_trunc_page(addr,
477 		vm_map_page_mask(map)),
478 		vm_map_round_page(addr + len,
479 		vm_map_page_mask(map)),
480 		prot == B_READ ? VM_PROT_READ : VM_PROT_WRITE);
481 }
482 
483 int
vslock(user_addr_t addr,user_size_t len)484 vslock(
485 	user_addr_t     addr,
486 	user_size_t     len)
487 {
488 	kern_return_t   kret;
489 	vm_map_t        map;
490 
491 	map = current_map();
492 	kret = vm_map_wire_kernel(map,
493 	    vm_map_trunc_page(addr,
494 	    vm_map_page_mask(map)),
495 	    vm_map_round_page(addr + len,
496 	    vm_map_page_mask(map)),
497 	    VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_BSD,
498 	    FALSE);
499 
500 	switch (kret) {
501 	case KERN_SUCCESS:
502 		return 0;
503 	case KERN_INVALID_ADDRESS:
504 	case KERN_NO_SPACE:
505 		return ENOMEM;
506 	case KERN_PROTECTION_FAILURE:
507 		return EACCES;
508 	default:
509 		return EINVAL;
510 	}
511 }
512 
513 int
vsunlock(user_addr_t addr,user_size_t len,__unused int dirtied)514 vsunlock(
515 	user_addr_t addr,
516 	user_size_t len,
517 	__unused int dirtied)
518 {
519 #if FIXME  /* [ */
520 	pmap_t          pmap;
521 	vm_page_t       pg;
522 	vm_map_offset_t vaddr;
523 	ppnum_t         paddr;
524 #endif  /* FIXME ] */
525 	kern_return_t   kret;
526 	vm_map_t        map;
527 
528 	map = current_map();
529 
530 #if FIXME  /* [ */
531 	if (dirtied) {
532 		pmap = get_task_pmap(current_task());
533 		for (vaddr = vm_map_trunc_page(addr, PAGE_MASK);
534 		    vaddr < vm_map_round_page(addr + len, PAGE_MASK);
535 		    vaddr += PAGE_SIZE) {
536 			paddr = pmap_find_phys(pmap, vaddr);
537 			pg = PHYS_TO_VM_PAGE(paddr);
538 			vm_page_set_modified(pg);
539 		}
540 	}
541 #endif  /* FIXME ] */
542 #ifdef  lint
543 	dirtied++;
544 #endif  /* lint */
545 	kret = vm_map_unwire(map,
546 	    vm_map_trunc_page(addr,
547 	    vm_map_page_mask(map)),
548 	    vm_map_round_page(addr + len,
549 	    vm_map_page_mask(map)),
550 	    FALSE);
551 	switch (kret) {
552 	case KERN_SUCCESS:
553 		return 0;
554 	case KERN_INVALID_ADDRESS:
555 	case KERN_NO_SPACE:
556 		return ENOMEM;
557 	case KERN_PROTECTION_FAILURE:
558 		return EACCES;
559 	default:
560 		return EINVAL;
561 	}
562 }
563 
564 int
subyte(user_addr_t addr,int byte)565 subyte(
566 	user_addr_t addr,
567 	int byte)
568 {
569 	char character;
570 
571 	character = (char)byte;
572 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
573 }
574 
575 int
suibyte(user_addr_t addr,int byte)576 suibyte(
577 	user_addr_t addr,
578 	int byte)
579 {
580 	char character;
581 
582 	character = (char)byte;
583 	return copyout((void *)&(character), addr, sizeof(char)) == 0 ? 0 : -1;
584 }
585 
586 int
fubyte(user_addr_t addr)587 fubyte(user_addr_t addr)
588 {
589 	unsigned char byte;
590 
591 	if (copyin(addr, (void *) &byte, sizeof(char))) {
592 		return -1;
593 	}
594 	return byte;
595 }
596 
597 int
fuibyte(user_addr_t addr)598 fuibyte(user_addr_t addr)
599 {
600 	unsigned char byte;
601 
602 	if (copyin(addr, (void *) &(byte), sizeof(char))) {
603 		return -1;
604 	}
605 	return byte;
606 }
607 
608 int
suword(user_addr_t addr,long word)609 suword(
610 	user_addr_t addr,
611 	long word)
612 {
613 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
614 }
615 
616 long
fuword(user_addr_t addr)617 fuword(user_addr_t addr)
618 {
619 	long word = 0;
620 
621 	if (copyin(addr, (void *) &word, sizeof(int))) {
622 		return -1;
623 	}
624 	return word;
625 }
626 
627 /* suiword and fuiword are the same as suword and fuword, respectively */
628 
629 int
suiword(user_addr_t addr,long word)630 suiword(
631 	user_addr_t addr,
632 	long word)
633 {
634 	return copyout((void *) &word, addr, sizeof(int)) == 0 ? 0 : -1;
635 }
636 
637 long
fuiword(user_addr_t addr)638 fuiword(user_addr_t addr)
639 {
640 	long word = 0;
641 
642 	if (copyin(addr, (void *) &word, sizeof(int))) {
643 		return -1;
644 	}
645 	return word;
646 }
647 
648 /*
649  * With a 32-bit kernel and mixed 32/64-bit user tasks, this interface allows the
650  * fetching and setting of process-sized size_t and pointer values.
651  */
652 int
sulong(user_addr_t addr,int64_t word)653 sulong(user_addr_t addr, int64_t word)
654 {
655 	if (IS_64BIT_PROCESS(current_proc())) {
656 		return copyout((void *)&word, addr, sizeof(word)) == 0 ? 0 : -1;
657 	} else {
658 		return suiword(addr, (long)word);
659 	}
660 }
661 
662 int64_t
fulong(user_addr_t addr)663 fulong(user_addr_t addr)
664 {
665 	int64_t longword;
666 
667 	if (IS_64BIT_PROCESS(current_proc())) {
668 		if (copyin(addr, (void *)&longword, sizeof(longword)) != 0) {
669 			return -1;
670 		}
671 		return longword;
672 	} else {
673 		return (int64_t)fuiword(addr);
674 	}
675 }
676 
677 int
suulong(user_addr_t addr,uint64_t uword)678 suulong(user_addr_t addr, uint64_t uword)
679 {
680 	if (IS_64BIT_PROCESS(current_proc())) {
681 		return copyout((void *)&uword, addr, sizeof(uword)) == 0 ? 0 : -1;
682 	} else {
683 		return suiword(addr, (uint32_t)uword);
684 	}
685 }
686 
687 uint64_t
fuulong(user_addr_t addr)688 fuulong(user_addr_t addr)
689 {
690 	uint64_t ulongword;
691 
692 	if (IS_64BIT_PROCESS(current_proc())) {
693 		if (copyin(addr, (void *)&ulongword, sizeof(ulongword)) != 0) {
694 			return -1ULL;
695 		}
696 		return ulongword;
697 	} else {
698 		return (uint64_t)fuiword(addr);
699 	}
700 }
701 
702 int
swapon(__unused proc_t procp,__unused struct swapon_args * uap,__unused int * retval)703 swapon(__unused proc_t procp, __unused struct swapon_args *uap, __unused int *retval)
704 {
705 	return ENOTSUP;
706 }
707 
708 /*
709  * pid_for_task
710  *
711  * Find the BSD process ID for the Mach task associated with the given Mach port
712  * name
713  *
714  * Parameters:	args		User argument descriptor (see below)
715  *
716  * Indirect parameters:	args->t		Mach port name
717  *                      args->pid	Process ID (returned value; see below)
718  *
 * Returns:	KERN_SUCCESS	Success
720  *              KERN_FAILURE	Not success
721  *
722  * Implicit returns: args->pid		Process ID
723  *
724  */
725 kern_return_t
pid_for_task(struct pid_for_task_args * args)726 pid_for_task(
727 	struct pid_for_task_args *args)
728 {
729 	mach_port_name_t        t = args->t;
730 	user_addr_t             pid_addr  = args->pid;
731 	proc_t p;
732 	task_t          t1;
733 	int     pid = -1;
734 	kern_return_t   err = KERN_SUCCESS;
735 
736 	AUDIT_MACH_SYSCALL_ENTER(AUE_PIDFORTASK);
737 	AUDIT_ARG(mach_port1, t);
738 
739 	t1 = port_name_to_task_name(t);
740 
741 	if (t1 == TASK_NULL) {
742 		err = KERN_FAILURE;
743 		goto pftout;
744 	} else {
745 		p = get_bsdtask_info(t1);
746 		if (p) {
747 			pid  = proc_pid(p);
748 			err = KERN_SUCCESS;
749 		} else if (is_corpsetask(t1)) {
750 			pid = task_pid(t1);
751 			err = KERN_SUCCESS;
752 		} else {
753 			err = KERN_FAILURE;
754 		}
755 	}
756 	task_deallocate(t1);
757 pftout:
758 	AUDIT_ARG(pid, pid);
759 	(void) copyout((char *) &pid, pid_addr, sizeof(int));
760 	AUDIT_MACH_SYSCALL_EXIT(err);
761 	return err;
762 }
763 
764 /*
765  *
766  * tfp_policy = KERN_TFP_POLICY_DENY; Deny Mode: None allowed except for self
767  * tfp_policy = KERN_TFP_POLICY_DEFAULT; default mode: all posix checks and upcall via task port for authentication
768  *
769  */
770 static  int tfp_policy = KERN_TFP_POLICY_DEFAULT;
771 
772 /*
773  *	Routine:	task_for_pid_posix_check
774  *	Purpose:
775  *			Verify that the current process should be allowed to
776  *			get the target process's task port. This is only
777  *			permitted if:
778  *			- The current process is root
779  *			OR all of the following are true:
780  *			- The target process's real, effective, and saved uids
781  *			  are the same as the current proc's euid,
782  *			- The target process's group set is a subset of the
783  *			  calling process's group set, and
784  *			- The target process hasn't switched credentials.
785  *
786  *	Returns:	TRUE: permitted
787  *			FALSE: denied
788  */
789 static int
task_for_pid_posix_check(proc_t target)790 task_for_pid_posix_check(proc_t target)
791 {
792 	kauth_cred_t targetcred, mycred;
793 	uid_t myuid;
794 	int allowed;
795 
796 	/* No task_for_pid on bad targets */
797 	if (target->p_stat == SZOMB) {
798 		return FALSE;
799 	}
800 
801 	mycred = kauth_cred_get();
802 	myuid = kauth_cred_getuid(mycred);
803 
804 	/* If we're running as root, the check passes */
805 	if (kauth_cred_issuser(mycred)) {
806 		return TRUE;
807 	}
808 
809 	/* We're allowed to get our own task port */
810 	if (target == current_proc()) {
811 		return TRUE;
812 	}
813 
814 	/*
815 	 * Under DENY, only root can get another proc's task port,
816 	 * so no more checks are needed.
817 	 */
818 	if (tfp_policy == KERN_TFP_POLICY_DENY) {
819 		return FALSE;
820 	}
821 
822 	targetcred = kauth_cred_proc_ref(target);
823 	allowed = TRUE;
824 
825 	/* Do target's ruid, euid, and saved uid match my euid? */
826 	if ((kauth_cred_getuid(targetcred) != myuid) ||
827 	    (kauth_cred_getruid(targetcred) != myuid) ||
828 	    (kauth_cred_getsvuid(targetcred) != myuid)) {
829 		allowed = FALSE;
830 		goto out;
831 	}
832 
833 	/* Are target's groups a subset of my groups? */
834 	if (kauth_cred_gid_subset(targetcred, mycred, &allowed) ||
835 	    allowed == 0) {
836 		allowed = FALSE;
837 		goto out;
838 	}
839 
840 	/* Has target switched credentials? */
841 	if (target->p_flag & P_SUGID) {
842 		allowed = FALSE;
843 		goto out;
844 	}
845 
846 out:
847 	kauth_cred_unref(&targetcred);
848 	return allowed;
849 }
850 
851 /*
852  *	__KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__
853  *
854  *	Description:	Waits for the user space daemon to respond to the request
855  *			we made. Function declared non inline to be visible in
856  *			stackshots and spindumps as well as debugging.
857  */
858 __attribute__((noinline)) int
__KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(mach_port_t task_access_port,int32_t calling_pid,uint32_t calling_gid,int32_t target_pid,mach_task_flavor_t flavor)859 __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(
860 	mach_port_t task_access_port, int32_t calling_pid, uint32_t calling_gid, int32_t target_pid, mach_task_flavor_t flavor)
861 {
862 	return check_task_access_with_flavor(task_access_port, calling_pid, calling_gid, target_pid, flavor);
863 }
864 
865 /*
866  *	Routine:	task_for_pid
867  *	Purpose:
868  *		Get the task port for another "process", named by its
869  *		process ID on the same host as "target_task".
870  *
871  *		Only permitted to privileged processes, or processes
872  *		with the same user ID.
873  *
874  *		Note: if pid == 0, an error is return no matter who is calling.
875  *
876  * XXX This should be a BSD system call, not a Mach trap!!!
877  */
kern_return_t
task_for_pid(
	struct task_for_pid_args *args)
{
	mach_port_name_t        target_tport = args->target_tport;
	int                     pid = args->pid;
	user_addr_t             task_addr = args->t;
	proc_t                  p = PROC_NULL;
	task_t                  t1 = TASK_NULL;       /* caller's task, from target_tport */
	task_t                  task = TASK_NULL;     /* target task, referenced below */
	mach_port_name_t        tret = MACH_PORT_NULL;
	ipc_port_t              tfpport = MACH_PORT_NULL;
	void                    * sright = NULL;
	int                     error = 0;
	boolean_t               is_current_proc = FALSE;
	struct proc_ident       pident = {0};

	AUDIT_MACH_SYSCALL_ENTER(AUE_TASKFORPID);
	AUDIT_ARG(pid, pid);
	AUDIT_ARG(mach_port1, target_tport);

	/* Always check if pid == 0 */
	if (pid == 0) {
		(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
		AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
		return KERN_FAILURE;
	}

	/* Validate the caller-supplied task port; takes a ref on t1. */
	t1 = port_name_to_task(target_tport);
	if (t1 == TASK_NULL) {
		(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
		AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
		return KERN_FAILURE;
	}


	/* Look up the target process (takes a proc ref). */
	p = proc_find(pid);
	if (p == PROC_NULL) {
		error = KERN_FAILURE;
		goto tfpout;
	}
	/* Snapshot identity now; the proc ref is dropped before any upcall. */
	pident = proc_ident(p);
	is_current_proc = (p == current_proc());

#if CONFIG_AUDIT
	AUDIT_ARG(process, p);
#endif

	/* POSIX-level permission check (uid/gid/SUGID); see above. */
	if (!(task_for_pid_posix_check(p))) {
		error = KERN_FAILURE;
		goto tfpout;
	}

	/* No task yet (still being created): succeed with a null port. */
	if (p->task == TASK_NULL) {
		error = KERN_SUCCESS;
		goto tfpout;
	}

	/*
	 * Grab a task reference and drop the proc reference as the proc ref
	 * shouldn't be held across upcalls.
	 */
	task = p->task;
	task_reference(task);

	proc_rele(p);
	p = PROC_NULL;

#if CONFIG_MACF
	/* MAC policy check (e.g. code-signing entitlements). */
	error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_CONTROL);
	if (error) {
		error = KERN_FAILURE;
		goto tfpout;
	}
#endif

	/* If we aren't root and target's task access port is set... */
	if (!kauth_cred_issuser(kauth_cred_get()) &&
	    !is_current_proc &&
	    (task_get_task_access_port(task, &tfpport) == 0) &&
	    (tfpport != IPC_PORT_NULL)) {
		if (tfpport == IPC_PORT_DEAD) {
			error = KERN_PROTECTION_FAILURE;
			goto tfpout;
		}

		/* Call up to the task access server */
		error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
		    proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL);

		if (error != MACH_MSG_SUCCESS) {
			if (error == MACH_RCV_INTERRUPTED) {
				error = KERN_ABORTED;
			} else {
				error = KERN_FAILURE;
			}
			goto tfpout;
		}
	}

	/* Grant task port access */
	extmod_statistics_incr_task_for_pid(task);

	/* this reference will be consumed during conversion */
	task_reference(task);
	if (task == current_task()) {
		/* return pinned self if current_task() so equality check with mach_task_self_ passes */
		sright = (void *)convert_task_to_port_pinned(task);
	} else {
		sright = (void *)convert_task_to_port(task);
	}
	/* extra task ref consumed */

	/*
	 * Check if the task has been corpsified. We must do so after conversion
	 * since we don't hold locks and may have grabbed a corpse control port
	 * above which will prevent no-senders notification delivery.
	 */
	if (is_corpsetask(task)) {
		ipc_port_release_send(sright);
		error = KERN_FAILURE;
		goto tfpout;
	}

	/* Move the send right into the caller's IPC space. */
	tret = ipc_port_copyout_send(
		sright,
		get_task_ipcspace(current_task()));

	error = KERN_SUCCESS;

tfpout:
	/* Common exit: release refs taken above and copy out the port name. */
	task_deallocate(t1);
	AUDIT_ARG(mach_port2, tret);
	(void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t));

	if (tfpport != IPC_PORT_NULL) {
		ipc_port_release_send(tfpport);
	}
	if (task != TASK_NULL) {
		task_deallocate(task);
	}
	if (p != PROC_NULL) {
		proc_rele(p);
	}
	AUDIT_MACH_SYSCALL_EXIT(error);
	return error;
}
1025 
1026 /*
1027  *	Routine:	task_name_for_pid
1028  *	Purpose:
1029  *		Get the task name port for another "process", named by its
1030  *		process ID on the same host as "target_task".
1031  *
1032  *		Only permitted to privileged processes, or processes
1033  *		with the same user ID.
1034  *
1035  * XXX This should be a BSD system call, not a Mach trap!!!
1036  */
1037 
kern_return_t
task_name_for_pid(
	struct task_name_for_pid_args *args)
{
	mach_port_name_t        target_tport = args->target_tport;
	int                     pid = args->pid;
	user_addr_t             task_addr = args->t;       /* user buffer for the returned port name */
	proc_t                  p = PROC_NULL;
	task_t                  t1 = TASK_NULL;
	mach_port_name_t        tret = MACH_PORT_NULL;
	void * sright;
	int error = 0, refheld = 0;                         /* refheld: target_cred reference taken */
	kauth_cred_t target_cred;

	AUDIT_MACH_SYSCALL_ENTER(AUE_TASKNAMEFORPID);
	AUDIT_ARG(pid, pid);
	AUDIT_ARG(mach_port1, target_tport);

	/* Validate the caller-supplied task port before doing anything else. */
	t1 = port_name_to_task(target_tport);
	if (t1 == TASK_NULL) {
		/* Report MACH_PORT_NULL to userspace; copyout failure is ignored. */
		(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
		AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
		return KERN_FAILURE;
	}

	p = proc_find(pid);
	if (p != PROC_NULL) {
		AUDIT_ARG(process, p);
		target_cred = kauth_cred_proc_ref(p);
		refheld = 1;

		/*
		 * Permitted when the target is not a zombie AND the caller is
		 * the target itself, is the superuser, or shares both the
		 * effective and real uid with the target.
		 */
		if ((p->p_stat != SZOMB)
		    && ((current_proc() == p)
		    || kauth_cred_issuser(kauth_cred_get())
		    || ((kauth_cred_getuid(target_cred) == kauth_cred_getuid(kauth_cred_get())) &&
		    ((kauth_cred_getruid(target_cred) == kauth_getruid()))))) {
			if (p->task != TASK_NULL) {
				struct proc_ident pident = proc_ident(p);

				task_t task = p->task;

				/*
				 * Take a task reference and drop the proc ref
				 * before calling out to MACF / IPC.
				 */
				task_reference(p->task);
				proc_rele(p);
				p = PROC_NULL;
#if CONFIG_MACF
				error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_NAME);
				if (error) {
					task_deallocate(task);
					goto noperm;
				}
#endif
				/* Conversion consumes the task reference taken above. */
				sright = (void *)convert_task_name_to_port(task);
				task = NULL;
				tret = ipc_port_copyout_send(sright,
				    get_task_ipcspace(current_task()));
			} else {
				tret  = MACH_PORT_NULL;
			}

			AUDIT_ARG(mach_port2, tret);
			(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
			task_deallocate(t1);
			error = KERN_SUCCESS;
			goto tnfpout;
		}
	}

	/* Failure path: hand MACH_PORT_NULL back to the caller. */
#if CONFIG_MACF
noperm:
#endif
	task_deallocate(t1);
	tret = MACH_PORT_NULL;
	(void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t));
	error = KERN_FAILURE;
tnfpout:
	if (refheld != 0) {
		kauth_cred_unref(&target_cred);
	}
	if (p != PROC_NULL) {
		proc_rele(p);
	}
	AUDIT_MACH_SYSCALL_EXIT(error);
	return error;
}
1122 
1123 /*
1124  *	Routine:	task_inspect_for_pid
1125  *	Purpose:
1126  *		Get the task inspect port for another "process", named by its
1127  *		process ID on the same host as "target_task".
1128  */
int
task_inspect_for_pid(struct proc *p __unused, struct task_inspect_for_pid_args *args, int *ret)
{
	mach_port_name_t        target_tport = args->target_tport;
	int                     pid = args->pid;
	user_addr_t             task_addr = args->t;       /* user buffer for the returned port name */

	proc_t                  proc = PROC_NULL;
	task_t                  t1 = TASK_NULL;
	task_inspect_t          task_insp = TASK_INSPECT_NULL;
	mach_port_name_t        tret = MACH_PORT_NULL;
	ipc_port_t              tfpport = MACH_PORT_NULL;
	int                     error = 0;
	void                    *sright = NULL;
	boolean_t               is_current_proc = FALSE;
	struct proc_ident       pident = {0};

	/* Disallow inspect port for kernel_task */
	if (pid == 0) {
		(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
		return EPERM;
	}

	/* Validate the caller-supplied task port. */
	t1 = port_name_to_task(target_tport);
	if (t1 == TASK_NULL) {
		(void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t));
		return EINVAL;
	}

	proc = proc_find(pid);
	if (proc == PROC_NULL) {
		error = ESRCH;
		goto tifpout;
	}
	/* Capture identity and is-self before the proc ref is dropped below. */
	pident = proc_ident(proc);
	is_current_proc = (proc == current_proc());

	if (!(task_for_pid_posix_check(proc))) {
		error = EPERM;
		goto tifpout;
	}

	task_insp = proc->task;
	if (task_insp == TASK_INSPECT_NULL) {
		/* No task: falls through with error == 0 and tret == MACH_PORT_NULL. */
		goto tifpout;
	}

	/*
	 * Grab a task reference and drop the proc reference before making any upcalls.
	 */
	task_reference(task_insp);

	proc_rele(proc);
	proc = PROC_NULL;

#if CONFIG_MACF
	error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_INSPECT);
	if (error) {
		error = EPERM;
		goto tifpout;
	}
#endif

	/* If we aren't root and target's task access port is set... */
	if (!kauth_cred_issuser(kauth_cred_get()) &&
	    !is_current_proc &&
	    (task_get_task_access_port(task_insp, &tfpport) == 0) &&
	    (tfpport != IPC_PORT_NULL)) {
		if (tfpport == IPC_PORT_DEAD) {
			error = EACCES;
			goto tifpout;
		}


		/* Call up to the task access server */
		error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
		    proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_INSPECT);

		if (error != MACH_MSG_SUCCESS) {
			if (error == MACH_RCV_INTERRUPTED) {
				error = EINTR;
			} else {
				error = EPERM;
			}
			goto tifpout;
		}
	}

	/* Check if the task has been corpsified */
	if (is_corpsetask(task_insp)) {
		error = EACCES;
		goto tifpout;
	}

	/* could be IP_NULL, consumes a ref */
	sright = (void*) convert_task_inspect_to_port(task_insp);
	task_insp = TASK_INSPECT_NULL;
	tret = ipc_port_copyout_send(sright, get_task_ipcspace(current_task()));

tifpout:
	/* Common exit: always release t1 and report tret (possibly NULL) to user. */
	task_deallocate(t1);
	(void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t));
	if (proc != PROC_NULL) {
		proc_rele(proc);
	}
	if (tfpport != IPC_PORT_NULL) {
		ipc_port_release_send(tfpport);
	}
	if (task_insp != TASK_INSPECT_NULL) {
		/* Conversion never happened; drop the reference taken above. */
		task_deallocate(task_insp);
	}

	*ret = error;
	return error;
}
1244 
1245 /*
1246  *	Routine:	task_read_for_pid
1247  *	Purpose:
1248  *		Get the task read port for another "process", named by its
1249  *		process ID on the same host as "target_task".
1250  */
1251 int
task_read_for_pid(struct proc * p __unused,struct task_read_for_pid_args * args,int * ret)1252 task_read_for_pid(struct proc *p __unused, struct task_read_for_pid_args *args, int *ret)
1253 {
1254 	mach_port_name_t        target_tport = args->target_tport;
1255 	int                     pid = args->pid;
1256 	user_addr_t             task_addr = args->t;
1257 
1258 	proc_t                  proc = PROC_NULL;
1259 	task_t                  t1 = TASK_NULL;
1260 	task_read_t             task_read = TASK_READ_NULL;
1261 	mach_port_name_t        tret = MACH_PORT_NULL;
1262 	ipc_port_t              tfpport = MACH_PORT_NULL;
1263 	int                     error = 0;
1264 	void                    *sright = NULL;
1265 	boolean_t               is_current_proc = FALSE;
1266 	struct proc_ident       pident = {0};
1267 
1268 	/* Disallow read port for kernel_task */
1269 	if (pid == 0) {
1270 		(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
1271 		return EPERM;
1272 	}
1273 
1274 	t1 = port_name_to_task(target_tport);
1275 	if (t1 == TASK_NULL) {
1276 		(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
1277 		return EINVAL;
1278 	}
1279 
1280 	proc = proc_find(pid);
1281 	if (proc == PROC_NULL) {
1282 		error = ESRCH;
1283 		goto trfpout;
1284 	}
1285 	pident = proc_ident(proc);
1286 	is_current_proc = (proc == current_proc());
1287 
1288 	if (!(task_for_pid_posix_check(proc))) {
1289 		error = EPERM;
1290 		goto trfpout;
1291 	}
1292 
1293 	task_read = proc->task;
1294 	if (task_read == TASK_INSPECT_NULL) {
1295 		goto trfpout;
1296 	}
1297 
1298 	/*
1299 	 * Grab a task reference and drop the proc reference before making any upcalls.
1300 	 */
1301 	task_reference(task_read);
1302 
1303 	proc_rele(proc);
1304 	proc = PROC_NULL;
1305 
1306 #if CONFIG_MACF
1307 	error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_READ);
1308 	if (error) {
1309 		error = EPERM;
1310 		goto trfpout;
1311 	}
1312 #endif
1313 
1314 	/* If we aren't root and target's task access port is set... */
1315 	if (!kauth_cred_issuser(kauth_cred_get()) &&
1316 	    !is_current_proc &&
1317 	    (task_get_task_access_port(task_read, &tfpport) == 0) &&
1318 	    (tfpport != IPC_PORT_NULL)) {
1319 		if (tfpport == IPC_PORT_DEAD) {
1320 			error = EACCES;
1321 			goto trfpout;
1322 		}
1323 
1324 
1325 		/* Call up to the task access server */
1326 		error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
1327 		    proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_READ);
1328 
1329 		if (error != MACH_MSG_SUCCESS) {
1330 			if (error == MACH_RCV_INTERRUPTED) {
1331 				error = EINTR;
1332 			} else {
1333 				error = EPERM;
1334 			}
1335 			goto trfpout;
1336 		}
1337 	}
1338 
1339 	/* Check if the task has been corpsified */
1340 	if (is_corpsetask(task_read)) {
1341 		error = EACCES;
1342 		goto trfpout;
1343 	}
1344 
1345 	/* could be IP_NULL, consumes a ref */
1346 	sright = (void*) convert_task_read_to_port(task_read);
1347 	task_read = TASK_READ_NULL;
1348 	tret = ipc_port_copyout_send(sright, get_task_ipcspace(current_task()));
1349 
1350 trfpout:
1351 	task_deallocate(t1);
1352 	(void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t));
1353 	if (proc != PROC_NULL) {
1354 		proc_rele(proc);
1355 	}
1356 	if (tfpport != IPC_PORT_NULL) {
1357 		ipc_port_release_send(tfpport);
1358 	}
1359 	if (task_read != TASK_READ_NULL) {
1360 		task_deallocate(task_read);
1361 	}
1362 
1363 	*ret = error;
1364 	return error;
1365 }
1366 
/*
 * pid_suspend: suspend the task of the process named by args->pid,
 * subject to tfp posix checks / the suspend-resume entitlement, MACF,
 * and (on macOS) the task access server. Returns 0 or an errno, also
 * stored through "ret".
 */
kern_return_t
pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret)
{
	task_t  target = NULL;
	proc_t  targetproc = PROC_NULL;
	int     pid = args->pid;
	int     error = 0;
	mach_port_t tfpport = MACH_PORT_NULL;

	/* Never allow suspension of the kernel (pid 0). */
	if (pid == 0) {
		error = EPERM;
		goto out;
	}

	targetproc = proc_find(pid);
	if (targetproc == PROC_NULL) {
		error = ESRCH;
		goto out;
	}

	/*
	 * Caller must pass the task_for_pid-style posix checks or hold
	 * the process suspend/resume entitlement.
	 */
	if (!task_for_pid_posix_check(targetproc) &&
	    !IOCurrentTaskHasEntitlement(PROCESS_RESUME_SUSPEND_ENTITLEMENT)) {
		error = EPERM;
		goto out;
	}

#if CONFIG_MACF
	error = mac_proc_check_suspend_resume(targetproc, MAC_PROC_CHECK_SUSPEND);
	if (error) {
		error = EPERM;
		goto out;
	}
#endif

	target = targetproc->task;
#if XNU_TARGET_OS_OSX
	if (target != TASK_NULL) {
		/* If we aren't root and target's task access port is set... */
		if (!kauth_cred_issuser(kauth_cred_get()) &&
		    targetproc != current_proc() &&
		    (task_get_task_access_port(target, &tfpport) == 0) &&
		    (tfpport != IPC_PORT_NULL)) {
			if (tfpport == IPC_PORT_DEAD) {
				error = EACCES;
				goto out;
			}

			/* Call up to the task access server */
			error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
			    proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL);

			if (error != MACH_MSG_SUCCESS) {
				if (error == MACH_RCV_INTERRUPTED) {
					error = EINTR;
				} else {
					error = EPERM;
				}
				goto out;
			}
		}
	}
#endif /* XNU_TARGET_OS_OSX */

	/*
	 * NOTE(review): target is assumed non-NULL here (proc_find()'d procs
	 * are expected to carry a task on this path) — confirm.
	 */
	task_reference(target);
	error = task_pidsuspend(target);
	if (error) {
		if (error == KERN_INVALID_ARGUMENT) {
			error = EINVAL;
		} else {
			error = EPERM;
		}
	}
#if CONFIG_MEMORYSTATUS
	else {
		/* Tell memorystatus/jetsam the process is now suspended. */
		memorystatus_on_suspend(targetproc);
	}
#endif

	task_deallocate(target);

out:
	if (tfpport != IPC_PORT_NULL) {
		ipc_port_release_send(tfpport);
	}

	if (targetproc != PROC_NULL) {
		proc_rele(targetproc);
	}
	*ret = error;
	return error;
}
1458 
/*
 * debug_control_port_for_pid: return the debug control port of the task
 * of the process named by args->pid. Security checks (MACF and the task
 * access server upcall) are skipped when the caller holds
 * DEBUG_PORT_ENTITLEMENT. The resulting port name is copied out to
 * args->t (MACH_PORT_NULL on failure).
 */
kern_return_t
debug_control_port_for_pid(struct debug_control_port_for_pid_args *args)
{
	mach_port_name_t        target_tport = args->target_tport;
	int                     pid = args->pid;
	user_addr_t             task_addr = args->t;
	proc_t                  p = PROC_NULL;
	task_t                  t1 = TASK_NULL;
	task_t                  task = TASK_NULL;
	mach_port_name_t        tret = MACH_PORT_NULL;
	ipc_port_t              tfpport = MACH_PORT_NULL;
	ipc_port_t              sright = NULL;
	int                     error = 0;
	boolean_t               is_current_proc = FALSE;
	struct proc_ident       pident = {0};

	AUDIT_MACH_SYSCALL_ENTER(AUE_DBGPORTFORPID);
	AUDIT_ARG(pid, pid);
	AUDIT_ARG(mach_port1, target_tport);

	/* Always check if pid == 0 */
	if (pid == 0) {
		(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
		AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
		return KERN_FAILURE;
	}

	/* Validate the caller-supplied task port. */
	t1 = port_name_to_task(target_tport);
	if (t1 == TASK_NULL) {
		(void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t));
		AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE);
		return KERN_FAILURE;
	}

	p = proc_find(pid);
	if (p == PROC_NULL) {
		error = KERN_FAILURE;
		goto tfpout;
	}
	/* Capture identity and is-self before the proc ref is dropped below. */
	pident = proc_ident(p);
	is_current_proc = (p == current_proc());

#if CONFIG_AUDIT
	AUDIT_ARG(process, p);
#endif

	if (!(task_for_pid_posix_check(p))) {
		error = KERN_FAILURE;
		goto tfpout;
	}

	if (p->task == TASK_NULL) {
		/* No task: returns success with tret == MACH_PORT_NULL. */
		error = KERN_SUCCESS;
		goto tfpout;
	}

	/*
	 * Grab a task reference and drop the proc reference before making any upcalls.
	 */
	task = p->task;
	task_reference(task);

	proc_rele(p);
	p = PROC_NULL;

	/* Entitled callers bypass MACF and the task access server. */
	if (!IOCurrentTaskHasEntitlement(DEBUG_PORT_ENTITLEMENT)) {
#if CONFIG_MACF
		error = mac_proc_check_get_task(kauth_cred_get(), &pident, TASK_FLAVOR_CONTROL);
		if (error) {
			error = KERN_FAILURE;
			goto tfpout;
		}
#endif

		/* If we aren't root and target's task access port is set... */
		if (!kauth_cred_issuser(kauth_cred_get()) &&
		    !is_current_proc &&
		    (task_get_task_access_port(task, &tfpport) == 0) &&
		    (tfpport != IPC_PORT_NULL)) {
			if (tfpport == IPC_PORT_DEAD) {
				error = KERN_PROTECTION_FAILURE;
				goto tfpout;
			}


			/* Call up to the task access server */
			error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
			    proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL);

			if (error != MACH_MSG_SUCCESS) {
				if (error == MACH_RCV_INTERRUPTED) {
					error = KERN_ABORTED;
				} else {
					error = KERN_FAILURE;
				}
				goto tfpout;
			}
		}
	}

	/* Check if the task has been corpsified */
	if (is_corpsetask(task)) {
		error = KERN_FAILURE;
		goto tfpout;
	}

	error = task_get_debug_control_port(task, &sright);
	if (error != KERN_SUCCESS) {
		goto tfpout;
	}

	tret = ipc_port_copyout_send(
		sright,
		get_task_ipcspace(current_task()));

	error = KERN_SUCCESS;

tfpout:
	/* Common exit: always release t1 and report tret (possibly NULL) to user. */
	task_deallocate(t1);
	AUDIT_ARG(mach_port2, tret);
	(void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t));

	if (tfpport != IPC_PORT_NULL) {
		ipc_port_release_send(tfpport);
	}
	if (task != TASK_NULL) {
		task_deallocate(task);
	}
	if (p != PROC_NULL) {
		proc_rele(p);
	}
	AUDIT_MACH_SYSCALL_EXIT(error);
	return error;
}
1593 
/*
 * pid_resume: resume the (previously pid_suspend'ed) task of the process
 * named by args->pid, subject to the same checks as pid_suspend.
 * Returns 0 or an errno, also stored through "ret".
 */
kern_return_t
pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret)
{
	task_t  target = NULL;
	proc_t  targetproc = PROC_NULL;
	int     pid = args->pid;
	int     error = 0;
	mach_port_t tfpport = MACH_PORT_NULL;

	/* Never allow this on the kernel (pid 0). */
	if (pid == 0) {
		error = EPERM;
		goto out;
	}

	targetproc = proc_find(pid);
	if (targetproc == PROC_NULL) {
		error = ESRCH;
		goto out;
	}

	/*
	 * Caller must pass the task_for_pid-style posix checks or hold
	 * the process suspend/resume entitlement.
	 */
	if (!task_for_pid_posix_check(targetproc) &&
	    !IOCurrentTaskHasEntitlement(PROCESS_RESUME_SUSPEND_ENTITLEMENT)) {
		error = EPERM;
		goto out;
	}

#if CONFIG_MACF
	error = mac_proc_check_suspend_resume(targetproc, MAC_PROC_CHECK_RESUME);
	if (error) {
		error = EPERM;
		goto out;
	}
#endif

	target = targetproc->task;
#if XNU_TARGET_OS_OSX
	if (target != TASK_NULL) {
		/* If we aren't root and target's task access port is set... */
		if (!kauth_cred_issuser(kauth_cred_get()) &&
		    targetproc != current_proc() &&
		    (task_get_task_access_port(target, &tfpport) == 0) &&
		    (tfpport != IPC_PORT_NULL)) {
			if (tfpport == IPC_PORT_DEAD) {
				error = EACCES;
				goto out;
			}

			/* Call up to the task access server */
			error = __KERNEL_WAITING_ON_TASKGATED_CHECK_ACCESS_UPCALL__(tfpport,
			    proc_selfpid(), kauth_getgid(), pid, TASK_FLAVOR_CONTROL);

			if (error != MACH_MSG_SUCCESS) {
				if (error == MACH_RCV_INTERRUPTED) {
					error = EINTR;
				} else {
					error = EPERM;
				}
				goto out;
			}
		}
	}
#endif /* XNU_TARGET_OS_OSX */

#if !XNU_TARGET_OS_OSX
#if SOCKETS
	/* Un-defunct the process's sockets before resuming it. */
	resume_proc_sockets(targetproc);
#endif /* SOCKETS */
#endif /* !XNU_TARGET_OS_OSX */

	task_reference(target);

#if CONFIG_MEMORYSTATUS
	/* Tell memorystatus/jetsam the process is becoming runnable again. */
	memorystatus_on_resume(targetproc);
#endif

	error = task_pidresume(target);
	if (error) {
		if (error == KERN_INVALID_ARGUMENT) {
			error = EINVAL;
		} else {
			if (error == KERN_MEMORY_ERROR) {
				/* Resume failed for lack of memory: kill the target. */
				psignal(targetproc, SIGKILL);
				error = EIO;
			} else {
				error = EPERM;
			}
		}
	}

	task_deallocate(target);

out:
	if (tfpport != IPC_PORT_NULL) {
		ipc_port_release_send(tfpport);
	}

	if (targetproc != PROC_NULL) {
		proc_rele(targetproc);
	}

	*ret = error;
	return error;
}
1697 
1698 #if !XNU_TARGET_OS_OSX
1699 /*
1700  * Freeze the specified process (provided in args->pid), or find and freeze a PID.
1701  * When a process is specified, this call is blocking, otherwise we wake up the
1702  * freezer thread and do not block on a process being frozen.
1703  */
kern_return_t
pid_hibernate(struct proc *p __unused, struct pid_hibernate_args *args, int *ret)
{
	int     error = 0;
	proc_t  targetproc = PROC_NULL;
	int     pid = args->pid;

#ifndef CONFIG_FREEZE
	#pragma unused(pid)
#else

	/*
	 * If a pid has been provided, we obtain the process handle and call task_for_pid_posix_check().
	 */

	if (pid >= 0) {
		targetproc = proc_find(pid);

		if (targetproc == PROC_NULL) {
			error = ESRCH;
			goto out;
		}

		if (!task_for_pid_posix_check(targetproc)) {
			error = EPERM;
			goto out;
		}
	}

#if CONFIG_MACF
	//Note that targetproc may be null
	error = mac_proc_check_suspend_resume(targetproc, MAC_PROC_CHECK_HIBERNATE);
	if (error) {
		error = EPERM;
		goto out;
	}
#endif

	/*
	 * Negative pids select a global operation rather than a single
	 * process: -2 pages out anonymous memory, -1 wakes the freezer
	 * thread (non-blocking); a real pid is frozen synchronously.
	 */
	if (pid == -2) {
		vm_pageout_anonymous_pages();
	} else if (pid == -1) {
		memorystatus_on_inactivity(targetproc);
	} else {
		error = memorystatus_freeze_process_sync(targetproc);
	}

out:

#endif /* CONFIG_FREEZE */

	if (targetproc != PROC_NULL) {
		proc_rele(targetproc);
	}
	*ret = error;
	return error;
}
1760 #endif /* !XNU_TARGET_OS_OSX */
1761 
1762 #if SOCKETS
1763 int
networking_memstatus_callout(proc_t p,uint32_t status)1764 networking_memstatus_callout(proc_t p, uint32_t status)
1765 {
1766 	struct fileproc *fp;
1767 
1768 	/*
1769 	 * proc list lock NOT held
1770 	 * proc lock NOT held
1771 	 * a reference on the proc has been held / shall be dropped by the caller.
1772 	 */
1773 	LCK_MTX_ASSERT(&proc_list_mlock, LCK_MTX_ASSERT_NOTOWNED);
1774 	LCK_MTX_ASSERT(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);
1775 
1776 	proc_fdlock(p);
1777 
1778 	fdt_foreach(fp, p) {
1779 		switch (FILEGLOB_DTYPE(fp->fp_glob)) {
1780 #if NECP
1781 		case DTYPE_NETPOLICY:
1782 			necp_fd_memstatus(p, status,
1783 			    (struct necp_fd_data *)fp_get_data(fp));
1784 			break;
1785 #endif /* NECP */
1786 #if SKYWALK
1787 		case DTYPE_CHANNEL:
1788 			kern_channel_memstatus(p, status,
1789 			    (struct kern_channel *)fp_get_data(fp));
1790 			break;
1791 #endif /* SKYWALK */
1792 		default:
1793 			break;
1794 		}
1795 	}
1796 	proc_fdunlock(p);
1797 
1798 	return 1;
1799 }
1800 
1801 #if SKYWALK
1802 /*
1803  * Since we make multiple passes across the fileproc array, record the
1804  * first MAX_CHANNELS channel handles found.  MAX_CHANNELS should be
 * large enough to accommodate most, if not all cases.  If we find more,
1806  * we'll go to the slow path during second pass.
1807  */
1808 #define MAX_CHANNELS    8       /* should be more than enough */
1809 #endif /* SKYWALK */
1810 
/*
 * proc_iterate() callback used by pid_shutdown_sockets(): defuncts the
 * sockets, NECP fds and Skywalk channels of "p" that match the pid in
 * the pid_shutdown_sockets_args passed through "arg".
 */
static int
networking_defunct_callout(proc_t p, void *arg)
{
	struct pid_shutdown_sockets_args *args = arg;
	int pid = args->pid;
	int level = args->level;
	struct fileproc *fp;
#if SKYWALK
	int i;
	int channel_count = 0;
	struct kern_channel *channel_array[MAX_CHANNELS];

	bzero(&channel_array, sizeof(channel_array));
#endif /* SKYWALK */

	proc_fdlock(p);

	fdt_foreach(fp, p) {
		struct fileglob *fg = fp->fp_glob;

		switch (FILEGLOB_DTYPE(fg)) {
		case DTYPE_SOCKET: {
			struct socket *so = (struct socket *)fg_get_data(fg);
			/*
			 * Match if the socket belongs to the target pid, was
			 * last used by it, or is delegated on its behalf.
			 */
			if (proc_getpid(p) == pid || so->last_pid == pid ||
			    ((so->so_flags & SOF_DELEGATED) && so->e_pid == pid)) {
				/* Call networking stack with socket and level */
				(void)socket_defunct(p, so, level);
			}
			break;
		}
#if NECP
		case DTYPE_NETPOLICY:
			/* first pass: defunct necp and get stats for ntstat */
			if (proc_getpid(p) == pid) {
				necp_fd_defunct(p,
				    (struct necp_fd_data *)fg_get_data(fg));
			}
			break;
#endif /* NECP */
#if SKYWALK
		case DTYPE_CHANNEL:
			/* first pass: get channels and total count */
			if (proc_getpid(p) == pid) {
				/* Record up to MAX_CHANNELS; count all for the slow path. */
				if (channel_count < MAX_CHANNELS) {
					channel_array[channel_count] =
					    (struct kern_channel *)fg_get_data(fg);
				}
				++channel_count;
			}
			break;
#endif /* SKYWALK */
		default:
			break;
		}
	}

#if SKYWALK
	/*
	 * Second pass: defunct channels/flows (after NECP).  Handle
	 * the common case of up to MAX_CHANNELS count with fast path,
	 * and traverse the fileproc array again only if we exceed it.
	 */
	if (channel_count != 0 && channel_count <= MAX_CHANNELS) {
		ASSERT(proc_getpid(p) == pid);
		for (i = 0; i < channel_count; i++) {
			ASSERT(channel_array[i] != NULL);
			kern_channel_defunct(p, channel_array[i]);
		}
	} else if (channel_count != 0) {
		/* Slow path: more channels than we could record; rescan the fd table. */
		ASSERT(proc_getpid(p) == pid);
		fdt_foreach(fp, p) {
			struct fileglob *fg = fp->fp_glob;

			if (FILEGLOB_DTYPE(fg) == DTYPE_CHANNEL) {
				kern_channel_defunct(p,
				    (struct kern_channel *)fg_get_data(fg));
			}
		}
	}
#endif /* SKYWALK */
	proc_fdunlock(p);

	return PROC_RETURNED;
}
1895 
1896 int
pid_shutdown_sockets(struct proc * p __unused,struct pid_shutdown_sockets_args * args,int * ret)1897 pid_shutdown_sockets(struct proc *p __unused, struct pid_shutdown_sockets_args *args, int *ret)
1898 {
1899 	int                             error = 0;
1900 	proc_t                          targetproc = PROC_NULL;
1901 	int                             pid = args->pid;
1902 	int                             level = args->level;
1903 
1904 	if (level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC &&
1905 	    level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL) {
1906 		error = EINVAL;
1907 		goto out;
1908 	}
1909 
1910 	targetproc = proc_find(pid);
1911 	if (targetproc == PROC_NULL) {
1912 		error = ESRCH;
1913 		goto out;
1914 	}
1915 
1916 	if (!task_for_pid_posix_check(targetproc) &&
1917 	    !IOCurrentTaskHasEntitlement(PROCESS_RESUME_SUSPEND_ENTITLEMENT)) {
1918 		error = EPERM;
1919 		goto out;
1920 	}
1921 
1922 #if CONFIG_MACF
1923 	error = mac_proc_check_suspend_resume(targetproc, MAC_PROC_CHECK_SHUTDOWN_SOCKETS);
1924 	if (error) {
1925 		error = EPERM;
1926 		goto out;
1927 	}
1928 #endif
1929 
1930 	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
1931 	    networking_defunct_callout, args, NULL, NULL);
1932 
1933 out:
1934 	if (targetproc != PROC_NULL) {
1935 		proc_rele(targetproc);
1936 	}
1937 	*ret = error;
1938 	return error;
1939 }
1940 
1941 #endif /* SOCKETS */
1942 
1943 static int
sysctl_settfp_policy(__unused struct sysctl_oid * oidp,void * arg1,__unused int arg2,struct sysctl_req * req)1944 sysctl_settfp_policy(__unused struct sysctl_oid *oidp, void *arg1,
1945     __unused int arg2, struct sysctl_req *req)
1946 {
1947 	int error = 0;
1948 	int new_value;
1949 
1950 	error = SYSCTL_OUT(req, arg1, sizeof(int));
1951 	if (error || req->newptr == USER_ADDR_NULL) {
1952 		return error;
1953 	}
1954 
1955 	if (!kauth_cred_issuser(kauth_cred_get())) {
1956 		return EPERM;
1957 	}
1958 
1959 	if ((error = SYSCTL_IN(req, &new_value, sizeof(int)))) {
1960 		goto out;
1961 	}
1962 	if ((new_value == KERN_TFP_POLICY_DENY)
1963 	    || (new_value == KERN_TFP_POLICY_DEFAULT)) {
1964 		tfp_policy = new_value;
1965 	} else {
1966 		error = EINVAL;
1967 	}
1968 out:
1969 	return error;
1970 }
1971 
/* Compile-time flag: 1 when built with the SECURE_KERNEL option. */
#if defined(SECURE_KERNEL)
static int kern_secure_kernel = 1;
#else
static int kern_secure_kernel = 0;
#endif

/* kern.secure_kernel: read-only; reflects the SECURE_KERNEL build option. */
SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");

/* kern.tfp.policy: task_for_pid policy, settable via sysctl_settfp_policy(). */
SYSCTL_NODE(_kern, KERN_TFP, tfp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "tfp");
SYSCTL_PROC(_kern_tfp, KERN_TFP_POLICY, policy, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &tfp_policy, sizeof(uint32_t), &sysctl_settfp_policy, "I", "policy");

/* vm.shared_region_*: shared-region tracing level, version, and persistence knobs. */
SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_trace_level, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED,
    &shared_region_version, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED,
    &shared_region_persistence, 0, "");
1990 
1991 /*
1992  * shared_region_check_np:
1993  *
1994  * This system call is intended for dyld.
1995  *
1996  * dyld calls this when any process starts to see if the process's shared
1997  * region is already set up and ready to use.
1998  * This call returns the base address of the first mapping in the
1999  * process's shared region's first mapping.
2000  * dyld will then check what's mapped at that address.
2001  *
2002  * If the shared region is empty, dyld will then attempt to map the shared
2003  * cache file in the shared region via the shared_region_map_np() system call.
2004  *
2005  * If something's already mapped in the shared region, dyld will check if it
2006  * matches the shared cache it would like to use for that process.
2007  * If it matches, evrything's ready and the process can proceed and use the
2008  * shared region.
2009  * If it doesn't match, dyld will unmap the shared region and map the shared
2010  * cache into the process's address space via mmap().
2011  *
2012  * A NULL pointer argument can be used by dyld to indicate it has unmapped
2013  * the shared region. We will remove the shared_region reference from the task.
2014  *
2015  * ERROR VALUES
2016  * EINVAL	no shared region
2017  * ENOMEM	shared region is empty
2018  * EFAULT	bad address for "start_address"
2019  */
2020 int
shared_region_check_np(__unused struct proc * p,struct shared_region_check_np_args * uap,__unused int * retvalp)2021 shared_region_check_np(
2022 	__unused struct proc                    *p,
2023 	struct shared_region_check_np_args      *uap,
2024 	__unused int                            *retvalp)
2025 {
2026 	vm_shared_region_t      shared_region;
2027 	mach_vm_offset_t        start_address = 0;
2028 	int                     error = 0;
2029 	kern_return_t           kr;
2030 	task_t                  task = current_task();
2031 
2032 	SHARED_REGION_TRACE_DEBUG(
2033 		("shared_region: %p [%d(%s)] -> check_np(0x%llx)\n",
2034 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
2035 		proc_getpid(p), p->p_comm,
2036 		(uint64_t)uap->start_address));
2037 
2038 	/* retrieve the current tasks's shared region */
2039 	shared_region = vm_shared_region_get(task);
2040 	if (shared_region != NULL) {
2041 		/*
2042 		 * A NULL argument is used by dyld to indicate the task
2043 		 * has unmapped its shared region.
2044 		 */
2045 		if (uap->start_address == 0) {
2046 			vm_shared_region_set(task, NULL);
2047 		} else {
2048 			/* retrieve address of its first mapping... */
2049 			kr = vm_shared_region_start_address(shared_region, &start_address, task);
2050 			if (kr != KERN_SUCCESS) {
2051 				SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
2052 				    "check_np(0x%llx) "
2053 				    "vm_shared_region_start_address() failed\n",
2054 				    (void *)VM_KERNEL_ADDRPERM(current_thread()),
2055 				    proc_getpid(p), p->p_comm,
2056 				    (uint64_t)uap->start_address));
2057 				error = ENOMEM;
2058 			} else {
2059 #if __has_feature(ptrauth_calls)
2060 				/*
2061 				 * Remap any section of the shared library that
2062 				 * has authenticated pointers into private memory.
2063 				 */
2064 				if (vm_shared_region_auth_remap(shared_region) != KERN_SUCCESS) {
2065 					SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] "
2066 					    "check_np(0x%llx) "
2067 					    "vm_shared_region_auth_remap() failed\n",
2068 					    (void *)VM_KERNEL_ADDRPERM(current_thread()),
2069 					    proc_getpid(p), p->p_comm,
2070 					    (uint64_t)uap->start_address));
2071 					error = ENOMEM;
2072 				}
2073 #endif /* __has_feature(ptrauth_calls) */
2074 
2075 				/* ... and give it to the caller */
2076 				if (error == 0) {
2077 					error = copyout(&start_address,
2078 					    (user_addr_t) uap->start_address,
2079 					    sizeof(start_address));
2080 					if (error != 0) {
2081 						SHARED_REGION_TRACE_ERROR(
2082 							("shared_region: %p [%d(%s)] "
2083 							"check_np(0x%llx) "
2084 							"copyout(0x%llx) error %d\n",
2085 							(void *)VM_KERNEL_ADDRPERM(current_thread()),
2086 							proc_getpid(p), p->p_comm,
2087 							(uint64_t)uap->start_address, (uint64_t)start_address,
2088 							error));
2089 					}
2090 				}
2091 			}
2092 		}
2093 		vm_shared_region_deallocate(shared_region);
2094 	} else {
2095 		/* no shared region ! */
2096 		error = EINVAL;
2097 	}
2098 
2099 	SHARED_REGION_TRACE_DEBUG(
2100 		("shared_region: %p [%d(%s)] check_np(0x%llx) <- 0x%llx %d\n",
2101 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
2102 		proc_getpid(p), p->p_comm,
2103 		(uint64_t)uap->start_address, (uint64_t)start_address, error));
2104 
2105 	return error;
2106 }
2107 
2108 
2109 static int
shared_region_copyin(struct proc * p,user_addr_t user_addr,unsigned int count,unsigned int element_size,void * kernel_data)2110 shared_region_copyin(
2111 	struct proc  *p,
2112 	user_addr_t  user_addr,
2113 	unsigned int count,
2114 	unsigned int element_size,
2115 	void         *kernel_data)
2116 {
2117 	int             error = 0;
2118 	vm_size_t       size = count * element_size;
2119 
2120 	error = copyin(user_addr, kernel_data, size);
2121 	if (error) {
2122 		SHARED_REGION_TRACE_ERROR(
2123 			("shared_region: %p [%d(%s)] map(): "
2124 			"copyin(0x%llx, %ld) failed (error=%d)\n",
2125 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2126 			proc_getpid(p), p->p_comm,
2127 			(uint64_t)user_addr, (long)size, error));
2128 	}
2129 	return error;
2130 }
2131 
2132 /*
2133  * A reasonable upper limit to prevent overflow of allocation/copyin.
2134  */
2135 #define _SR_FILE_MAPPINGS_MAX_FILES 256
2136 
2137 /* forward declaration */
2138 __attribute__((noinline))
2139 static void shared_region_map_and_slide_cleanup(
2140 	struct proc              *p,
2141 	uint32_t                 files_count,
2142 	struct _sr_file_mappings *sr_file_mappings,
2143 	struct vm_shared_region  *shared_region,
2144 	struct vnode             *scdir_vp);
2145 
2146 /*
2147  * Setup part of _shared_region_map_and_slide().
2148  * It had to be broken out of _shared_region_map_and_slide() to
2149  * prevent compiler inlining from blowing out the stack.
2150  */
2151 __attribute__((noinline))
2152 static int
shared_region_map_and_slide_setup(struct proc * p,uint32_t files_count,struct shared_file_np * files,uint32_t mappings_count,struct shared_file_mapping_slide_np * mappings,struct _sr_file_mappings ** sr_file_mappings,struct vm_shared_region ** shared_region_ptr,struct vnode ** scdir_vp,struct vnode * rdir_vp)2153 shared_region_map_and_slide_setup(
2154 	struct proc                         *p,
2155 	uint32_t                            files_count,
2156 	struct shared_file_np               *files,
2157 	uint32_t                            mappings_count,
2158 	struct shared_file_mapping_slide_np *mappings,
2159 	struct _sr_file_mappings            **sr_file_mappings,
2160 	struct vm_shared_region             **shared_region_ptr,
2161 	struct vnode                        **scdir_vp,
2162 	struct vnode                        *rdir_vp)
2163 {
2164 	int                             error = 0;
2165 	struct _sr_file_mappings        *srfmp;
2166 	uint32_t                        mappings_next;
2167 	struct vnode_attr               va;
2168 	off_t                           fs;
2169 #if CONFIG_MACF
2170 	vm_prot_t                       maxprot = VM_PROT_ALL;
2171 #endif
2172 	uint32_t                        i;
2173 	struct vm_shared_region         *shared_region = NULL;
2174 	boolean_t                       is_driverkit = task_is_driver(current_task());
2175 	const char                      *expected_scdir_path = is_driverkit ? driverkit_scdir_path : scdir_path;
2176 
2177 	SHARED_REGION_TRACE_DEBUG(
2178 		("shared_region: %p [%d(%s)] -> map\n",
2179 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
2180 		proc_getpid(p), p->p_comm));
2181 
2182 	if (files_count > _SR_FILE_MAPPINGS_MAX_FILES) {
2183 		error = E2BIG;
2184 		goto done;
2185 	}
2186 	if (files_count == 0) {
2187 		error = EINVAL;
2188 		goto done;
2189 	}
2190 	*sr_file_mappings = kalloc_type(struct _sr_file_mappings, files_count,
2191 	    Z_WAITOK | Z_ZERO);
2192 	if (*sr_file_mappings == NULL) {
2193 		error = ENOMEM;
2194 		goto done;
2195 	}
2196 	mappings_next = 0;
2197 	for (i = 0; i < files_count; i++) {
2198 		srfmp = &(*sr_file_mappings)[i];
2199 		srfmp->fd = files[i].sf_fd;
2200 		srfmp->mappings_count = files[i].sf_mappings_count;
2201 		srfmp->mappings = &mappings[mappings_next];
2202 		mappings_next += srfmp->mappings_count;
2203 		if (mappings_next > mappings_count) {
2204 			error = EINVAL;
2205 			goto done;
2206 		}
2207 		srfmp->slide = files[i].sf_slide;
2208 	}
2209 
2210 	if (scdir_enforce) {
2211 		/* get vnode for expected_scdir_path */
2212 		error = vnode_lookup(expected_scdir_path, 0, scdir_vp, vfs_context_current());
2213 		if (error) {
2214 			SHARED_REGION_TRACE_ERROR(
2215 				("shared_region: %p [%d(%s)]: "
2216 				"vnode_lookup(%s) failed (error=%d)\n",
2217 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2218 				proc_getpid(p), p->p_comm,
2219 				expected_scdir_path, error));
2220 			goto done;
2221 		}
2222 	}
2223 
2224 	/* get the process's shared region (setup in vm_map_exec()) */
2225 	shared_region = vm_shared_region_trim_and_get(current_task());
2226 	*shared_region_ptr = shared_region;
2227 	if (shared_region == NULL) {
2228 		SHARED_REGION_TRACE_ERROR(
2229 			("shared_region: %p [%d(%s)] map(): "
2230 			"no shared region\n",
2231 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2232 			proc_getpid(p), p->p_comm));
2233 		error = EINVAL;
2234 		goto done;
2235 	}
2236 
2237 	/*
2238 	 * Check the shared region matches the current root
2239 	 * directory of this process.  Deny the mapping to
2240 	 * avoid tainting the shared region with something that
2241 	 * doesn't quite belong into it.
2242 	 */
2243 	struct vnode *sr_vnode = vm_shared_region_root_dir(shared_region);
2244 	if (sr_vnode != NULL ?  rdir_vp != sr_vnode : rdir_vp != rootvnode) {
2245 		SHARED_REGION_TRACE_ERROR(
2246 			("shared_region: map(%p) root_dir mismatch\n",
2247 			(void *)VM_KERNEL_ADDRPERM(current_thread())));
2248 		error = EPERM;
2249 		goto done;
2250 	}
2251 
2252 
2253 	for (srfmp = &(*sr_file_mappings)[0];
2254 	    srfmp < &(*sr_file_mappings)[files_count];
2255 	    srfmp++) {
2256 		if (srfmp->mappings_count == 0) {
2257 			/* no mappings here... */
2258 			continue;
2259 		}
2260 
2261 		/* get file structure from file descriptor */
2262 		error = fp_get_ftype(p, srfmp->fd, DTYPE_VNODE, EINVAL, &srfmp->fp);
2263 		if (error) {
2264 			SHARED_REGION_TRACE_ERROR(
2265 				("shared_region: %p [%d(%s)] map: "
2266 				"fd=%d lookup failed (error=%d)\n",
2267 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2268 				proc_getpid(p), p->p_comm, srfmp->fd, error));
2269 			goto done;
2270 		}
2271 
2272 		/* we need at least read permission on the file */
2273 		if (!(srfmp->fp->fp_glob->fg_flag & FREAD)) {
2274 			SHARED_REGION_TRACE_ERROR(
2275 				("shared_region: %p [%d(%s)] map: "
2276 				"fd=%d not readable\n",
2277 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2278 				proc_getpid(p), p->p_comm, srfmp->fd));
2279 			error = EPERM;
2280 			goto done;
2281 		}
2282 
2283 		/* get vnode from file structure */
2284 		error = vnode_getwithref((vnode_t)fp_get_data(srfmp->fp));
2285 		if (error) {
2286 			SHARED_REGION_TRACE_ERROR(
2287 				("shared_region: %p [%d(%s)] map: "
2288 				"fd=%d getwithref failed (error=%d)\n",
2289 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2290 				proc_getpid(p), p->p_comm, srfmp->fd, error));
2291 			goto done;
2292 		}
2293 		srfmp->vp = (struct vnode *)fp_get_data(srfmp->fp);
2294 
2295 		/* make sure the vnode is a regular file */
2296 		if (srfmp->vp->v_type != VREG) {
2297 			SHARED_REGION_TRACE_ERROR(
2298 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2299 				"not a file (type=%d)\n",
2300 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2301 				proc_getpid(p), p->p_comm,
2302 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2303 				srfmp->vp->v_name, srfmp->vp->v_type));
2304 			error = EINVAL;
2305 			goto done;
2306 		}
2307 
2308 #if CONFIG_MACF
2309 		/* pass in 0 for the offset argument because AMFI does not need the offset
2310 		 *       of the shared cache */
2311 		error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()),
2312 		    srfmp->fp->fp_glob, VM_PROT_ALL, MAP_FILE, 0, &maxprot);
2313 		if (error) {
2314 			goto done;
2315 		}
2316 #endif /* MAC */
2317 
2318 #if XNU_TARGET_OS_OSX && defined(__arm64__)
2319 		/*
2320 		 * Check if the shared cache is in the trust cache;
2321 		 * if so, we can skip the root ownership check.
2322 		 */
2323 #if DEVELOPMENT || DEBUG
2324 		/*
2325 		 * Skip both root ownership and trust cache check if
2326 		 * enforcement is disabled.
2327 		 */
2328 		if (!cs_system_enforcement()) {
2329 			goto after_root_check;
2330 		}
2331 #endif /* DEVELOPMENT || DEBUG */
2332 		struct cs_blob *blob = csvnode_get_blob(srfmp->vp, 0);
2333 		if (blob == NULL) {
2334 			SHARED_REGION_TRACE_ERROR(
2335 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2336 				"missing CS blob\n",
2337 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2338 				proc_getpid(p), p->p_comm,
2339 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2340 				srfmp->vp->v_name));
2341 			goto root_check;
2342 		}
2343 		const uint8_t *cdhash = csblob_get_cdhash(blob);
2344 		if (cdhash == NULL) {
2345 			SHARED_REGION_TRACE_ERROR(
2346 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2347 				"missing cdhash\n",
2348 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2349 				proc_getpid(p), p->p_comm,
2350 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2351 				srfmp->vp->v_name));
2352 			goto root_check;
2353 		}
2354 		uint32_t result = pmap_lookup_in_static_trust_cache(cdhash);
2355 		boolean_t in_trust_cache = result & (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT);
2356 		if (!in_trust_cache) {
2357 			SHARED_REGION_TRACE_ERROR(
2358 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2359 				"not in trust cache\n",
2360 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2361 				proc_getpid(p), p->p_comm,
2362 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2363 				srfmp->vp->v_name));
2364 			goto root_check;
2365 		}
2366 		goto after_root_check;
2367 root_check:
2368 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
2369 
2370 		/* The shared cache file must be owned by root */
2371 		VATTR_INIT(&va);
2372 		VATTR_WANTED(&va, va_uid);
2373 		error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
2374 		if (error) {
2375 			SHARED_REGION_TRACE_ERROR(
2376 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2377 				"vnode_getattr(%p) failed (error=%d)\n",
2378 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2379 				proc_getpid(p), p->p_comm,
2380 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2381 				srfmp->vp->v_name,
2382 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2383 				error));
2384 			goto done;
2385 		}
2386 		if (va.va_uid != 0) {
2387 			SHARED_REGION_TRACE_ERROR(
2388 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2389 				"owned by uid=%d instead of 0\n",
2390 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2391 				proc_getpid(p), p->p_comm,
2392 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2393 				srfmp->vp->v_name, va.va_uid));
2394 			error = EPERM;
2395 			goto done;
2396 		}
2397 
2398 #if XNU_TARGET_OS_OSX && defined(__arm64__)
2399 after_root_check:
2400 #endif /* XNU_TARGET_OS_OSX && defined(__arm64__) */
2401 
2402 #if CONFIG_CSR
2403 		if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
2404 			VATTR_INIT(&va);
2405 			VATTR_WANTED(&va, va_flags);
2406 			error = vnode_getattr(srfmp->vp, &va, vfs_context_current());
2407 			if (error) {
2408 				SHARED_REGION_TRACE_ERROR(
2409 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
2410 					"vnode_getattr(%p) failed (error=%d)\n",
2411 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
2412 					proc_getpid(p), p->p_comm,
2413 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2414 					srfmp->vp->v_name,
2415 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2416 					error));
2417 				goto done;
2418 			}
2419 
2420 			if (!(va.va_flags & SF_RESTRICTED)) {
2421 				/*
2422 				 * CSR is not configured in CSR_ALLOW_UNRESTRICTED_FS mode, and
2423 				 * the shared cache file is NOT SIP-protected, so reject the
2424 				 * mapping request
2425 				 */
2426 				SHARED_REGION_TRACE_ERROR(
2427 					("shared_region: %p [%d(%s)] map(%p:'%s'), "
2428 					"vnode is not SIP-protected. \n",
2429 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
2430 					proc_getpid(p), p->p_comm,
2431 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2432 					srfmp->vp->v_name));
2433 				error = EPERM;
2434 				goto done;
2435 			}
2436 		}
2437 #else /* CONFIG_CSR */
2438 		/* Devices without SIP/ROSP need to make sure that the shared cache is on the root volume. */
2439 
2440 		assert(rdir_vp != NULL);
2441 		if (srfmp->vp->v_mount != rdir_vp->v_mount) {
2442 			SHARED_REGION_TRACE_ERROR(
2443 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2444 				"not on process's root volume\n",
2445 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2446 				proc_getpid(p), p->p_comm,
2447 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2448 				srfmp->vp->v_name));
2449 			error = EPERM;
2450 			goto done;
2451 		}
2452 #endif /* CONFIG_CSR */
2453 
2454 		if (scdir_enforce) {
2455 			/* ensure parent is scdir_vp */
2456 			assert(*scdir_vp != NULL);
2457 			if (vnode_parent(srfmp->vp) != *scdir_vp) {
2458 				SHARED_REGION_TRACE_ERROR(
2459 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
2460 					"shared cache file not in %s\n",
2461 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
2462 					proc_getpid(p), p->p_comm,
2463 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2464 					srfmp->vp->v_name, expected_scdir_path));
2465 				error = EPERM;
2466 				goto done;
2467 			}
2468 		}
2469 
2470 		/* get vnode size */
2471 		error = vnode_size(srfmp->vp, &fs, vfs_context_current());
2472 		if (error) {
2473 			SHARED_REGION_TRACE_ERROR(
2474 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2475 				"vnode_size(%p) failed (error=%d)\n",
2476 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2477 				proc_getpid(p), p->p_comm,
2478 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2479 				srfmp->vp->v_name,
2480 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp), error));
2481 			goto done;
2482 		}
2483 		srfmp->file_size = fs;
2484 
2485 		/* get the file's memory object handle */
2486 		srfmp->file_control = ubc_getobject(srfmp->vp, UBC_HOLDOBJECT);
2487 		if (srfmp->file_control == MEMORY_OBJECT_CONTROL_NULL) {
2488 			SHARED_REGION_TRACE_ERROR(
2489 				("shared_region: %p [%d(%s)] map(%p:'%s'): "
2490 				"no memory object\n",
2491 				(void *)VM_KERNEL_ADDRPERM(current_thread()),
2492 				proc_getpid(p), p->p_comm,
2493 				(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2494 				srfmp->vp->v_name));
2495 			error = EINVAL;
2496 			goto done;
2497 		}
2498 
2499 		/* check that the mappings are properly covered by code signatures */
2500 		if (!cs_system_enforcement()) {
2501 			/* code signing is not enforced: no need to check */
2502 		} else {
2503 			for (i = 0; i < srfmp->mappings_count; i++) {
2504 				if (srfmp->mappings[i].sms_init_prot & VM_PROT_ZF) {
2505 					/* zero-filled mapping: not backed by the file */
2506 					continue;
2507 				}
2508 				if (ubc_cs_is_range_codesigned(srfmp->vp,
2509 				    srfmp->mappings[i].sms_file_offset,
2510 				    srfmp->mappings[i].sms_size)) {
2511 					/* this mapping is fully covered by code signatures */
2512 					continue;
2513 				}
2514 				SHARED_REGION_TRACE_ERROR(
2515 					("shared_region: %p [%d(%s)] map(%p:'%s'): "
2516 					"mapping #%d/%d [0x%llx:0x%llx:0x%llx:0x%x:0x%x] "
2517 					"is not code-signed\n",
2518 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
2519 					proc_getpid(p), p->p_comm,
2520 					(void *)VM_KERNEL_ADDRPERM(srfmp->vp),
2521 					srfmp->vp->v_name,
2522 					i, srfmp->mappings_count,
2523 					srfmp->mappings[i].sms_address,
2524 					srfmp->mappings[i].sms_size,
2525 					srfmp->mappings[i].sms_file_offset,
2526 					srfmp->mappings[i].sms_max_prot,
2527 					srfmp->mappings[i].sms_init_prot));
2528 				error = EINVAL;
2529 				goto done;
2530 			}
2531 		}
2532 	}
2533 done:
2534 	if (error != 0) {
2535 		shared_region_map_and_slide_cleanup(p, files_count, *sr_file_mappings, shared_region, *scdir_vp);
2536 		*sr_file_mappings = NULL;
2537 		*shared_region_ptr = NULL;
2538 		*scdir_vp = NULL;
2539 	}
2540 	return error;
2541 }
2542 
2543 /*
2544  * shared_region_map_np()
2545  *
2546  * This system call is intended for dyld.
2547  *
2548  * dyld uses this to map a shared cache file into a shared region.
2549  * This is usually done only the first time a shared cache is needed.
2550  * Subsequent processes will just use the populated shared region without
2551  * requiring any further setup.
2552  */
2553 static int
_shared_region_map_and_slide(struct proc * p,uint32_t files_count,struct shared_file_np * files,uint32_t mappings_count,struct shared_file_mapping_slide_np * mappings)2554 _shared_region_map_and_slide(
2555 	struct proc                         *p,
2556 	uint32_t                            files_count,
2557 	struct shared_file_np               *files,
2558 	uint32_t                            mappings_count,
2559 	struct shared_file_mapping_slide_np *mappings)
2560 {
2561 	int                             error = 0;
2562 	kern_return_t                   kr = KERN_SUCCESS;
2563 	struct _sr_file_mappings        *sr_file_mappings = NULL;
2564 	struct vnode                    *scdir_vp = NULL;
2565 	struct vnode                    *rdir_vp = NULL;
2566 	struct vm_shared_region         *shared_region = NULL;
2567 
2568 	/*
2569 	 * Get a reference to the current proc's root dir.
2570 	 * Need this to prevent racing with chroot.
2571 	 */
2572 	proc_fdlock(p);
2573 	rdir_vp = p->p_fd.fd_rdir;
2574 	if (rdir_vp == NULL) {
2575 		rdir_vp = rootvnode;
2576 	}
2577 	assert(rdir_vp != NULL);
2578 	vnode_get(rdir_vp);
2579 	proc_fdunlock(p);
2580 
2581 	/*
2582 	 * Turn files, mappings into sr_file_mappings and other setup.
2583 	 */
2584 	error = shared_region_map_and_slide_setup(p, files_count,
2585 	    files, mappings_count, mappings,
2586 	    &sr_file_mappings, &shared_region, &scdir_vp, rdir_vp);
2587 	if (error != 0) {
2588 		vnode_put(rdir_vp);
2589 		return error;
2590 	}
2591 
2592 	/* map the file(s) into that shared region's submap */
2593 	kr = vm_shared_region_map_file(shared_region, files_count, sr_file_mappings);
2594 	if (kr != KERN_SUCCESS) {
2595 		SHARED_REGION_TRACE_ERROR(("shared_region: %p [%d(%s)] map(): "
2596 		    "vm_shared_region_map_file() failed kr=0x%x\n",
2597 		    (void *)VM_KERNEL_ADDRPERM(current_thread()),
2598 		    proc_getpid(p), p->p_comm, kr));
2599 	}
2600 
2601 	/* convert kern_return_t to errno */
2602 	switch (kr) {
2603 	case KERN_SUCCESS:
2604 		error = 0;
2605 		break;
2606 	case KERN_INVALID_ADDRESS:
2607 		error = EFAULT;
2608 		break;
2609 	case KERN_PROTECTION_FAILURE:
2610 		error = EPERM;
2611 		break;
2612 	case KERN_NO_SPACE:
2613 		error = ENOMEM;
2614 		break;
2615 	case KERN_FAILURE:
2616 	case KERN_INVALID_ARGUMENT:
2617 	default:
2618 		error = EINVAL;
2619 		break;
2620 	}
2621 
2622 	/*
2623 	 * Mark that this process is now using split libraries.
2624 	 */
2625 	if (error == 0 && (p->p_flag & P_NOSHLIB)) {
2626 		OSBitAndAtomic(~((uint32_t)P_NOSHLIB), &p->p_flag);
2627 	}
2628 
2629 	vnode_put(rdir_vp);
2630 	shared_region_map_and_slide_cleanup(p, files_count, sr_file_mappings, shared_region, scdir_vp);
2631 
2632 	SHARED_REGION_TRACE_DEBUG(
2633 		("shared_region: %p [%d(%s)] <- map\n",
2634 		(void *)VM_KERNEL_ADDRPERM(current_thread()),
2635 		proc_getpid(p), p->p_comm));
2636 
2637 	return error;
2638 }
2639 
2640 /*
2641  * Clean up part of _shared_region_map_and_slide()
2642  * It had to be broken out of _shared_region_map_and_slide() to
2643  * prevent compiler inlining from blowing out the stack.
2644  */
2645 __attribute__((noinline))
2646 static void
shared_region_map_and_slide_cleanup(struct proc * p,uint32_t files_count,struct _sr_file_mappings * sr_file_mappings,struct vm_shared_region * shared_region,struct vnode * scdir_vp)2647 shared_region_map_and_slide_cleanup(
2648 	struct proc              *p,
2649 	uint32_t                 files_count,
2650 	struct _sr_file_mappings *sr_file_mappings,
2651 	struct vm_shared_region  *shared_region,
2652 	struct vnode             *scdir_vp)
2653 {
2654 	struct _sr_file_mappings *srfmp;
2655 	struct vnode_attr        va;
2656 
2657 	if (sr_file_mappings != NULL) {
2658 		for (srfmp = &sr_file_mappings[0]; srfmp < &sr_file_mappings[files_count]; srfmp++) {
2659 			if (srfmp->vp != NULL) {
2660 				vnode_lock_spin(srfmp->vp);
2661 				srfmp->vp->v_flag |= VSHARED_DYLD;
2662 				vnode_unlock(srfmp->vp);
2663 
2664 				/* update the vnode's access time */
2665 				if (!(vnode_vfsvisflags(srfmp->vp) & MNT_NOATIME)) {
2666 					VATTR_INIT(&va);
2667 					nanotime(&va.va_access_time);
2668 					VATTR_SET_ACTIVE(&va, va_access_time);
2669 					vnode_setattr(srfmp->vp, &va, vfs_context_current());
2670 				}
2671 
2672 #if NAMEDSTREAMS
2673 				/*
2674 				 * If the shared cache is compressed, it may
2675 				 * have a namedstream vnode instantiated for
2676 				 * for it. That namedstream vnode will also
2677 				 * have to be marked with VSHARED_DYLD.
2678 				 */
2679 				if (vnode_hasnamedstreams(srfmp->vp)) {
2680 					vnode_t svp;
2681 					if (vnode_getnamedstream(srfmp->vp, &svp, XATTR_RESOURCEFORK_NAME,
2682 					    NS_OPEN, 0, vfs_context_kernel()) == 0) {
2683 						vnode_lock_spin(svp);
2684 						svp->v_flag |= VSHARED_DYLD;
2685 						vnode_unlock(svp);
2686 						vnode_put(svp);
2687 					}
2688 				}
2689 #endif /* NAMEDSTREAMS */
2690 				/*
2691 				 * release the vnode...
2692 				 * ubc_map() still holds it for us in the non-error case
2693 				 */
2694 				(void) vnode_put(srfmp->vp);
2695 				srfmp->vp = NULL;
2696 			}
2697 			if (srfmp->fp != NULL) {
2698 				/* release the file descriptor */
2699 				fp_drop(p, srfmp->fd, srfmp->fp, 0);
2700 				srfmp->fp = NULL;
2701 			}
2702 		}
2703 		kfree_type(struct _sr_file_mappings, files_count, sr_file_mappings);
2704 	}
2705 
2706 	if (scdir_vp != NULL) {
2707 		(void)vnode_put(scdir_vp);
2708 		scdir_vp = NULL;
2709 	}
2710 
2711 	if (shared_region != NULL) {
2712 		vm_shared_region_deallocate(shared_region);
2713 	}
2714 }
2715 
2716 
2717 /*
2718  * For each file mapped, we may have mappings for:
2719  *    TEXT, EXECUTE, LINKEDIT, DATA_CONST, __AUTH, DATA
2720  * so let's round up to 8 mappings per file.
2721  */
2722 #define SFM_MAX       (_SR_FILE_MAPPINGS_MAX_FILES * 8)     /* max mapping structs allowed to pass in */
2723 
2724 /*
2725  * This is the older interface that dyld uses to map in the shared
2726  * library. dyld is slowly moving to the new shared_region_map_and_slide_2_np()
2727  * call as needed.
2728  */
2729 int
shared_region_map_and_slide_np(struct proc * p,struct shared_region_map_and_slide_np_args * uap,__unused int * retvalp)2730 shared_region_map_and_slide_np(
2731 	struct proc                                *p,
2732 	struct shared_region_map_and_slide_np_args *uap,
2733 	__unused int                               *retvalp)
2734 {
2735 	unsigned int                        mappings_count = uap->count;
2736 	unsigned int                        m;
2737 	uint32_t                            slide = uap->slide;
2738 	struct shared_file_np               shared_files[1];
2739 	struct shared_file_mapping_np       legacy_mapping;
2740 	struct shared_file_mapping_slide_np *mappings = NULL;
2741 	kern_return_t                       kr = KERN_SUCCESS;
2742 
2743 	if ((kr = vm_shared_region_sliding_valid(slide)) != KERN_SUCCESS) {
2744 		if (kr == KERN_INVALID_ARGUMENT) {
2745 			/*
2746 			 * This will happen if we request sliding again
2747 			 * with the same slide value that was used earlier
2748 			 * for the very first sliding.
2749 			 */
2750 			kr = KERN_SUCCESS;
2751 		}
2752 		goto done;
2753 	}
2754 
2755 	if (mappings_count == 0) {
2756 		SHARED_REGION_TRACE_INFO(
2757 			("shared_region: %p [%d(%s)] map(): "
2758 			"no mappings\n",
2759 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2760 			proc_getpid(p), p->p_comm));
2761 		kr = 0; /* no mappings: we're done ! */
2762 		goto done;
2763 	} else if (mappings_count <= SFM_MAX) {
2764 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
2765 		if (mappings == NULL) {
2766 			kr = KERN_RESOURCE_SHORTAGE;
2767 			goto done;
2768 		}
2769 	} else {
2770 		SHARED_REGION_TRACE_ERROR(
2771 			("shared_region: %p [%d(%s)] map(): "
2772 			"too many mappings (%d) max %d\n",
2773 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2774 			proc_getpid(p), p->p_comm,
2775 			mappings_count, SFM_MAX));
2776 		kr = KERN_FAILURE;
2777 		goto done;
2778 	}
2779 
2780 	/*
2781 	 * Read in the mappings and translate to new format.
2782 	 */
2783 	for (m = 0; m < mappings_count; ++m) {
2784 		user_addr_t from_uaddr = uap->mappings + (m * sizeof(struct shared_file_mapping_np));
2785 		kr = shared_region_copyin(p, from_uaddr, 1, sizeof(legacy_mapping), &legacy_mapping);
2786 		if (kr != 0) {
2787 			goto done;
2788 		}
2789 		mappings[m].sms_address = legacy_mapping.sfm_address;
2790 		mappings[m].sms_size = legacy_mapping.sfm_size;
2791 		mappings[m].sms_file_offset = legacy_mapping.sfm_file_offset;
2792 		mappings[m].sms_max_prot = legacy_mapping.sfm_max_prot;
2793 		mappings[m].sms_init_prot = legacy_mapping.sfm_init_prot;
2794 		mappings[m].sms_slide_size = uap->slide_size;
2795 		mappings[m].sms_slide_start = uap->slide_start;
2796 	}
2797 
2798 	bzero(shared_files, sizeof(shared_files));
2799 	shared_files[0].sf_fd = uap->fd;
2800 	shared_files[0].sf_mappings_count = mappings_count;
2801 	shared_files[0].sf_slide = slide;
2802 
2803 	kr = _shared_region_map_and_slide(p,
2804 	    1,                 /* # of files to map */
2805 	    &shared_files[0],  /* files to map */
2806 	    mappings_count,
2807 	    mappings);
2808 
2809 done:
2810 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
2811 	return kr;
2812 }
2813 
2814 /*
2815  * This is the new interface for setting up shared region mappings.
2816  *
2817  * The slide used for shared regions setup using this interface is done differently
2818  * from the old interface. The slide value passed in the shared_files_np represents
2819  * a max value. The kernel will choose a random value based on that, then use it
2820  * for all shared regions.
2821  */
2822 #if defined (__x86_64__)
2823 #define SLIDE_AMOUNT_MASK ~FOURK_PAGE_MASK
2824 #else
2825 #define SLIDE_AMOUNT_MASK ~SIXTEENK_PAGE_MASK
2826 #endif
2827 
2828 int
shared_region_map_and_slide_2_np(struct proc * p,struct shared_region_map_and_slide_2_np_args * uap,__unused int * retvalp)2829 shared_region_map_and_slide_2_np(
2830 	struct proc                                  *p,
2831 	struct shared_region_map_and_slide_2_np_args *uap,
2832 	__unused int                                 *retvalp)
2833 {
2834 	unsigned int                  files_count;
2835 	struct shared_file_np         *shared_files = NULL;
2836 	unsigned int                  mappings_count;
2837 	struct shared_file_mapping_slide_np *mappings = NULL;
2838 	kern_return_t                 kr = KERN_SUCCESS;
2839 
2840 	files_count = uap->files_count;
2841 	mappings_count = uap->mappings_count;
2842 
2843 	if (files_count == 0) {
2844 		SHARED_REGION_TRACE_INFO(
2845 			("shared_region: %p [%d(%s)] map(): "
2846 			"no files\n",
2847 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2848 			proc_getpid(p), p->p_comm));
2849 		kr = 0; /* no files to map: we're done ! */
2850 		goto done;
2851 	} else if (files_count <= _SR_FILE_MAPPINGS_MAX_FILES) {
2852 		shared_files = kalloc_data(files_count * sizeof(shared_files[0]), Z_WAITOK);
2853 		if (shared_files == NULL) {
2854 			kr = KERN_RESOURCE_SHORTAGE;
2855 			goto done;
2856 		}
2857 	} else {
2858 		SHARED_REGION_TRACE_ERROR(
2859 			("shared_region: %p [%d(%s)] map(): "
2860 			"too many files (%d) max %d\n",
2861 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2862 			proc_getpid(p), p->p_comm,
2863 			files_count, _SR_FILE_MAPPINGS_MAX_FILES));
2864 		kr = KERN_FAILURE;
2865 		goto done;
2866 	}
2867 
2868 	if (mappings_count == 0) {
2869 		SHARED_REGION_TRACE_INFO(
2870 			("shared_region: %p [%d(%s)] map(): "
2871 			"no mappings\n",
2872 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2873 			proc_getpid(p), p->p_comm));
2874 		kr = 0; /* no mappings: we're done ! */
2875 		goto done;
2876 	} else if (mappings_count <= SFM_MAX) {
2877 		mappings = kalloc_data(mappings_count * sizeof(mappings[0]), Z_WAITOK);
2878 		if (mappings == NULL) {
2879 			kr = KERN_RESOURCE_SHORTAGE;
2880 			goto done;
2881 		}
2882 	} else {
2883 		SHARED_REGION_TRACE_ERROR(
2884 			("shared_region: %p [%d(%s)] map(): "
2885 			"too many mappings (%d) max %d\n",
2886 			(void *)VM_KERNEL_ADDRPERM(current_thread()),
2887 			proc_getpid(p), p->p_comm,
2888 			mappings_count, SFM_MAX));
2889 		kr = KERN_FAILURE;
2890 		goto done;
2891 	}
2892 
2893 	kr = shared_region_copyin(p, uap->files, files_count, sizeof(shared_files[0]), shared_files);
2894 	if (kr != KERN_SUCCESS) {
2895 		goto done;
2896 	}
2897 
2898 	kr = shared_region_copyin(p, uap->mappings, mappings_count, sizeof(mappings[0]), mappings);
2899 	if (kr != KERN_SUCCESS) {
2900 		goto done;
2901 	}
2902 
2903 	uint32_t max_slide = shared_files[0].sf_slide;
2904 	uint32_t random_val;
2905 	uint32_t slide_amount;
2906 
2907 	if (max_slide != 0) {
2908 		read_random(&random_val, sizeof random_val);
2909 		slide_amount = ((random_val % max_slide) & SLIDE_AMOUNT_MASK);
2910 	} else {
2911 		slide_amount = 0;
2912 	}
2913 #if DEVELOPMENT || DEBUG
2914 	extern bool bootarg_disable_aslr;
2915 	if (bootarg_disable_aslr) {
2916 		slide_amount = 0;
2917 	}
2918 #endif /* DEVELOPMENT || DEBUG */
2919 
2920 	/*
2921 	 * Fix up the mappings to reflect the desired slide.
2922 	 */
2923 	unsigned int f;
2924 	unsigned int m = 0;
2925 	unsigned int i;
2926 	for (f = 0; f < files_count; ++f) {
2927 		shared_files[f].sf_slide = slide_amount;
2928 		for (i = 0; i < shared_files[f].sf_mappings_count; ++i, ++m) {
2929 			if (m >= mappings_count) {
2930 				SHARED_REGION_TRACE_ERROR(
2931 					("shared_region: %p [%d(%s)] map(): "
2932 					"mapping count argument was too small\n",
2933 					(void *)VM_KERNEL_ADDRPERM(current_thread()),
2934 					proc_getpid(p), p->p_comm));
2935 				kr = KERN_FAILURE;
2936 				goto done;
2937 			}
2938 			mappings[m].sms_address += slide_amount;
2939 			if (mappings[m].sms_slide_size != 0) {
2940 				mappings[m].sms_slide_start += slide_amount;
2941 			}
2942 		}
2943 	}
2944 
2945 	kr = _shared_region_map_and_slide(p, files_count, shared_files, mappings_count, mappings);
2946 done:
2947 	kfree_data(shared_files, files_count * sizeof(shared_files[0]));
2948 	kfree_data(mappings, mappings_count * sizeof(mappings[0]));
2949 	return kr;
2950 }
2951 
/* sysctl overflow room */

/* Read-only: the VM page size, exposed as vm.pagesize. */
SYSCTL_INT(_vm, OID_AUTO, pagesize, CTLFLAG_RD | CTLFLAG_LOCKED,
    (int *) &page_size, 0, "vm page size");

/* vm_page_free_target is provided as a makeshift solution for applications that want to
 *       allocate buffer space, possibly purgeable memory, but not cause inactive pages to be
 *       reclaimed. It allows the app to calculate how much memory is free outside the free target. */
extern unsigned int     vm_page_free_target;
SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_free_target, 0, "Pageout daemon free target");

/* Read-only view of the pageout state's memory-pressure indicator. */
SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
2966 
2967 static int
2968 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
2969 {
2970 #pragma unused(oidp, arg1, arg2)
2971 	unsigned int page_free_wanted;
2972 
2973 	page_free_wanted = mach_vm_ctl_page_free_wanted();
2974 	return SYSCTL_OUT(req, &page_free_wanted, sizeof(page_free_wanted));
2975 }
/* vm.page_free_wanted: computed on demand by the handler above. */
SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, vm_ctl_page_free_wanted, "I", "");

extern unsigned int     vm_page_purgeable_count;
SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_purgeable_count, 0, "Purgeable page count");

extern unsigned int     vm_page_purgeable_wired_count;
SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");

extern unsigned int vm_page_kern_lpage_count;
SYSCTL_INT(_vm, OID_AUTO, kern_lpage_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_kern_lpage_count, 0, "kernel used large pages");

#if DEVELOPMENT || DEBUG
#if __ARM_MIXED_PAGE_SIZE__
static int vm_mixed_pagesize_supported = 1;
#else
static int vm_mixed_pagesize_supported = 0;
#endif /*__ARM_MIXED_PAGE_SIZE__ */
SYSCTL_INT(_debug, OID_AUTO, vm_mixed_pagesize_supported, CTLFLAG_ANYBODY | CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_mixed_pagesize_supported, 0, "kernel support for mixed pagesize");

SCALABLE_COUNTER_DECLARE(vm_page_grab_count);
SYSCTL_SCALABLE_COUNTER(_vm, pages_grabbed, vm_page_grab_count, "Total pages grabbed");
SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");

/* Pageout-daemon debug counters (DEVELOPMENT/DEBUG kernels only). */
SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");

SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated");         /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated */
SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, "");         /* sum of next two */
#endif /* DEVELOPMENT || DEBUG */
3026 
/* When set, madvise(MADV_FREE*) zero-fills instead of just marking pages. */
extern int madvise_free_debug;
SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &madvise_free_debug, 0, "zero-fill on madvise(MADV_FREE*)");

/* Read-only counters from vm_page_stats_reusable (reusable/reuse bookkeeping). */
SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_count, 0, "Reusable page count");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_pages_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_pages_shared, "");
SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.all_reusable_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.partial_reusable_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reuse_pages_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reuse_pages_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.all_reuse_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.partial_reuse_calls, "");
SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.can_reuse_success, "");
SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.can_reuse_failure, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_reclaimed, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_reclaimed, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_nonwritable, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_nonwritable, "");
SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.reusable_shared, "");
SYSCTL_QUAD(_vm, OID_AUTO, free_shared, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vm_page_stats_reusable.free_shared, "");
3063 
3064 
/* Read-only page-queue counters. */
extern unsigned int vm_page_free_count, vm_page_speculative_count;
SYSCTL_UINT(_vm, OID_AUTO, page_free_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_speculative_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_speculative_count, 0, "");

extern unsigned int vm_page_cleaned_count;
SYSCTL_UINT(_vm, OID_AUTO, page_cleaned_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_cleaned_count, 0, "Cleaned queue size");

extern unsigned int vm_page_pageable_internal_count, vm_page_pageable_external_count;
SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_internal_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");

/* pageout counts */
SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");

SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");


/* counts of pages prefaulted when entering a memory object */
extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_pages, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_pages, "");
SYSCTL_QUAD(_vm, OID_AUTO, prefault_nb_bailout, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_prefault_nb_bailout, "");
3092 
#if defined (__x86_64__)
/* x86-only free-page clump allocator knob and statistics. */
extern unsigned int vm_clump_promote_threshold;
SYSCTL_UINT(_vm, OID_AUTO, vm_clump_promote_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_clump_promote_threshold, 0, "clump size threshold for promotes");
#if DEVELOPMENT || DEBUG
/* Per-clump-size allocation histogram; index N = clump of N pages. */
extern unsigned long vm_clump_stats[];
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[1], "free page allocations from clump of 1 page");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[2], "free page allocations from clump of 2 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[3], "free page allocations from clump of 3 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats4, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[4], "free page allocations from clump of 4 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats5, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[5], "free page allocations from clump of 5 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats6, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[6], "free page allocations from clump of 6 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats7, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[7], "free page allocations from clump of 7 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats8, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[8], "free page allocations from clump of 8 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats9, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[9], "free page allocations from clump of 9 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats10, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[10], "free page allocations from clump of 10 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats11, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[11], "free page allocations from clump of 11 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats12, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[12], "free page allocations from clump of 12 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats13, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[13], "free page allocations from clump of 13 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats14, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[14], "free page allocations from clump of 14 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats15, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[15], "free page allocations from clump of 15 pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_stats16, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_stats[16], "free page allocations from clump of 16 pages");
extern unsigned long vm_clump_allocs, vm_clump_inserts, vm_clump_inrange, vm_clump_promotes;
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_alloc, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_allocs, "free page allocations");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inserts, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inserts, "free page insertions");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_inrange, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_inrange, "free page insertions that are part of vm_pages");
SYSCTL_LONG(_vm, OID_AUTO, vm_clump_promotes, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_clump_promotes, "pages promoted to head");
#endif  /* if DEVELOPMENT || DEBUG */
#endif  /* #if defined (__x86_64__) */
3121 
#if CONFIG_SECLUDED_MEMORY

/* Read-only counters for the secluded-memory page pool. */
SYSCTL_UINT(_vm, OID_AUTO, num_tasks_can_use_secluded_mem, CTLFLAG_RD | CTLFLAG_LOCKED, &num_tasks_can_use_secluded_mem, 0, "");
extern unsigned int vm_page_secluded_target;
extern unsigned int vm_page_secluded_count;
extern unsigned int vm_page_secluded_count_free;
extern unsigned int vm_page_secluded_count_inuse;
extern unsigned int vm_page_secluded_count_over_target;
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_target, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_free, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_inuse, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_inuse, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_count_over_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded_count_over_target, 0, "");

/* Grab success/failure breakdown from vm_page_secluded. */
extern struct vm_page_secluded_data vm_page_secluded;
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_eligible, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.eligible_for_secluded, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_free, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_free, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_success_other, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_success_other, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_locked, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_locked, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_state, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_state, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_failure_dirty, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");

#endif /* CONFIG_SECLUDED_MEMORY */
3147 
3148 #include <kern/thread.h>
3149 #include <sys/user.h>
3150 
void vm_pageout_io_throttle(void); /* forward declaration; defined just below */
3152 
3153 void
vm_pageout_io_throttle(void)3154 vm_pageout_io_throttle(void)
3155 {
3156 	struct uthread *uthread = current_uthread();
3157 
3158 	/*
3159 	 * thread is marked as a low priority I/O type
3160 	 * and the I/O we issued while in this cleaning operation
3161 	 * collided with normal I/O operations... we'll
3162 	 * delay in order to mitigate the impact of this
3163 	 * task on the normal operation of the system
3164 	 */
3165 
3166 	if (uthread->uu_lowpri_window) {
3167 		throttle_lowpri_io(1);
3168 	}
3169 }
3170 
3171 int
vm_pressure_monitor(__unused struct proc * p,struct vm_pressure_monitor_args * uap,int * retval)3172 vm_pressure_monitor(
3173 	__unused struct proc *p,
3174 	struct vm_pressure_monitor_args *uap,
3175 	int *retval)
3176 {
3177 	kern_return_t   kr;
3178 	uint32_t        pages_reclaimed;
3179 	uint32_t        pages_wanted;
3180 
3181 	kr = mach_vm_pressure_monitor(
3182 		(boolean_t) uap->wait_for_pressure,
3183 		uap->nsecs_monitored,
3184 		(uap->pages_reclaimed) ? &pages_reclaimed : NULL,
3185 		&pages_wanted);
3186 
3187 	switch (kr) {
3188 	case KERN_SUCCESS:
3189 		break;
3190 	case KERN_ABORTED:
3191 		return EINTR;
3192 	default:
3193 		return EINVAL;
3194 	}
3195 
3196 	if (uap->pages_reclaimed) {
3197 		if (copyout((void *)&pages_reclaimed,
3198 		    uap->pages_reclaimed,
3199 		    sizeof(pages_reclaimed)) != 0) {
3200 			return EFAULT;
3201 		}
3202 	}
3203 
3204 	*retval = (int) pages_wanted;
3205 	return 0;
3206 }
3207 
3208 int
kas_info(struct proc * p,struct kas_info_args * uap,int * retval __unused)3209 kas_info(struct proc *p,
3210     struct kas_info_args *uap,
3211     int *retval __unused)
3212 {
3213 #ifndef CONFIG_KAS_INFO
3214 	(void)p;
3215 	(void)uap;
3216 	return ENOTSUP;
3217 #else /* CONFIG_KAS_INFO */
3218 	int                     selector = uap->selector;
3219 	user_addr_t     valuep = uap->value;
3220 	user_addr_t     sizep = uap->size;
3221 	user_size_t size, rsize;
3222 	int                     error;
3223 
3224 	if (!kauth_cred_issuser(kauth_cred_get())) {
3225 		return EPERM;
3226 	}
3227 
3228 #if CONFIG_MACF
3229 	error = mac_system_check_kas_info(kauth_cred_get(), selector);
3230 	if (error) {
3231 		return error;
3232 	}
3233 #endif
3234 
3235 	if (IS_64BIT_PROCESS(p)) {
3236 		user64_size_t size64;
3237 		error = copyin(sizep, &size64, sizeof(size64));
3238 		size = (user_size_t)size64;
3239 	} else {
3240 		user32_size_t size32;
3241 		error = copyin(sizep, &size32, sizeof(size32));
3242 		size = (user_size_t)size32;
3243 	}
3244 	if (error) {
3245 		return error;
3246 	}
3247 
3248 	switch (selector) {
3249 	case KAS_INFO_KERNEL_TEXT_SLIDE_SELECTOR:
3250 	{
3251 		uint64_t slide = vm_kernel_slide;
3252 
3253 		if (sizeof(slide) != size) {
3254 			return EINVAL;
3255 		}
3256 
3257 		error = copyout(&slide, valuep, sizeof(slide));
3258 		if (error) {
3259 			return error;
3260 		}
3261 		rsize = size;
3262 	}
3263 	break;
3264 	case KAS_INFO_KERNEL_SEGMENT_VMADDR_SELECTOR:
3265 	{
3266 		uint32_t i;
3267 		kernel_mach_header_t *mh = &_mh_execute_header;
3268 		struct load_command *cmd;
3269 		cmd = (struct load_command*) &mh[1];
3270 		uint64_t *bases;
3271 		rsize = mh->ncmds * sizeof(uint64_t);
3272 
3273 		/*
3274 		 * Return the size if no data was passed
3275 		 */
3276 		if (valuep == 0) {
3277 			break;
3278 		}
3279 
3280 		if (rsize > size) {
3281 			return EINVAL;
3282 		}
3283 
3284 		bases = kalloc_data(rsize, Z_WAITOK | Z_ZERO);
3285 
3286 		for (i = 0; i < mh->ncmds; i++) {
3287 			if (cmd->cmd == LC_SEGMENT_KERNEL) {
3288 				__IGNORE_WCASTALIGN(kernel_segment_command_t * sg = (kernel_segment_command_t *) cmd);
3289 				bases[i] = (uint64_t)sg->vmaddr;
3290 			}
3291 			cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
3292 		}
3293 
3294 		error = copyout(bases, valuep, rsize);
3295 
3296 		kfree_data(bases, rsize);
3297 
3298 		if (error) {
3299 			return error;
3300 		}
3301 	}
3302 	break;
3303 	default:
3304 		return EINVAL;
3305 	}
3306 
3307 	if (IS_64BIT_PROCESS(p)) {
3308 		user64_size_t size64 = (user64_size_t)rsize;
3309 		error = copyout(&size64, sizep, sizeof(size64));
3310 	} else {
3311 		user32_size_t size32 = (user32_size_t)rsize;
3312 		error = copyout(&size32, sizep, sizeof(size32));
3313 	}
3314 
3315 	return error;
3316 #endif /* CONFIG_KAS_INFO */
3317 }
3318 
3319 #if __has_feature(ptrauth_calls)
3320 /*
3321  * Generate a random pointer signing key that isn't 0.
3322  */
/*
 * Generate a random pointer signing key that isn't 0.
 */
uint64_t
generate_jop_key(void)
{
	uint64_t key = 0;

	/* keep drawing 64-bit random values until one is non-zero */
	while (key == 0) {
		read_random(&key, sizeof(key));
	}
	return key;
}
3333 #endif /* __has_feature(ptrauth_calls) */
3334 
3335 
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
#pragma clang diagnostic ignored "-Wunused-function"

/*
 * Compile-time checks that the kernel address bounds are the size the
 * SYSCTL_ULONG declarations below assume (unsigned long).  The
 * function is never called; it only hosts the static_asserts.
 */
static void
asserts()
{
	static_assert(sizeof(vm_min_kernel_address) == sizeof(unsigned long));
	static_assert(sizeof(vm_max_kernel_address) == sizeof(unsigned long));
}

SYSCTL_ULONG(_vm, OID_AUTO, vm_min_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_min_kernel_address, "");
SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) &vm_max_kernel_address, "");
#pragma clang diagnostic pop
3350 
/* Read-only page counters (skipped busy/absent pages, tainted UPL/IOPL pages). */
extern uint32_t vm_page_pages;
SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");

extern uint32_t vm_page_busy_absent_skipped;
SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");

extern uint32_t vm_page_upl_tainted;
SYSCTL_UINT(_vm, OID_AUTO, upl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_upl_tainted, 0, "");

extern uint32_t vm_page_iopl_tainted;
SYSCTL_UINT(_vm, OID_AUTO, iopl_pages_tainted, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_iopl_tainted, 0, "");

#if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG)
/* RW gate for whether footprint suspension is permitted (arm DEBUG only). */
extern int vm_footprint_suspend_allowed;
SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");

extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
3368 static int
3369 sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
3370 {
3371 #pragma unused(oidp, arg1, arg2)
3372 	int error = 0;
3373 	int new_value;
3374 
3375 	if (req->newptr == USER_ADDR_NULL) {
3376 		return 0;
3377 	}
3378 	error = SYSCTL_IN(req, &new_value, sizeof(int));
3379 	if (error) {
3380 		return error;
3381 	}
3382 	if (!vm_footprint_suspend_allowed) {
3383 		if (new_value != 0) {
3384 			/* suspends are not allowed... */
3385 			return 0;
3386 		}
3387 		/* ... but let resumes proceed */
3388 	}
3389 	DTRACE_VM2(footprint_suspend,
3390 	    vm_map_t, current_map(),
3391 	    int, new_value);
3392 
3393 	pmap_footprint_suspend(current_map(), new_value);
3394 
3395 	return 0;
3396 }
/* vm.footprint_suspend: write-only, any user, handled above. */
SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_vm_footprint_suspend, "I", "");
#endif /* (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) */
3401 
/* Corpse footprint collection statistics. */
extern uint64_t vm_map_corpse_footprint_count;
extern uint64_t vm_map_corpse_footprint_size_avg;
extern uint64_t vm_map_corpse_footprint_size_max;
extern uint64_t vm_map_corpse_footprint_full;
extern uint64_t vm_map_corpse_footprint_no_buf;
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");


/* Shared-region pager statistics and the region destroy delay knob (RW). */
extern uint64_t shared_region_pager_copied;
extern uint64_t shared_region_pager_slid;
extern uint64_t shared_region_pager_slid_error;
extern uint64_t shared_region_pager_reclaimed;
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
extern int shared_region_destroy_delay;
SYSCTL_INT(_vm, OID_AUTO, shared_region_destroy_delay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_destroy_delay, 0, "");
3434 
#if MACH_ASSERT
/* RW leeway for pmap ledger panic checks (MACH_ASSERT kernels only). */
extern int pmap_ledgers_panic_leeway;
SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
#endif /* MACH_ASSERT */
3439 
3440 
/* vm_map_lookup_locked() copy-strategy statistics (slowly / strategically / shadow). */
extern uint64_t vm_map_lookup_locked_copy_slowly_count;
extern uint64_t vm_map_lookup_locked_copy_slowly_size;
extern uint64_t vm_map_lookup_locked_copy_slowly_max;
extern uint64_t vm_map_lookup_locked_copy_slowly_restart;
extern uint64_t vm_map_lookup_locked_copy_slowly_error;
extern uint64_t vm_map_lookup_locked_copy_strategically_count;
extern uint64_t vm_map_lookup_locked_copy_strategically_size;
extern uint64_t vm_map_lookup_locked_copy_strategically_max;
extern uint64_t vm_map_lookup_locked_copy_strategically_restart;
extern uint64_t vm_map_lookup_locked_copy_strategically_error;
extern uint64_t vm_map_lookup_locked_copy_shadow_count;
extern uint64_t vm_map_lookup_locked_copy_shadow_size;
extern uint64_t vm_map_lookup_locked_copy_shadow_max;
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_restart,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_restart, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_slowly_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_slowly_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_max, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_restart,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_restart, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_strategically_error,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_strategically_error, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_size, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_lookup_locked_copy_shadow_max,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_lookup_locked_copy_shadow_max, "");

/* RW policy switch plus copy-on-read counter. */
extern int vm_protect_privileged_from_untrusted;
SYSCTL_INT(_vm, OID_AUTO, protect_privileged_from_untrusted,
    CTLFLAG_RW | CTLFLAG_LOCKED, &vm_protect_privileged_from_untrusted, 0, "");
extern uint64_t vm_copied_on_read;
SYSCTL_QUAD(_vm, OID_AUTO, copied_on_read,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_copied_on_read, "");

/* Shared-region object counts (current and peak). */
extern int vm_shared_region_count;
extern int vm_shared_region_peak;
SYSCTL_INT(_vm, OID_AUTO, shared_region_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_count, 0, "");
SYSCTL_INT(_vm, OID_AUTO, shared_region_peak,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_peak, 0, "");
#if DEVELOPMENT || DEBUG
/* Shared-region pager residency statistics (DEVELOPMENT/DEBUG only). */
extern unsigned int shared_region_pagers_resident_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_count, 0, "");
extern unsigned int shared_region_pagers_resident_peak;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pagers_resident_peak,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pagers_resident_peak, 0, "");
extern int shared_region_pager_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_pager_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_count, 0, "");
#if __has_feature(ptrauth_calls)
/* Pointer-authentication key / reslide counters (arm64e only). */
extern int shared_region_key_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_key_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_key_count, 0, "");
extern int vm_shared_region_reslide_count;
SYSCTL_INT(_vm, OID_AUTO, shared_region_reslide_count,
    CTLFLAG_RD | CTLFLAG_LOCKED, &vm_shared_region_reslide_count, 0, "");
#endif /* __has_feature(ptrauth_calls) */
#endif /* DEVELOPMENT || DEBUG */
3513 
#if MACH_ASSERT
/* RW debug knobs for 4K-page ("debug4k") diagnostics (MACH_ASSERT only). */
extern int debug4k_filter;
SYSCTL_INT(_vm, OID_AUTO, debug4k_filter, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_filter, 0, "");
extern int debug4k_panic_on_terminate;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_terminate, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_terminate, 0, "");
extern int debug4k_panic_on_exception;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_exception, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_exception, 0, "");
extern int debug4k_panic_on_misaligned_sharing;
SYSCTL_INT(_vm, OID_AUTO, debug4k_panic_on_misaligned_sharing, CTLFLAG_RW | CTLFLAG_LOCKED, &debug4k_panic_on_misaligned_sharing, 0, "");
#endif /* MACH_ASSERT */
3524 
/* Counters for RLIMIT_AS / RLIMIT_DATA map-limit enforcement. */
extern uint64_t vm_map_set_size_limit_count;
extern uint64_t vm_map_set_data_limit_count;
extern uint64_t vm_map_enter_RLIMIT_AS_count;
extern uint64_t vm_map_enter_RLIMIT_DATA_count;
SYSCTL_QUAD(_vm, OID_AUTO, map_set_size_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_size_limit_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_set_data_limit_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_set_data_limit_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_AS_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_AS_count, "");
SYSCTL_QUAD(_vm, OID_AUTO, map_enter_RLIMIT_DATA_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_enter_RLIMIT_DATA_count, "");

/* Counters tracking the resilient-media fault path outcomes. */
extern uint64_t vm_fault_resilient_media_initiate;
extern uint64_t vm_fault_resilient_media_retry;
extern uint64_t vm_fault_resilient_media_proceed;
extern uint64_t vm_fault_resilient_media_release;
extern uint64_t vm_fault_resilient_media_abort1;
extern uint64_t vm_fault_resilient_media_abort2;
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_initiate, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_initiate, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_retry, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_retry, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_proceed, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_proceed, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_release, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_release, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort1, "");
SYSCTL_QUAD(_vm, OID_AUTO, fault_resilient_media_abort2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_abort2, "");
#if MACH_ASSERT
/*
 * Error-injection controls for testing the resilient-media fault paths,
 * MACH_ASSERT kernels only.  Each "*_rate" knob is writable and sets how
 * often the corresponding error is injected; the matching read-only
 * counter reports how many times that injection has fired.
 */
extern int vm_fault_resilient_media_inject_error1_rate;
extern int vm_fault_resilient_media_inject_error1;
extern int vm_fault_resilient_media_inject_error2_rate;
extern int vm_fault_resilient_media_inject_error2;
extern int vm_fault_resilient_media_inject_error3_rate;
extern int vm_fault_resilient_media_inject_error3;
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error1, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error1, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error2, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error2, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3_rate, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3_rate, 0, "");
SYSCTL_INT(_vm, OID_AUTO, fault_resilient_media_inject_error3, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_fault_resilient_media_inject_error3, 0, "");
#endif /* MACH_ASSERT */
3560 
3561 /*
3562  * A sysctl which causes all existing shared regions to become stale. They
3563  * will no longer be used by anything new and will be torn down as soon as
3564  * the last existing user exits. A write of non-zero value causes that to happen.
3565  * This should only be used by launchd, so we check that this is initproc.
3566  */
3567 static int
shared_region_pivot(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)3568 shared_region_pivot(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3569 {
3570 	unsigned int value = 0;
3571 	int changed = 0;
3572 	int error = sysctl_io_number(req, 0, sizeof(value), &value, &changed);
3573 	if (error || !changed) {
3574 		return error;
3575 	}
3576 	if (current_proc() != initproc) {
3577 		return EPERM;
3578 	}
3579 
3580 	vm_shared_region_pivot();
3581 
3582 	return 0;
3583 }
3584 
/* write-only (CTLFLAG_WR) entry point dispatching to shared_region_pivot() */
SYSCTL_PROC(_vm, OID_AUTO, shared_region_pivot,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, 0, shared_region_pivot, "I", "");
3588 
/* read-only running total of detected text-page corruptions */
SYSCTL_INT(_vm, OID_AUTO, vmtc_total, CTLFLAG_RD | CTLFLAG_LOCKED,
    &vmtc_total, 0, "total text page corruptions detected");
3591 
3592 /*
3593  * sysctl to return the number of pages on retired_pages_object
3594  */
3595 static int
3596 retired_pages_count SYSCTL_HANDLER_ARGS
3597 {
3598 #pragma unused(arg1, arg2, oidp)
3599 	extern uint32_t vm_retired_pages_count(void);
3600 	uint32_t value = vm_retired_pages_count();
3601 
3602 	return SYSCTL_OUT(req, &value, sizeof(value));
3603 }
3604 SYSCTL_PROC(_vm, OID_AUTO, retired_pages_count, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
3605     0, 0, &retired_pages_count, "I", "");
3606 
#if DEBUG || DEVELOPMENT
/*
 * A sysctl that can be used to corrupt a text page with an illegal instruction.
 * Used for testing text page self healing.
 */
extern kern_return_t vm_corrupt_text_addr(uintptr_t);
static int
corrupt_text_addr(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	uint64_t addr = 0;
	int err;

	/* pull in the 64-bit address being written, if any */
	err = sysctl_handle_quad(oidp, &addr, 0, req);
	if (err != 0 || !req->newptr) {
		/* error, or a plain read: nothing to corrupt */
		return err;
	}

	return (vm_corrupt_text_addr((uintptr_t)addr) == KERN_SUCCESS) ? 0 : EINVAL;
}

SYSCTL_PROC(_vm, OID_AUTO, corrupt_text_addr,
    CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, corrupt_text_addr, "-", "");
#endif /* DEBUG || DEVELOPMENT */
3633 
/*
 * Compressor segment-fill statistics: how often a segment was filled
 * with vs. without lock contention, and the longest contention seen
 * (seconds + nanoseconds, tracked separately).  Counters live in the
 * compressor code (vm_compressor.c — confirm).
 */
extern uint64_t c_seg_filled_no_contention;
extern uint64_t c_seg_filled_contention;
extern clock_sec_t c_seg_filled_contention_sec_max;
extern clock_nsec_t c_seg_filled_contention_nsec_max;
SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_no_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_no_contention, "");
SYSCTL_QUAD(_vm, OID_AUTO, c_seg_filled_contention, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention, "");
SYSCTL_ULONG(_vm, OID_AUTO, c_seg_filled_contention_sec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_sec_max, "");
SYSCTL_UINT(_vm, OID_AUTO, c_seg_filled_contention_nsec_max, CTLFLAG_RD | CTLFLAG_LOCKED, &c_seg_filled_contention_nsec_max, 0, "");
#if (XNU_TARGET_OS_OSX && __arm64__)
/*
 * Major-compaction instrumentation (macOS on arm64 only): two writable
 * tuning knobs followed by read-only report counters and maxima.
 */
extern clock_nsec_t c_process_major_report_over_ms; /* report if over ? ms */
extern int c_process_major_yield_after; /* yield after moving ? segments */
extern uint64_t c_process_major_reports;
extern clock_sec_t c_process_major_max_sec;
extern clock_nsec_t c_process_major_max_nsec;
extern uint32_t c_process_major_peak_segcount;
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_report_over_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_report_over_ms, 0, "");
SYSCTL_INT(_vm, OID_AUTO, c_process_major_yield_after, CTLFLAG_RW | CTLFLAG_LOCKED, &c_process_major_yield_after, 0, "");
SYSCTL_QUAD(_vm, OID_AUTO, c_process_major_reports, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_reports, "");
SYSCTL_ULONG(_vm, OID_AUTO, c_process_major_max_sec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_sec, "");
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_max_nsec, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_max_nsec, 0, "");
SYSCTL_UINT(_vm, OID_AUTO, c_process_major_peak_segcount, CTLFLAG_RD | CTLFLAG_LOCKED, &c_process_major_peak_segcount, 0, "");
#endif /* (XNU_TARGET_OS_OSX && __arm64__) */
3656