xref: /xnu-11417.121.6/osfmk/vm/vm_map.c (revision a1e26a70f38d1d7daa7b49b258e2f8538ad81650)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 #include <kern/telemetry.h>
91 #include <kern/trap_telemetry.h>
92 
93 #include <vm/cpm_internal.h>
94 #include <vm/memory_types.h>
95 #include <vm/vm_compressor_xnu.h>
96 #include <vm/vm_compressor_pager_internal.h>
97 #include <vm/vm_init_xnu.h>
98 #include <vm/vm_fault_internal.h>
99 #include <vm/vm_map_internal.h>
100 #include <vm/vm_object_internal.h>
101 #include <vm/vm_page_internal.h>
102 #include <vm/vm_pageout.h>
103 #include <vm/pmap.h>
104 #include <vm/vm_kern_internal.h>
105 #include <ipc/ipc_port.h>
106 #include <kern/sched_prim.h>
107 #include <kern/misc_protos.h>
108 
109 #include <mach/vm_map_server.h>
110 #include <mach/mach_host_server.h>
111 #include <vm/vm_memtag.h>
112 #include <vm/vm_protos_internal.h>
113 #include <vm/vm_purgeable_internal.h>
114 
115 #include <vm/vm_iokit.h>
116 #include <vm/vm_shared_region_internal.h>
117 #include <vm/vm_map_store_internal.h>
118 #include <vm/vm_memory_entry_xnu.h>
119 #include <vm/memory_object_internal.h>
120 #include <vm/vm_memory_entry.h>
121 #include <vm/vm_sanitize_internal.h>
122 #include <vm/vm_reclaim_xnu.h>
123 #if DEVELOPMENT || DEBUG
124 #include <vm/vm_compressor_info.h>
125 #endif /* DEVELOPMENT || DEBUG */
126 #include <san/kasan.h>
127 
128 #include <sys/resource.h>
129 #include <sys/random.h>
130 #include <sys/codesign.h>
131 #include <sys/code_signing.h>
132 #include <sys/mman.h>
133 #include <sys/reboot.h>
134 #include <sys/kdebug_triage.h>
135 #include <sys/reason.h>
136 
137 #include <os/log.h>
138 
139 #include <libkern/section_keywords.h>
140 
141 #include <os/hash.h>
142 
143 #if DEVELOPMENT || DEBUG
144 extern int proc_selfcsflags(void);
145 int vm_log_xnu_user_debug = 0;
146 int panic_on_unsigned_execute = 0;
147 int panic_on_mlock_failure = 0;
148 #endif /* DEVELOPMENT || DEBUG */
149 
150 #if DEVELOPMENT || DEBUG
151 int debug4k_filter = 0;
152 char debug4k_proc_name[1024] = "";
153 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
154 int debug4k_panic_on_misaligned_sharing = 0;
155 const char *debug4k_category_name[] = {
156 	"error",        /* 0 */
157 	"life",         /* 1 */
158 	"load",         /* 2 */
159 	"fault",        /* 3 */
160 	"copy",         /* 4 */
161 	"share",        /* 5 */
162 	"adjust",       /* 6 */
163 	"pmap",         /* 7 */
164 	"mementry",     /* 8 */
165 	"iokit",        /* 9 */
166 	"upl",          /* 10 */
167 	"exc",          /* 11 */
168 	"vfs"           /* 12 */
169 };
170 #endif /* DEVELOPMENT || DEBUG */
171 int debug4k_no_cow_copyin = 0;
172 
173 
174 #if __arm64__
175 extern const int fourk_binary_compatibility_unsafe;
176 #endif /* __arm64__ */
177 extern int proc_selfpid(void);
178 extern char *proc_name_address(void *p);
179 extern const char *proc_best_name(struct proc *p);
180 
181 #if VM_MAP_DEBUG_APPLE_PROTECT
182 int vm_map_debug_apple_protect = 0;
183 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
184 #if VM_MAP_DEBUG_FOURK
185 int vm_map_debug_fourk = 0;
186 #endif /* VM_MAP_DEBUG_FOURK */
187 
188 #if DEBUG || DEVELOPMENT
189 static TUNABLE(bool, vm_map_executable_immutable,
190     "vm_map_executable_immutable", true);
191 #else
192 #define vm_map_executable_immutable true
193 #endif
194 
195 /** Do not enforce the kernel allocation size limit */
196 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS (0)
197 /** Enforce the kernel allocation limit by refusing too large requests */
198 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT (1)
199 /** Enforce the kernel allocation limit by panicking on any too large request */
200 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC (2)
201 /** Do not enforce the kernel allocation limit but generate a telemetry trap */
202 #define VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP (3)
203 
204 #if DEVELOPMENT || DEBUG
205 static TUNABLE(int, vm_map_kernel_alloc_limit_mode,
206     "vm_map_kernel_alloc_limit_mode", VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP);
207 #else
208 #define vm_map_kernel_alloc_limit_mode VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS
209 #endif /* DEVELOPMENT || DEBUG */
210 
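/*
 * Illustrative sketch (not the actual enforcement path) of how an oversized
 * kernel allocation request could be dispatched on the limit modes above.
 * The helper name is hypothetical; the behavior per mode simply mirrors the
 * documentation comments on the VM_MAP_KERNEL_ALLOC_LIMIT_MODE_* values.
 */
#if 0 /* sketch only, not compiled */
static kern_return_t
vm_map_kernel_alloc_limit_sketch(vm_map_size_t size, vm_map_size_t limit)
{
	if (size <= limit) {
		return KERN_SUCCESS;
	}
	switch (vm_map_kernel_alloc_limit_mode) {
	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS:
		return KERN_SUCCESS;            /* limit not enforced */
	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT:
		return KERN_NO_SPACE;           /* refuse the oversized request */
	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC:
		panic("kernel allocation of %llu bytes exceeds limit %llu",
		    (uint64_t)size, (uint64_t)limit);
	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP:
	default:
		/* allow the request, but emit a telemetry trap (hook not shown) */
		return KERN_SUCCESS;
	}
}
#endif
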
211 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
212 
213 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
214 /* Internal prototypes
215  */
216 
217 typedef struct vm_map_zap {
218 	vm_map_entry_t          vmz_head;
219 	vm_map_entry_t         *vmz_tail;
220 } *vm_map_zap_t;
221 
222 #define VM_MAP_ZAP_DECLARE(zap) \
223 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
224 
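/*
 * Minimal sketch of the intended zap-list usage: a caller declares a zap
 * list on the stack, passes it to vm_map_delete() (declared below), and the
 * entries removed from the map are collected on the list so they can be
 * disposed of after the map lock is dropped.  The wrapper function and its
 * parameters are hypothetical; only the macro and vm_map_delete() are real.
 */
#if 0 /* sketch only, not compiled */
static void
vm_map_zap_usage_sketch(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vmr_flags_t     flags,
	kmem_guard_t    guard)
{
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);
	(void)vm_map_delete(map, start, end, flags, guard, &zap);
	vm_map_unlock(map);

	/* entries collected on "zap" would be freed here, outside the lock */
}
#endif
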
225 extern kern_return_t vm_map_wire_external(
226 	vm_map_t                map,
227 	vm_map_offset_ut        start_u,
228 	vm_map_offset_ut        end_u,
229 	vm_prot_ut              prot_u,
230 	boolean_t               user_wire) __exported;
231 
232 #if XNU_PLATFORM_MacOSX
233 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
234 #else
235 static
236 #endif
237 kern_return_t vm_map_copyin_common(
238 	vm_map_t                src_map,
239 	vm_map_address_ut       src_addr,
240 	vm_map_size_ut          len,
241 	boolean_t               src_destroy,
242 	boolean_t               src_volatile,
243 	vm_map_copy_t          *copy_result,                           /* OUT */
244 	boolean_t               use_maxprot);
245 
246 static vm_map_entry_t   vm_map_entry_insert(
247 	vm_map_t                map,
248 	vm_map_entry_t          insp_entry,
249 	vm_map_offset_t         start,
250 	vm_map_offset_t         end,
251 	vm_object_t             object,
252 	vm_object_offset_t      offset,
253 	vm_map_kernel_flags_t   vmk_flags,
254 	boolean_t               needs_copy,
255 	vm_prot_t               cur_protection,
256 	vm_prot_t               max_protection,
257 	vm_inherit_t            inheritance,
258 	boolean_t               clear_map_aligned);
259 
260 static void vm_map_simplify_range(
261 	vm_map_t        map,
262 	vm_map_offset_t start,
263 	vm_map_offset_t end);   /* forward */
264 
265 static boolean_t        vm_map_range_check(
266 	vm_map_t        map,
267 	vm_map_offset_t start,
268 	vm_map_offset_t end,
269 	vm_map_entry_t  *entry);
270 
271 static void vm_map_submap_pmap_clean(
272 	vm_map_t        map,
273 	vm_map_offset_t start,
274 	vm_map_offset_t end,
275 	vm_map_t        sub_map,
276 	vm_map_offset_t offset);
277 
278 static void             vm_map_pmap_enter(
279 	vm_map_t                map,
280 	vm_map_offset_t         addr,
281 	vm_map_offset_t         end_addr,
282 	vm_object_t             object,
283 	vm_object_offset_t      offset,
284 	vm_prot_t               protection);
285 
286 static void             _vm_map_clip_end(
287 	struct vm_map_header    *map_header,
288 	vm_map_entry_t          entry,
289 	vm_map_offset_t         end);
290 
291 static void             _vm_map_clip_start(
292 	struct vm_map_header    *map_header,
293 	vm_map_entry_t          entry,
294 	vm_map_offset_t         start);
295 
296 static kmem_return_t vm_map_delete(
297 	vm_map_t        map,
298 	vm_map_offset_t start,
299 	vm_map_offset_t end,
300 	vmr_flags_t     flags,
301 	kmem_guard_t    guard,
302 	vm_map_zap_t    zap);
303 
304 static void             vm_map_copy_insert(
305 	vm_map_t        map,
306 	vm_map_entry_t  after_where,
307 	vm_map_copy_t   copy);
308 
309 static kern_return_t    vm_map_copy_overwrite_unaligned(
310 	vm_map_t        dst_map,
311 	vm_map_entry_t  entry,
312 	vm_map_copy_t   copy,
313 	vm_map_address_t start,
314 	boolean_t       discard_on_success);
315 
316 static kern_return_t    vm_map_copy_overwrite_aligned(
317 	vm_map_t        dst_map,
318 	vm_map_entry_t  tmp_entry,
319 	vm_map_copy_t   copy,
320 	vm_map_offset_t start,
321 	pmap_t          pmap);
322 
323 static kern_return_t    vm_map_copyin_kernel_buffer(
324 	vm_map_t        src_map,
325 	vm_map_address_t src_addr,
326 	vm_map_size_t   len,
327 	boolean_t       src_destroy,
328 	vm_map_copy_t   *copy_result);  /* OUT */
329 
330 static kern_return_t    vm_map_copyout_kernel_buffer(
331 	vm_map_t        map,
332 	vm_map_address_t *addr, /* IN/OUT */
333 	vm_map_copy_t   copy,
334 	vm_map_size_t   copy_size,
335 	boolean_t       overwrite,
336 	boolean_t       consume_on_success);
337 
338 static void             vm_map_fork_share(
339 	vm_map_t        old_map,
340 	vm_map_entry_t  old_entry,
341 	vm_map_t        new_map);
342 
343 static boolean_t        vm_map_fork_copy(
344 	vm_map_t        old_map,
345 	vm_map_entry_t  *old_entry_p,
346 	vm_map_t        new_map,
347 	int             vm_map_copyin_flags);
348 
349 static kern_return_t    vm_map_wire_nested(
350 	vm_map_t                   map,
351 	vm_map_offset_t            start,
352 	vm_map_offset_t            end,
353 	vm_prot_t                  caller_prot,
354 	vm_tag_t                   tag,
355 	boolean_t                  user_wire,
356 	pmap_t                     map_pmap,
357 	vm_map_offset_t            pmap_addr,
358 	ppnum_t                   *physpage_p);
359 
360 static kern_return_t    vm_map_unwire_nested(
361 	vm_map_t                   map,
362 	vm_map_offset_t            start,
363 	vm_map_offset_t            end,
364 	boolean_t                  user_wire,
365 	pmap_t                     map_pmap,
366 	vm_map_offset_t            pmap_addr);
367 
368 static kern_return_t    vm_map_overwrite_submap_recurse(
369 	vm_map_t                   dst_map,
370 	vm_map_offset_t            dst_addr,
371 	vm_map_size_t              dst_size);
372 
373 static kern_return_t    vm_map_copy_overwrite_nested(
374 	vm_map_t                   dst_map,
375 	vm_map_offset_t            dst_addr,
376 	vm_map_copy_t              copy,
377 	boolean_t                  interruptible,
378 	pmap_t                     pmap,
379 	boolean_t                  discard_on_success);
380 
381 static kern_return_t    vm_map_remap_extract(
382 	vm_map_t                map,
383 	vm_map_offset_t         addr,
384 	vm_map_size_t           size,
385 	boolean_t               copy,
386 	vm_map_copy_t           map_copy,
387 	vm_prot_t               *cur_protection,
388 	vm_prot_t               *max_protection,
389 	vm_inherit_t            inheritance,
390 	vm_map_kernel_flags_t   vmk_flags);
391 
392 static void             vm_map_region_look_for_page(
393 	vm_map_t                   map,
394 	vm_map_offset_t            va,
395 	vm_object_t                object,
396 	vm_object_offset_t         offset,
397 	int                        max_refcnt,
398 	unsigned short             depth,
399 	vm_region_extended_info_t  extended,
400 	mach_msg_type_number_t count);
401 
402 static boolean_t        vm_map_region_has_obj_ref(
403 	vm_map_entry_t             entry,
404 	vm_object_t                object);
405 
406 
407 static kern_return_t    vm_map_willneed(
408 	vm_map_t        map,
409 	vm_map_offset_t start,
410 	vm_map_offset_t end);
411 
412 static kern_return_t    vm_map_reuse_pages(
413 	vm_map_t        map,
414 	vm_map_offset_t start,
415 	vm_map_offset_t end);
416 
417 static kern_return_t    vm_map_reusable_pages(
418 	vm_map_t        map,
419 	vm_map_offset_t start,
420 	vm_map_offset_t end);
421 
422 static kern_return_t    vm_map_can_reuse(
423 	vm_map_t        map,
424 	vm_map_offset_t start,
425 	vm_map_offset_t end);
426 
427 static kern_return_t    vm_map_zero(
428 	vm_map_t        map,
429 	vm_map_offset_t start,
430 	vm_map_offset_t end);
431 
432 static kern_return_t    vm_map_random_address_for_size(
433 	vm_map_t                map,
434 	vm_map_offset_t        *address,
435 	vm_map_size_t           size,
436 	vm_map_kernel_flags_t   vmk_flags);
437 
438 
439 #if CONFIG_MAP_RANGES
440 
441 static vm_map_range_id_t vm_map_user_range_resolve(
442 	vm_map_t                map,
443 	mach_vm_address_t       addr,
444 	mach_vm_address_t       size,
445 	mach_vm_range_t         range);
446 
447 #endif /* CONFIG_MAP_RANGES */
448 #if MACH_ASSERT
449 static kern_return_t    vm_map_pageout(
450 	vm_map_t        map,
451 	vm_map_offset_t start,
452 	vm_map_offset_t end);
453 #endif /* MACH_ASSERT */
454 
455 kern_return_t vm_map_corpse_footprint_collect(
456 	vm_map_t        old_map,
457 	vm_map_entry_t  old_entry,
458 	vm_map_t        new_map);
459 void vm_map_corpse_footprint_collect_done(
460 	vm_map_t        new_map);
461 void vm_map_corpse_footprint_destroy(
462 	vm_map_t        map);
463 kern_return_t vm_map_corpse_footprint_query_page_info(
464 	vm_map_t        map,
465 	vm_map_offset_t va,
466 	int             *disposition_p);
467 void vm_map_footprint_query_page_info(
468 	vm_map_t        map,
469 	vm_map_entry_t  map_entry,
470 	vm_map_offset_t curr_s_offset,
471 	int             *disposition_p);
472 
473 #if CONFIG_MAP_RANGES
474 static void vm_map_range_map_init(void);
475 #endif /* CONFIG_MAP_RANGES */
476 
477 pid_t find_largest_process_vm_map_entries(void);
478 
479 __attribute__((always_inline))
480 int
481 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
482 {
483 	int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
484 
485 	/* in vmk flags the meaning of fixed/anywhere is inverted */
486 	return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
487 }
488 
489 __attribute__((always_inline, overloadable))
490 void
491 vm_map_kernel_flags_set_vmflags(
492 	vm_map_kernel_flags_t  *vmk_flags,
493 	int                     vm_flags,
494 	vm_tag_t                vm_tag)
495 {
496 	vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
497 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
498 	vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
499 	vmk_flags->vm_tag = vm_tag;
500 }
501 
502 __attribute__((always_inline, overloadable))
503 void
504 vm_map_kernel_flags_set_vmflags(
505 	vm_map_kernel_flags_t  *vmk_flags,
506 	int                     vm_flags_and_tag)
507 {
508 	vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
509 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
510 	vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
511 	VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
512 }
513 
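/*
 * Minimal sketch of the round trip between the public VM_FLAGS_* encoding
 * and vm_map_kernel_flags_t.  The FIXED/ANYWHERE inversion noted above is
 * purely internal: callers get their original flags back from
 * vm_map_kernel_flags_vmflags().  The function below is hypothetical and
 * only exercises the two helpers defined above.
 */
#if 0 /* sketch only, not compiled */
static void
vmk_flags_roundtrip_sketch(void)
{
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	vm_map_kernel_flags_set_vmflags(&vmk_flags,
	    VM_FLAGS_ANYWHERE | VM_FLAGS_RANDOM_ADDR, VM_KERN_MEMORY_OSFMK);

	/* the caller-visible flags and tag survive the internal encoding */
	assert(vm_map_kernel_flags_vmflags(vmk_flags) ==
	    (VM_FLAGS_ANYWHERE | VM_FLAGS_RANDOM_ADDR));
	assert(vmk_flags.vm_tag == VM_KERN_MEMORY_OSFMK);
}
#endif
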
514 __attribute__((always_inline))
515 void
516 vm_map_kernel_flags_and_vmflags(
517 	vm_map_kernel_flags_t  *vmk_flags,
518 	int                     vm_flags_mask)
519 {
520 	/* this function doesn't handle the inverted FIXED/ANYWHERE */
521 	assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
522 	vmk_flags->__vm_flags &= vm_flags_mask;
523 }
524 
525 __attribute__((always_inline))
526 bool
527 vm_map_kernel_flags_check_vm_and_kflags(
528 	vm_map_kernel_flags_t   vmk_flags,
529 	int                     vm_flags_mask)
530 {
531 	return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
532 }
533 
534 bool
535 vm_map_kernel_flags_check_vmflags(
536 	vm_map_kernel_flags_t   vmk_flags,
537 	int                     vm_flags_mask)
538 {
539 	int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
540 
541 	/* Note: up to 16 still has good calling conventions */
542 	static_assert(sizeof(vm_map_kernel_flags_t) == 16);
543 
544 #if DEBUG || DEVELOPMENT
545 	/*
546 	 * All of this compiles to nothing if all checks pass.
547 	 */
548 #define check(field, value)  ({ \
549 	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
550 	fl.__vm_flags = (value); \
551 	fl.field = 0; \
552 	assert(fl.__vm_flags == 0); \
553 })
554 
555 	/* bits 0-7 */
556 	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
557 	check(vmf_purgeable, VM_FLAGS_PURGABLE);
558 	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
559 	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
560 	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
561 	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
562 	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
563 	check(vmf_permanent, VM_FLAGS_PERMANENT);
564 
565 	/* bits 8-15 */
566 	check(vmf_tpro, VM_FLAGS_TPRO);
567 	check(vmf_overwrite, VM_FLAGS_OVERWRITE);
568 
569 	/* bits 16-23 */
570 	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
571 	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
572 	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);
573 
574 	{
575 		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;
576 
577 		/* check user tags will never clip */
578 		fl.vm_tag = VM_MEMORY_COUNT - 1;
579 		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);
580 
581 		/* check kernel tags will never clip */
582 		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
583 		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
584 	}
585 
586 
587 #undef check
588 #endif /* DEBUG || DEVELOPMENT */
589 
590 	return (vmflags & ~vm_flags_mask) == 0;
591 }
592 
593 /*
594  * Macros to copy a vm_map_entry. We must be careful to correctly
595  * manage the wired page count. vm_map_entry_copy() creates a new
596  * map entry to the same memory - the wired count in the new entry
597  * must be set to zero. vm_map_entry_copy_full() creates a new
598  * entry that is identical to the old entry.  This preserves the
599  * wire count; it's used for map splitting and zone changing in
600  * vm_map_copyout.
601  */
602 
603 static inline void
604 vm_map_entry_copy_csm_assoc(
605 	vm_map_t map __unused,
606 	vm_map_entry_t new __unused,
607 	vm_map_entry_t old __unused)
608 {
609 #if CODE_SIGNING_MONITOR
610 	/* when code signing monitor is enabled, we want to reset on copy */
611 	new->csm_associated = FALSE;
612 #else
613 	/* when code signing monitor is not enabled, assert as a sanity check */
614 	assert(new->csm_associated == FALSE);
615 #endif
616 #if DEVELOPMENT || DEBUG
617 	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
618 		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
619 		    proc_selfpid(),
620 		    (get_bsdtask_info(current_task())
621 		    ? proc_name_address(get_bsdtask_info(current_task()))
622 		    : "?"),
623 		    __FUNCTION__, __LINE__,
624 		    map, new, new->vme_start, new->vme_end);
625 	}
626 #endif /* DEVELOPMENT || DEBUG */
627 #if XNU_TARGET_OS_OSX
628 	/*
629 	 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
630 	 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
631 	 * triggering CSM assertions when the child accesses its mapping.
632 	 */
633 #else /* XNU_TARGET_OS_OSX */
634 	new->vme_xnu_user_debug = FALSE;
635 #endif /* XNU_TARGET_OS_OSX */
636 }
637 
638 /*
639  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
640  * But for security reasons on some platforms, we don't want the
641  * new mapping to be "used for jit", so we reset the flag here.
642  */
643 static inline void
644 vm_map_entry_copy_code_signing(
645 	vm_map_t map,
646 	vm_map_entry_t new,
647 	vm_map_entry_t old __unused)
648 {
649 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
650 		assert(new->used_for_jit == old->used_for_jit);
651 	} else {
652 		if (old->used_for_jit) {
653 			DTRACE_VM3(cs_wx,
654 			    uint64_t, new->vme_start,
655 			    uint64_t, new->vme_end,
656 			    vm_prot_t, new->protection);
657 			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
658 			    proc_selfpid(),
659 			    (get_bsdtask_info(current_task())
660 			    ? proc_name_address(get_bsdtask_info(current_task()))
661 			    : "?"),
662 			    __FUNCTION__,
663 			    "removing execute access");
664 			new->protection &= ~VM_PROT_EXECUTE;
665 			new->max_protection &= ~VM_PROT_EXECUTE;
666 		}
667 		new->used_for_jit = FALSE;
668 	}
669 }
670 
671 static inline void
672 vm_map_entry_copy_full(
673 	vm_map_entry_t new,
674 	vm_map_entry_t old)
675 {
676 #if MAP_ENTRY_CREATION_DEBUG
677 	btref_put(new->vme_creation_bt);
678 	btref_retain(old->vme_creation_bt);
679 #endif
680 #if MAP_ENTRY_INSERTION_DEBUG
681 	btref_put(new->vme_insertion_bt);
682 	btref_retain(old->vme_insertion_bt);
683 #endif
684 #if VM_BTLOG_TAGS
685 	/* Discard the btref that might be in the new entry */
686 	if (new->vme_kernel_object) {
687 		btref_put(new->vme_tag_btref);
688 	}
689 	/* Retain the btref in the old entry to account for its copy */
690 	if (old->vme_kernel_object) {
691 		btref_retain(old->vme_tag_btref);
692 	}
693 #endif /* VM_BTLOG_TAGS */
694 	*new = *old;
695 }
696 
697 static inline void
698 vm_map_entry_copy(
699 	vm_map_t map,
700 	vm_map_entry_t new,
701 	vm_map_entry_t old)
702 {
703 	vm_map_entry_copy_full(new, old);
704 
705 	new->is_shared = FALSE;
706 	new->needs_wakeup = FALSE;
707 	new->in_transition = FALSE;
708 	new->wired_count = 0;
709 	new->user_wired_count = 0;
710 	new->vme_permanent = FALSE;
711 	vm_map_entry_copy_code_signing(map, new, old);
712 	vm_map_entry_copy_csm_assoc(map, new, old);
713 	if (new->iokit_acct) {
714 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
715 		new->iokit_acct = FALSE;
716 		new->use_pmap = TRUE;
717 	}
718 	new->vme_resilient_codesign = FALSE;
719 	new->vme_resilient_media = FALSE;
720 	new->vme_atomic = FALSE;
721 	new->vme_no_copy_on_read = FALSE;
722 }
723 
724 /*
725  * Normal lock_read_to_write() returns FALSE/0 on failure.
726  * These functions evaluate to zero on success and a non-zero value on failure.
727  */
728 __attribute__((always_inline))
729 int
730 vm_map_lock_read_to_write(vm_map_t map)
731 {
732 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
733 		DTRACE_VM(vm_map_lock_upgrade);
734 		return 0;
735 	}
736 	return 1;
737 }
738 
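/*
 * Sketch of the intended usage of the inverted return convention above: a
 * non-zero result means the upgrade failed and the shared hold was dropped,
 * so the caller must re-take the lock and revalidate any map state it was
 * relying on.  The wrapper below is hypothetical.
 */
#if 0 /* sketch only, not compiled */
static void
vm_map_lock_upgrade_sketch(vm_map_t map)
{
	vm_map_lock_read(map);
	/* ... inspect map state under the shared lock ... */
	if (vm_map_lock_read_to_write(map)) {
		/* upgrade failed: the read lock has been released */
		vm_map_lock(map);
		/* ... revalidate state before modifying the map ... */
	}
	/* exclusive lock held here on either path */
	vm_map_unlock(map);
}
#endif
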
739 __attribute__((always_inline))
740 boolean_t
741 vm_map_try_lock(vm_map_t map)
742 {
743 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
744 		DTRACE_VM(vm_map_lock_w);
745 		return TRUE;
746 	}
747 	return FALSE;
748 }
749 
750 __attribute__((always_inline))
751 boolean_t
752 vm_map_try_lock_read(vm_map_t map)
753 {
754 	if (lck_rw_try_lock_shared(&(map)->lock)) {
755 		DTRACE_VM(vm_map_lock_r);
756 		return TRUE;
757 	}
758 	return FALSE;
759 }
760 
761 /*!
762  * @function kdp_vm_map_is_acquired_exclusive
763  *
764  * @abstract
765  * Checks if vm map is acquired exclusive.
766  *
767  * @discussion
768  * NOT SAFE: To be used only by kernel debugger.
769  *
770  * @param map map to check
771  *
772  * @returns TRUE if the map is acquired exclusively.
773  */
774 boolean_t
775 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
776 {
777 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
778 }
779 
780 /*
781  * Routines to get the page size the caller should
782  * use while inspecting the target address space.
783  * Use the "_safely" variant if the caller is dealing with a user-provided
784  * array whose size depends on the page size, to avoid any overflow or
785  * underflow of a user-allocated buffer.
786  */
787 int
788 vm_self_region_page_shift_safely(
789 	vm_map_t target_map)
790 {
791 	int effective_page_shift = 0;
792 
793 	if (PAGE_SIZE == (4096)) {
794 		/* x86_64 and 4k watches: always use 4k */
795 		return PAGE_SHIFT;
796 	}
797 	/* did caller provide an explicit page size for this thread to use? */
798 	effective_page_shift = thread_self_region_page_shift();
799 	if (effective_page_shift) {
800 		/* use the explicitly-provided page size */
801 		return effective_page_shift;
802 	}
803 	/* no explicit page size: use the caller's page size... */
804 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
805 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
806 		/* page size match: safe to use */
807 		return effective_page_shift;
808 	}
809 	/* page size mismatch */
810 	return -1;
811 }
812 int
813 vm_self_region_page_shift(
814 	vm_map_t target_map)
815 {
816 	int effective_page_shift;
817 
818 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
819 	if (effective_page_shift == -1) {
820 		/* no safe value but OK to guess for caller */
821 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
822 		    VM_MAP_PAGE_SHIFT(target_map));
823 	}
824 	return effective_page_shift;
825 }
826 
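/*
 * Sketch of how a caller sizing a user-visible, page-indexed buffer might
 * pick a page shift: use the "_safely" variant and bail out on a page size
 * mismatch rather than risking an over/underrun of the user's buffer.  The
 * wrapper below is hypothetical; only the two helpers above are real.
 */
#if 0 /* sketch only, not compiled */
static kern_return_t
region_page_shift_usage_sketch(
	vm_map_t        target_map,
	vm_map_size_t   size,
	unsigned int   *page_count)
{
	int shift = vm_self_region_page_shift_safely(target_map);

	if (shift == -1) {
		/* page size mismatch: cannot size the caller's buffer safely */
		return KERN_INVALID_ARGUMENT;
	}
	*page_count = (unsigned int)(size >> shift);
	return KERN_SUCCESS;
}
#endif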
827 
828 /*
829  *	Decide if we want to allow processes to execute from their data or stack areas.
830  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
831  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
832  *	or allow_stack_exec to enable data execution for that type of data area for that particular
833  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
834  *	specific pmap files since the default behavior varies according to architecture.  The
835  *	main reason it varies is because of the need to provide binary compatibility with old
836  *	applications that were written before these restrictions came into being.  In the old
837  *	days, an app could execute anything it could read, but this has slowly been tightened
838  *	up over time.  The default behavior is:
839  *
840  *	32-bit PPC apps		may execute from both stack and data areas
841  *	32-bit Intel apps	may execute from data areas but not stack
842  *	64-bit PPC/Intel apps	may not execute from either data or stack
843  *
844  *	An application on any architecture may override these defaults by explicitly
845  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
846  *	system call.  This code here just determines what happens when an app tries to
847  *      execute from a page that lacks execute permission.
848  *
849  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
850  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
851  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
852  *	execution from data areas for a particular binary even if the arch normally permits it. As
853  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
854  *	to support some complicated use cases, notably browsers with out-of-process plugins that
855  *	are not all NX-safe.
856  */
857 
858 extern int allow_data_exec, allow_stack_exec;
859 
860 int
861 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
862 {
863 	int current_abi;
864 
865 	if (map->pmap == kernel_pmap) {
866 		return FALSE;
867 	}
868 
869 	/*
870 	 * Determine if the app is running in 32 or 64 bit mode.
871 	 */
872 
873 	if (vm_map_is_64bit(map)) {
874 		current_abi = VM_ABI_64;
875 	} else {
876 		current_abi = VM_ABI_32;
877 	}
878 
879 	/*
880 	 * Determine if we should allow the execution based on whether it's a
881 	 * stack or data area and the current architecture.
882 	 */
883 
884 	if (user_tag == VM_MEMORY_STACK) {
885 		return allow_stack_exec & current_abi;
886 	}
887 
888 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
889 }
890 
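/*
 * Sketch of how a consumer (e.g. a fault path) might consult override_nx():
 * when an execute fault hits a mapping without execute permission, the
 * allow_data_exec/allow_stack_exec policy above decides whether to let it
 * proceed anyway.  The helper below is hypothetical and not the actual
 * fault-handling code.
 */
#if 0 /* sketch only, not compiled */
static bool
exec_fault_allowed_sketch(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_prot_t       fault_type)
{
	if ((fault_type & VM_PROT_EXECUTE) &&
	    !(entry->protection & VM_PROT_EXECUTE)) {
		/* executing from a non-executable page: apply the NX policy */
		return override_nx(map, VME_ALIAS(entry)) != 0;
	}
	return true;
}
#endif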
891 
892 /*
893  *	Virtual memory maps provide for the mapping, protection,
894  *	and sharing of virtual memory objects.  In addition,
895  *	this module provides for an efficient virtual copy of
896  *	memory from one map to another.
897  *
898  *	Synchronization is required prior to most operations.
899  *
900  *	Maps consist of an ordered doubly-linked list of simple
901  *	entries; a single hint is used to speed up lookups.
902  *
903  *	Sharing maps have been deleted from this version of Mach.
904  *	All shared objects are now mapped directly into the respective
905  *	maps.  This requires a change in the copy on write strategy;
906  *	the asymmetric (delayed) strategy is used for shared temporary
907  *	objects instead of the symmetric (shadow) strategy.  All maps
908  *	are now "top level" maps (either task map, kernel map or submap
909  *	of the kernel map).
910  *
911  *	Since portions of maps are specified by start/end addresses,
912  *	which may not align with existing map entries, all
913  *	routines merely "clip" entries to these start/end values.
914  *	[That is, an entry is split into two, bordering at a
915  *	start or end value.]  Note that these clippings may not
916  *	always be necessary (as the two resulting entries are then
917  *	not changed); however, the clipping is done for convenience.
918  *	No attempt is currently made to "glue back together" two
919  *	abutting entries.
920  *
921  *	The symmetric (shadow) copy strategy implements virtual copy
922  *	by copying VM object references from one map to
923  *	another, and then marking both regions as copy-on-write.
924  *	It is important to note that only one writeable reference
925  *	to a VM object region exists in any map when this strategy
926  *	is used -- this means that shadow object creation can be
927  *	delayed until a write operation occurs.  The asymmetric (delayed)
928  *	strategy allows multiple maps to have writeable references to
929  *	the same region of a vm object, and hence cannot delay creating
930  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
931  *	Copying of permanent objects is completely different; see
932  *	vm_object_copy_strategically() in vm_object.c.
933  */
934 
935 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
936 
937 #define VM_MAP_ZONE_NAME        "maps"
938 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
939 
940 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
941 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
942 
943 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
944 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
945 
946 /*
947  * Asserts that a vm_map_copy object is coming from the
948  * vm_map_copy_zone to ensure that it isn't a fake constructed
949  * anywhere else.
950  */
951 void
952 vm_map_copy_require(struct vm_map_copy *copy)
953 {
954 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
955 }
956 
957 /*
958  *	vm_map_require:
959  *
960  *	Ensures that the argument is memory allocated from the genuine
961  *	vm map zone. (See zone_id_require_allow_foreign).
962  */
963 void
964 vm_map_require(vm_map_t map)
965 {
966 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
967 }
968 
969 #define VM_MAP_EARLY_COUNT_MAX         16
970 static __startup_data vm_offset_t      map_data;
971 static __startup_data vm_size_t        map_data_size;
972 static __startup_data vm_offset_t      kentry_data;
973 static __startup_data vm_size_t        kentry_data_size;
974 static __startup_data vm_offset_t      map_holes_data;
975 static __startup_data vm_size_t        map_holes_data_size;
976 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
977 static __startup_data uint32_t         early_map_count;
978 
979 #if XNU_TARGET_OS_OSX
980 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
981 #else /* XNU_TARGET_OS_OSX */
982 #define         NO_COALESCE_LIMIT  0
983 #endif /* XNU_TARGET_OS_OSX */
984 
985 /* Skip acquiring locks if we're in the midst of a kernel core dump */
986 unsigned int not_in_kdp = 1;
987 
988 unsigned int vm_map_set_cache_attr_count = 0;
989 
990 kern_return_t
991 vm_map_set_cache_attr(
992 	vm_map_t        map,
993 	vm_map_offset_t va)
994 {
995 	vm_map_entry_t  map_entry;
996 	vm_object_t     object;
997 	kern_return_t   kr = KERN_SUCCESS;
998 
999 	vm_map_lock_read(map);
1000 
1001 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
1002 	    map_entry->is_sub_map) {
1003 		/*
1004 		 * that memory is not properly mapped
1005 		 */
1006 		kr = KERN_INVALID_ARGUMENT;
1007 		goto done;
1008 	}
1009 	object = VME_OBJECT(map_entry);
1010 
1011 	if (object == VM_OBJECT_NULL) {
1012 		/*
1013 		 * there should be a VM object here at this point
1014 		 */
1015 		kr = KERN_INVALID_ARGUMENT;
1016 		goto done;
1017 	}
1018 	vm_object_lock(object);
1019 	object->set_cache_attr = TRUE;
1020 	vm_object_unlock(object);
1021 
1022 	vm_map_set_cache_attr_count++;
1023 done:
1024 	vm_map_unlock_read(map);
1025 
1026 	return kr;
1027 }
1028 
1029 
1030 #if CONFIG_CODE_DECRYPTION
1031 /*
1032  * vm_map_apple_protected:
1033  * This remaps the requested part of the object with an object backed by
1034  * the decrypting pager.
1035  * crypt_info contains entry points and session data for the crypt module.
1036  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1037  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1038  */
1039 kern_return_t
1040 vm_map_apple_protected(
1041 	vm_map_t                map,
1042 	vm_map_offset_t         start,
1043 	vm_map_offset_t         end,
1044 	vm_object_offset_t      crypto_backing_offset,
1045 	struct pager_crypt_info *crypt_info,
1046 	uint32_t                cryptid)
1047 {
1048 	boolean_t       map_locked;
1049 	kern_return_t   kr;
1050 	vm_map_entry_t  map_entry;
1051 	struct vm_map_entry tmp_entry;
1052 	memory_object_t unprotected_mem_obj;
1053 	vm_object_t     protected_object;
1054 	vm_map_offset_t map_addr;
1055 	vm_map_offset_t start_aligned, end_aligned;
1056 	vm_object_offset_t      crypto_start, crypto_end;
1057 	boolean_t       cache_pager;
1058 
1059 	map_locked = FALSE;
1060 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
1061 
1062 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
1063 		return KERN_INVALID_ADDRESS;
1064 	}
1065 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
1066 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
1067 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
1068 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
1069 
1070 #if __arm64__
1071 	/*
1072 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
1073 	 * so we might have to loop and establish up to 3 mappings:
1074 	 *
1075 	 * + the first 16K-page, which might overlap with the previous
1076 	 *   4K-aligned mapping,
1077 	 * + the center,
1078 	 * + the last 16K-page, which might overlap with the next
1079 	 *   4K-aligned mapping.
1080 	 * Each of these mapping might be backed by a vnode pager (if
1081 	 * properly page-aligned) or a "fourk_pager", itself backed by a
1082 	 * vnode pager (if 4K-aligned but not page-aligned).
1083 	 */
1084 #endif /* __arm64__ */
1085 
1086 	map_addr = start_aligned;
1087 	for (map_addr = start_aligned;
1088 	    map_addr < end;
1089 	    map_addr = tmp_entry.vme_end) {
1090 		vm_map_lock(map);
1091 		map_locked = TRUE;
1092 
1093 		/* lookup the protected VM object */
1094 		if (!vm_map_lookup_entry(map,
1095 		    map_addr,
1096 		    &map_entry) ||
1097 		    map_entry->is_sub_map ||
1098 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
1099 			/* that memory is not properly mapped */
1100 			kr = KERN_INVALID_ARGUMENT;
1101 			goto done;
1102 		}
1103 
1104 		/* ensure the mapped memory is mapped as executable,
1105 		 *  except for the model decryption flow */
1106 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
1107 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
1108 			kr = KERN_INVALID_ARGUMENT;
1109 			goto done;
1110 		}
1111 
1112 		/* get the protected object to be decrypted */
1113 		protected_object = VME_OBJECT(map_entry);
1114 		if (protected_object == VM_OBJECT_NULL) {
1115 			/* there should be a VM object here at this point */
1116 			kr = KERN_INVALID_ARGUMENT;
1117 			goto done;
1118 		}
1119 		/* ensure protected object stays alive while map is unlocked */
1120 		vm_object_reference(protected_object);
1121 
1122 		/* limit the map entry to the area we want to cover */
1123 		vm_map_clip_start(map, map_entry, start_aligned);
1124 		vm_map_clip_end(map, map_entry, end_aligned);
1125 
1126 		tmp_entry = *map_entry;
1127 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
1128 		vm_map_unlock(map);
1129 		map_locked = FALSE;
1130 
1131 		/*
1132 		 * This map entry might be only partially encrypted
1133 		 * (if not fully "page-aligned").
1134 		 */
1135 		crypto_start = 0;
1136 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
1137 		if (tmp_entry.vme_start < start) {
1138 			if (tmp_entry.vme_start != start_aligned) {
1139 				kr = KERN_INVALID_ADDRESS;
1140 				vm_object_deallocate(protected_object);
1141 				goto done;
1142 			}
1143 			crypto_start += (start - tmp_entry.vme_start);
1144 		}
1145 		if (tmp_entry.vme_end > end) {
1146 			if (tmp_entry.vme_end != end_aligned) {
1147 				kr = KERN_INVALID_ADDRESS;
1148 				vm_object_deallocate(protected_object);
1149 				goto done;
1150 			}
1151 			crypto_end -= (tmp_entry.vme_end - end);
1152 		}
1153 
1154 		/*
1155 		 * This "extra backing offset" is needed to get the decryption
1156 		 * routine to use the right key.  It adjusts for the possibly
1157 		 * relative offset of an interposed "4K" pager...
1158 		 */
1159 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
1160 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
1161 		}
1162 
1163 		cache_pager = TRUE;
1164 #if XNU_TARGET_OS_OSX
1165 		if (vm_map_is_alien(map)) {
1166 			cache_pager = FALSE;
1167 		}
1168 #endif /* XNU_TARGET_OS_OSX */
1169 
1170 		/*
1171 		 * Lookup (and create if necessary) the protected memory object
1172 		 * matching that VM object.
1173 		 * If successful, this also grabs a reference on the memory object,
1174 		 * to guarantee that it doesn't go away before we get a chance to map
1175 		 * it.
1176 		 */
1177 		unprotected_mem_obj = apple_protect_pager_setup(
1178 			protected_object,
1179 			VME_OFFSET(&tmp_entry),
1180 			crypto_backing_offset,
1181 			crypt_info,
1182 			crypto_start,
1183 			crypto_end,
1184 			cache_pager);
1185 
1186 		/* release extra ref on protected object */
1187 		vm_object_deallocate(protected_object);
1188 
1189 		if (unprotected_mem_obj == NULL) {
1190 			kr = KERN_FAILURE;
1191 			goto done;
1192 		}
1193 
1194 		/* can overwrite an immutable mapping */
1195 		vm_map_kernel_flags_t vmk_flags = {
1196 			.vmf_fixed = true,
1197 			.vmf_overwrite = true,
1198 			.vmkf_overwrite_immutable = true,
1199 		};
1200 		/* make the new mapping as "permanent" as the one it replaces */
1201 		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
1202 
1203 		/* map this memory object in place of the current one */
1204 		map_addr = tmp_entry.vme_start;
1205 		kr = mach_vm_map_kernel(map,
1206 		    vm_sanitize_wrap_addr_ref(&map_addr),
1207 		    (tmp_entry.vme_end -
1208 		    tmp_entry.vme_start),
1209 		    (mach_vm_offset_t) 0,
1210 		    vmk_flags,
1211 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1212 		    0,
1213 		    TRUE,
1214 		    tmp_entry.protection,
1215 		    tmp_entry.max_protection,
1216 		    tmp_entry.inheritance);
1217 		assertf(kr == KERN_SUCCESS,
1218 		    "kr = 0x%x\n", kr);
1219 		assertf(map_addr == tmp_entry.vme_start,
1220 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1221 		    (uint64_t)map_addr,
1222 		    (uint64_t) tmp_entry.vme_start,
1223 		    &tmp_entry);
1224 
1225 #if VM_MAP_DEBUG_APPLE_PROTECT
1226 		if (vm_map_debug_apple_protect) {
1227 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1228 			    " backing:[object:%p,offset:0x%llx,"
1229 			    "crypto_backing_offset:0x%llx,"
1230 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1231 			    map,
1232 			    (uint64_t) map_addr,
1233 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1234 			    tmp_entry.vme_start)),
1235 			    unprotected_mem_obj,
1236 			    protected_object,
1237 			    VME_OFFSET(&tmp_entry),
1238 			    crypto_backing_offset,
1239 			    crypto_start,
1240 			    crypto_end);
1241 		}
1242 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1243 
1244 		/*
1245 		 * Release the reference obtained by
1246 		 * apple_protect_pager_setup().
1247 		 * The mapping (if it succeeded) is now holding a reference on
1248 		 * the memory object.
1249 		 */
1250 		memory_object_deallocate(unprotected_mem_obj);
1251 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1252 
1253 		/* continue with next map entry */
1254 		crypto_backing_offset += (tmp_entry.vme_end -
1255 		    tmp_entry.vme_start);
1256 		crypto_backing_offset -= crypto_start;
1257 	}
1258 	kr = KERN_SUCCESS;
1259 
1260 done:
1261 	if (map_locked) {
1262 		vm_map_unlock(map);
1263 	}
1264 	return kr;
1265 }
1266 #endif  /* CONFIG_CODE_DECRYPTION */
1267 
1268 
1269 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1270 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1271 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1272 
1273 #if XNU_TARGET_OS_OSX
1274 #define MALLOC_NO_COW_DEFAULT 1
1275 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1276 #else /* XNU_TARGET_OS_OSX */
1277 #define MALLOC_NO_COW_DEFAULT 1
1278 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1279 #endif /* XNU_TARGET_OS_OSX */
1280 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1281 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1282 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1283 #if DEBUG
1284 int vm_check_map_sanity = 0;
1285 #endif
1286 
1287 /*
1288  *	vm_map_init:
1289  *
1290  *	Initialize the vm_map module.  Must be called before
1291  *	any other vm_map routines.
1292  *
1293  *	Map and entry structures are allocated from zones -- we must
1294  *	initialize those zones.
1295  *
1296  *	There are three zones of interest:
1297  *
1298  *	vm_map_zone:		used to allocate maps.
1299  *	vm_map_entry_zone:	used to allocate map entries.
1300  *
1301  *	LP32:
1302  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1303  *
1304  *	The kernel allocates map entries from a special zone that is initially
1305  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1306  *	the kernel to allocate more memory to an entry zone when it became
1307  *	empty since the very act of allocating memory implies the creation
1308  *	of a new entry.
1309  */
1310 __startup_func
1311 void
1312 vm_map_init(void)
1313 {
1314 
1315 #if MACH_ASSERT
1316 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1317 	    sizeof(debug4k_filter));
1318 #endif /* MACH_ASSERT */
1319 
1320 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1321 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1322 
1323 	/*
1324 	 * Don't quarantine because we always need elements available
1325 	 * Disallow GC on this zone... to aid the GC.
1326 	 */
1327 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1328 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1329 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1330 		z->z_elems_rsv = (uint16_t)(32 *
1331 		(ml_early_cpu_max_number() + 1));
1332 	});
1333 
1334 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1335 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1336 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1337 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1338 	});
1339 
1340 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1341 	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1342 
1343 	/*
1344 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1345 	 */
1346 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1347 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1348 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1349 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1350 	    zone_count_free(vm_map_zone),
1351 	    zone_count_free(vm_map_entry_zone),
1352 	    zone_count_free(vm_map_holes_zone));
1353 
1354 	/*
1355 	 * Since these are covered by zones, remove them from stolen page accounting.
1356 	 */
1357 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1358 
1359 #if VM_MAP_DEBUG_APPLE_PROTECT
1360 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1361 	    &vm_map_debug_apple_protect,
1362 	    sizeof(vm_map_debug_apple_protect));
1363 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1364 #if VM_MAP_DEBUG_APPLE_FOURK
1365 	PE_parse_boot_argn("vm_map_debug_fourk",
1366 	    &vm_map_debug_fourk,
1367 	    sizeof(vm_map_debug_fourk));
1368 #endif /* VM_MAP_DEBUG_FOURK */
1369 
1370 	if (malloc_no_cow) {
1371 		vm_memory_malloc_no_cow_mask = 0ULL;
1372 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1373 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1374 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1375 #if XNU_TARGET_OS_OSX
1376 		/*
1377 		 * On macOS, keep copy-on-write for MALLOC_LARGE because
1378 		 * realloc() may use vm_copy() to transfer the old contents
1379 		 * to the new location.
1380 		 */
1381 #else /* XNU_TARGET_OS_OSX */
1382 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1383 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1384 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1385 #endif /* XNU_TARGET_OS_OSX */
1386 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1387 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1388 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1389 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1390 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1391 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1392 		    &vm_memory_malloc_no_cow_mask,
1393 		    sizeof(vm_memory_malloc_no_cow_mask));
1394 	}
1395 
1396 #if CONFIG_MAP_RANGES
1397 	vm_map_range_map_init();
1398 #endif /* CONFIG_MAP_RANGES */
1399 
1400 #if DEBUG
1401 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1402 	if (vm_check_map_sanity) {
1403 		kprintf("VM sanity checking enabled\n");
1404 	} else {
1405 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1406 	}
1407 #endif /* DEBUG */
1408 
1409 #if DEVELOPMENT || DEBUG
1410 	PE_parse_boot_argn("panic_on_unsigned_execute",
1411 	    &panic_on_unsigned_execute,
1412 	    sizeof(panic_on_unsigned_execute));
1413 	PE_parse_boot_argn("panic_on_mlock_failure",
1414 	    &panic_on_mlock_failure,
1415 	    sizeof(panic_on_mlock_failure));
1416 #endif /* DEVELOPMENT || DEBUG */
1417 }
1418 
1419 __startup_func
1420 static void
1421 vm_map_steal_memory(void)
1422 {
1423 
1424 	/*
1425 	 * We need to reserve enough memory to support bootstrapping VM maps
1426 	 * and the zone subsystem.
1427 	 *
1428 	 * The VM Maps that need to function before zones can support them
1429 	 * are the ones registered with vm_map_will_allocate_early_map(),
1430 	 * which are:
1431 	 * - the kernel map
1432 	 * - the various submaps used by zones (pgz, meta, ...)
1433 	 *
1434 	 * We also need enough entries and holes to support them
1435 	 * until zone_metadata_init() is called, which is when
1436 	 * the zone allocator becomes capable of expanding dynamically.
1437 	 *
1438 	 * We need:
1439 	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1440 	 * - To allow for 3-4 entries per map, but the kernel map
1441 	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1442 	 *   to describe the submaps, so double it (and make it 8x too)
1443 	 * - To allow for holes between entries,
1444 	 *   hence needs the same budget as entries
1445 	 */
1446 	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1447 	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1448 	    VM_MAP_EARLY_COUNT_MAX);
1449 
1450 	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1451 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1452 	    8 * VM_MAP_EARLY_COUNT_MAX);
1453 
1454 	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1455 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1456 	    8 * VM_MAP_EARLY_COUNT_MAX);
1457 
1458 	/*
1459 	 * Steal a contiguous range of memory so that a simple range check
1460 	 * can validate early addresses being freed/crammed to these
1461 	 * zones
1462 	 */
1463 	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
1464 	    map_holes_data_size);
1465 	kentry_data    = map_data + map_data_size;
1466 	map_holes_data = kentry_data + kentry_data_size;
1467 }
1468 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1469 
1470 __startup_func
1471 static void
1472 vm_kernel_boostraped(void)
1473 {
1474 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1475 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1476 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1477 
1478 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1479 	    zone_count_free(vm_map_zone),
1480 	    zone_count_free(vm_map_entry_zone),
1481 	    zone_count_free(vm_map_holes_zone));
1482 }
1483 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1484 
1485 void
1486 vm_map_disable_hole_optimization(vm_map_t map)
1487 {
1488 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1489 
1490 	if (map->holelistenabled) {
1491 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1492 
1493 		while (hole_entry != NULL) {
1494 			next_hole_entry = hole_entry->vme_next;
1495 
1496 			hole_entry->vme_next = NULL;
1497 			hole_entry->vme_prev = NULL;
1498 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1499 
1500 			if (next_hole_entry == head_entry) {
1501 				hole_entry = NULL;
1502 			} else {
1503 				hole_entry = next_hole_entry;
1504 			}
1505 		}
1506 
1507 		map->holes_list = NULL;
1508 		map->holelistenabled = FALSE;
1509 
1510 		map->first_free = vm_map_to_entry(map);
1511 		SAVE_HINT_HOLE_WRITE(map, NULL);
1512 	}
1513 }
1514 
1515 boolean_t
1516 vm_kernel_map_is_kernel(vm_map_t map)
1517 {
1518 	return map->pmap == kernel_pmap;
1519 }
1520 
1521 /*
1522  *	vm_map_create:
1523  *
1524  *	Creates and returns a new empty VM map with
1525  *	the given physical map structure, and having
1526  *	the given lower and upper address bounds.
1527  */
1528 
1529 extern vm_map_t vm_map_create_external(
1530 	pmap_t                  pmap,
1531 	vm_map_offset_t         min_off,
1532 	vm_map_offset_t         max_off,
1533 	boolean_t               pageable);
1534 
1535 vm_map_t
1536 vm_map_create_external(
1537 	pmap_t                  pmap,
1538 	vm_map_offset_t         min,
1539 	vm_map_offset_t         max,
1540 	boolean_t               pageable)
1541 {
1542 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1543 
1544 	if (pageable) {
1545 		options |= VM_MAP_CREATE_PAGEABLE;
1546 	}
1547 	return vm_map_create_options(pmap, min, max, options);
1548 }
1549 
1550 __startup_func
1551 void
1552 vm_map_will_allocate_early_map(vm_map_t *owner)
1553 {
1554 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1555 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1556 	}
1557 
1558 	early_map_owners[early_map_count++] = owner;
1559 }
1560 
1561 __startup_func
1562 void
1563 vm_map_relocate_early_maps(vm_offset_t delta)
1564 {
1565 	for (uint32_t i = 0; i < early_map_count; i++) {
1566 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1567 
1568 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1569 	}
1570 
1571 	early_map_count = ~0u;
1572 }
1573 
1574 /*
1575  *	Routine:	vm_map_relocate_early_elem
1576  *
1577  *	Purpose:
1578  *		Early zone elements are allocated in a temporary part
1579  *		of the address space.
1580  *
1581  *		Once the zones live in their final place, the early
1582  *		VM maps, map entries and map holes need to be relocated.
1583  *
1584  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1585  *		pointers to vm_map_links. Other pointers to other types
1586  *		are fine.
1587  *
1588  *		Fortunately, pointers to those types are self-contained
1589  *		in those zones, _except_ for pointers to VM maps,
1590  *		which are tracked during early boot and fixed with
1591  *		vm_map_relocate_early_maps().
1592  */
1593 __startup_func
1594 void
1595 vm_map_relocate_early_elem(
1596 	uint32_t                zone_id,
1597 	vm_offset_t             new_addr,
1598 	vm_offset_t             delta)
1599 {
1600 #define relocate(type_t, field)  ({ \
1601 	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
1602 	if (*__field) {                                                        \
1603 	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
1604 	}                                                                      \
1605 })
1606 
1607 	switch (zone_id) {
1608 	case ZONE_ID_VM_MAP:
1609 	case ZONE_ID_VM_MAP_ENTRY:
1610 	case ZONE_ID_VM_MAP_HOLES:
1611 		break;
1612 
1613 	default:
1614 		panic("Unexpected zone ID %d", zone_id);
1615 	}
1616 
1617 	if (zone_id == ZONE_ID_VM_MAP) {
1618 		relocate(vm_map_t, hdr.links.prev);
1619 		relocate(vm_map_t, hdr.links.next);
1620 		((vm_map_t)new_addr)->pmap = kernel_pmap;
1621 #ifdef VM_MAP_STORE_USE_RB
1622 		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1623 #endif /* VM_MAP_STORE_USE_RB */
1624 		relocate(vm_map_t, hint);
1625 		relocate(vm_map_t, hole_hint);
1626 		relocate(vm_map_t, first_free);
1627 		return;
1628 	}
1629 
1630 	relocate(struct vm_map_links *, prev);
1631 	relocate(struct vm_map_links *, next);
1632 
1633 	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1634 #ifdef VM_MAP_STORE_USE_RB
1635 		relocate(vm_map_entry_t, store.entry.rbe_left);
1636 		relocate(vm_map_entry_t, store.entry.rbe_right);
1637 		relocate(vm_map_entry_t, store.entry.rbe_parent);
1638 #endif /* VM_MAP_STORE_USE_RB */
1639 		if (((vm_map_entry_t)new_addr)->is_sub_map) {
1640 			/* no object to relocate because we haven't made any */
1641 			((vm_map_entry_t)new_addr)->vme_submap +=
1642 			    delta >> VME_SUBMAP_SHIFT;
1643 		}
1644 #if MAP_ENTRY_CREATION_DEBUG
1645 		relocate(vm_map_entry_t, vme_creation_maphdr);
1646 #endif /* MAP_ENTRY_CREATION_DEBUG */
1647 	}
1648 
1649 #undef relocate
1650 }
1651 
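/*
 * Illustrative expansion of one relocate() use above (assuming a
 * hypothetical early map at "new_addr" being slid by "delta"):
 *
 *	vm_map_t m = (vm_map_t)new_addr;
 *	if (m->hint) {
 *	        m->hint = (vm_map_entry_t)((vm_offset_t)m->hint + delta);
 *	}
 *
 * i.e. every self-referential pointer stored inside the relocated element
 * is slid by the same delta that moved the element itself; NULL fields
 * are left untouched.
 */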
1652 /*
1653  * Generate a serial ID to identify a newly allocated vm_map
1654  */
1655 static uintptr_t vm_map_serial_current = 0;
1656 vm_map_serial_t vm_map_serial_generate(void);
1657 void vm_map_assign_serial(vm_map_t, vm_map_serial_t);
1658 
1659 vm_map_serial_t
1660 vm_map_serial_generate(void)
1661 {
1662 	vm_map_serial_t serial = (void *)os_atomic_inc(&vm_map_serial_current, relaxed);
1663 	return serial;
1664 }
1665 
1666 void
1667 vm_map_assign_serial(vm_map_t map, vm_map_serial_t serial)
1668 {
1669 	map->serial_id = serial;
1670 #if CONFIG_SPTM
1671 	/* Copy through our ID to the pmap (only available on SPTM systems) */
1672 	if (map->pmap) {
1673 		map->pmap->associated_vm_map_serial_id = map->serial_id;
1674 	}
1675 #endif /* CONFIG_SPTM */
1676 }
1677 
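/*
 * Usage sketch (illustrative): a brand new map draws a fresh serial, while
 * a fork-style caller is assumed to propagate the parent's existing serial
 * instead of consuming a new one (see VM_MAP_CREATE_VIA_FORK below):
 *
 *	vm_map_assign_serial(new_map, vm_map_serial_generate());
 *	vm_map_assign_serial(child_map, parent_map->serial_id);
 */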
1678 vm_map_t
1679 vm_map_create_options(
1680 	pmap_t                  pmap,
1681 	vm_map_offset_t         min,
1682 	vm_map_offset_t         max,
1683 	vm_map_create_options_t options)
1684 {
1685 	vm_map_t result;
1686 
1687 #if DEBUG || DEVELOPMENT
1688 	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1689 		if (early_map_count != ~0u && early_map_count !=
1690 		    zone_count_allocated(vm_map_zone) + 1) {
1691 			panic("allocating %dth early map, owner not known",
1692 			    zone_count_allocated(vm_map_zone) + 1);
1693 		}
1694 		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1695 			panic("allocating %dth early map for non kernel pmap",
1696 			    early_map_count);
1697 		}
1698 	}
1699 #endif /* DEBUG || DEVELOPMENT */
1700 
1701 	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1702 
1703 	vm_map_store_init(&result->hdr);
1704 	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1705 	vm_map_set_page_shift(result, PAGE_SHIFT);
1706 
1707 	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
1708 	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
1709 	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
1710 	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1711 
1712 	result->pmap = pmap;
1713 
1714 	/*
1715 	 * Immediately give ourselves an ID
1716 	 * Unless this map is being created as part of a fork, in which case
1717 	 * the caller will reassign the ID of the parent (so don't waste an
1718 	 *  increment here).
1719 	 */
1720 	if ((options & VM_MAP_CREATE_VIA_FORK) == 0) {
1721 		vm_map_assign_serial(result, vm_map_serial_generate());
1722 	}
1723 
1724 	result->min_offset = min;
1725 	result->max_offset = max;
1726 	result->first_free = vm_map_to_entry(result);
1727 	result->hint = vm_map_to_entry(result);
1728 
1729 	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1730 		assert(pmap == kernel_pmap);
1731 		result->never_faults = true;
1732 	}
1733 
1734 	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1735 	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1736 		result->has_corpse_footprint = true;
1737 	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1738 		struct vm_map_links *hole_entry;
1739 
1740 		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1741 		hole_entry->start = min;
1742 		/*
1743 		 * Holes can be used to track ranges all the way up to
1744 		 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
1745 		 */
1746 		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1747 		result->holes_list = result->hole_hint = hole_entry;
1748 		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1749 		result->holelistenabled = true;
1750 	}
1751 
1752 	vm_map_lock_init(result);
1753 
1754 	return result;
1755 }
1756 
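/*
 * Illustrative sketch of two creation flavors (arguments hypothetical):
 * a corpse map trades the hole-list optimization for footprint tracking,
 * while a regular task map keeps the hole list enabled.
 *
 *	vm_map_t corpse_map = vm_map_create_options(NULL, min, max,
 *	    VM_MAP_CREATE_CORPSE_FOOTPRINT);
 *	vm_map_t task_map = vm_map_create_options(task_pmap, min, max,
 *	    VM_MAP_CREATE_DEFAULT);
 */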
1757 /*
1758  * Adjusts a submap that was made by kmem_suballoc()
1759  * before it knew where it would be mapped,
1760  * so that it has the right min/max offsets.
1761  *
1762  * We do not need to hold any locks:
1763  * only the caller knows about this map,
1764  * and it is not published on any entry yet.
1765  */
1766 static void
1767 vm_map_adjust_offsets(
1768 	vm_map_t                map,
1769 	vm_map_offset_t         min_off,
1770 	vm_map_offset_t         max_off)
1771 {
1772 	assert(map->min_offset == 0);
1773 	assert(map->max_offset == max_off - min_off);
1774 	assert(map->hdr.nentries == 0);
1775 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1776 
1777 	map->min_offset = min_off;
1778 	map->max_offset = max_off;
1779 
1780 	if (map->holelistenabled) {
1781 		struct vm_map_links *hole = map->holes_list;
1782 
1783 		hole->start = min_off;
1784 #if defined(__arm64__)
1785 		hole->end = max_off;
1786 #else
1787 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1788 #endif
1789 	}
1790 }
1791 
1792 
1793 vm_map_size_t
1794 vm_map_adjusted_size(vm_map_t map)
1795 {
1796 	const struct vm_reserved_region *regions = NULL;
1797 	size_t num_regions = 0;
1798 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1799 
1800 	if (map == NULL || (map->size == 0)) {
1801 		return 0;
1802 	}
1803 
1804 	map_size = map->size;
1805 
1806 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1807 		/*
1808 		 * No special reserved regions or not an exotic map or the task
1809 		 * is terminating and these special regions might have already
1810 		 * been deallocated.
1811 		 */
1812 		return map_size;
1813 	}
1814 
1815 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1816 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1817 
1818 	while (num_regions) {
1819 		reserved_size += regions[--num_regions].vmrr_size;
1820 	}
1821 
1822 	/*
1823 	 * There are a few places where the map is being switched out due to
1824 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1825 	 * In those cases, we could have the map's regions being deallocated on
1826 	 * a core while some accounting process is trying to get the map's size.
1827 	 * So this assert can't be enabled till all those places are uniform in
1828 	 * their use of the 'map->terminated' bit.
1829 	 *
1830 	 * assert(map_size >= reserved_size);
1831 	 */
1832 
1833 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1834 }
1835 
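/*
 * Worked example (hypothetical numbers): for an exotic map with
 * map->size of 512MB and a single 64MB reserved region, the adjusted size
 * is 448MB.  If the bookkeeping ever made reserved_size exceed map->size,
 * the raw map->size is returned instead of underflowing.
 */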
1836 /*
1837  *	vm_map_entry_create:	[ internal use only ]
1838  *
1839  *	Allocates a VM map entry for insertion in the
1840  *	given map (or map copy).  No fields are filled.
1841  *
1842  *	The VM entry will be zero initialized, except for:
1843  *	- behavior set to VM_BEHAVIOR_DEFAULT
1844  *	- inheritance set to VM_INHERIT_DEFAULT
1845  */
1846 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1847 
1848 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1849 
1850 static vm_map_entry_t
1851 _vm_map_entry_create(
1852 	struct vm_map_header    *map_header __unused)
1853 {
1854 	vm_map_entry_t entry = NULL;
1855 
1856 	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1857 
1858 	/*
1859 	 * Help the compiler with what we know to be true,
1860 	 * so that the subsequent bitfield inits have good codegen.
1861 	 *
1862 	 * See rdar://87041299
1863 	 */
1864 	__builtin_assume(entry->vme_object_value == 0);
1865 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1866 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1867 
1868 	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1869 	    "VME_ALIAS_MASK covers tags");
1870 
1871 	static_assert(VM_BEHAVIOR_DEFAULT == 0,
1872 	    "can skip zeroing of the behavior field");
1873 	entry->inheritance = VM_INHERIT_DEFAULT;
1874 
1875 #if MAP_ENTRY_CREATION_DEBUG
1876 	entry->vme_creation_maphdr = map_header;
1877 	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1878 	    BTREF_GET_NOWAIT);
1879 #endif
1880 	return entry;
1881 }
1882 
1883 /*
1884  *	vm_map_entry_dispose:	[ internal use only ]
1885  *
1886  *	Inverse of vm_map_entry_create.
1887  *
1888  *	The write map lock is held, so there is no need to
1889  *	do anything special to ensure the correctness
1890  *	of the stores.
1891  */
1892 static void
1893 vm_map_entry_dispose(
1894 	vm_map_entry_t          entry)
1895 {
1896 #if VM_BTLOG_TAGS
1897 	if (entry->vme_kernel_object) {
1898 		btref_put(entry->vme_tag_btref);
1899 	}
1900 #endif /* VM_BTLOG_TAGS */
1901 #if MAP_ENTRY_CREATION_DEBUG
1902 	btref_put(entry->vme_creation_bt);
1903 #endif
1904 #if MAP_ENTRY_INSERTION_DEBUG
1905 	btref_put(entry->vme_insertion_bt);
1906 #endif
1907 	zfree(vm_map_entry_zone, entry);
1908 }
1909 
1910 #define vm_map_copy_entry_dispose(copy_entry) \
1911 	vm_map_entry_dispose(copy_entry)
1912 
1913 static vm_map_entry_t
1914 vm_map_zap_first_entry(
1915 	vm_map_zap_t            list)
1916 {
1917 	return list->vmz_head;
1918 }
1919 
1920 static vm_map_entry_t
1921 vm_map_zap_last_entry(
1922 	vm_map_zap_t            list)
1923 {
1924 	assert(vm_map_zap_first_entry(list));
1925 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1926 }
1927 
1928 static void
1929 vm_map_zap_append(
1930 	vm_map_zap_t            list,
1931 	vm_map_entry_t          entry)
1932 {
1933 	entry->vme_next = VM_MAP_ENTRY_NULL;
1934 	*list->vmz_tail = entry;
1935 	list->vmz_tail = &entry->vme_next;
1936 }
1937 
1938 static vm_map_entry_t
1939 vm_map_zap_pop(
1940 	vm_map_zap_t            list)
1941 {
1942 	vm_map_entry_t head = list->vmz_head;
1943 
1944 	if (head != VM_MAP_ENTRY_NULL &&
1945 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1946 		list->vmz_tail = &list->vmz_head;
1947 	}
1948 
1949 	return head;
1950 }
1951 
1952 static void
1953 vm_map_zap_dispose(
1954 	vm_map_zap_t            list)
1955 {
1956 	vm_map_entry_t          entry;
1957 
1958 	while ((entry = vm_map_zap_pop(list))) {
1959 		if (entry->is_sub_map) {
1960 			vm_map_deallocate(VME_SUBMAP(entry));
1961 		} else {
1962 			vm_object_deallocate(VME_OBJECT(entry));
1963 		}
1964 
1965 		vm_map_entry_dispose(entry);
1966 	}
1967 }
1968 
1969 #if MACH_ASSERT
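/*
 * Typical zap-list pattern (illustrative sketch): entries unlinked while
 * the map lock is held are queued on a zap list, and their backing
 * objects or submaps are only released once the lock has been dropped:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 *
 * vm_map_destroy() below follows this sequence.
 */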
1970 static boolean_t first_free_check = FALSE;
1971 boolean_t
1972 first_free_is_valid(
1973 	vm_map_t        map)
1974 {
1975 	if (!first_free_check) {
1976 		return TRUE;
1977 	}
1978 
1979 	return first_free_is_valid_store( map );
1980 }
1981 #endif /* MACH_ASSERT */
1982 
1983 
1984 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1985 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1986 
1987 #define vm_map_copy_entry_unlink(copy, entry)                           \
1988 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1989 
1990 /*
1991  *	vm_map_destroy:
1992  *
1993  *	Actually destroy a map.
1994  */
1995 void
1996 vm_map_destroy(
1997 	vm_map_t        map)
1998 {
1999 	/* final cleanup: this is not allowed to fail */
2000 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
2001 
2002 	VM_MAP_ZAP_DECLARE(zap);
2003 
2004 	vm_map_lock(map);
2005 
2006 	map->terminated = true;
2007 	/* clean up regular map entries */
2008 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
2009 	    KMEM_GUARD_NONE, &zap);
2010 	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
2011 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
2012 	    KMEM_GUARD_NONE, &zap);
2013 
2014 	vm_map_disable_hole_optimization(map);
2015 	vm_map_corpse_footprint_destroy(map);
2016 
2017 	vm_map_unlock(map);
2018 
2019 	vm_map_zap_dispose(&zap);
2020 
2021 	assert(map->hdr.nentries == 0);
2022 
2023 	if (map->pmap) {
2024 		pmap_destroy(map->pmap);
2025 	}
2026 
2027 	lck_rw_destroy(&map->lock, &vm_map_lck_grp);
2028 
2029 #if CONFIG_MAP_RANGES
2030 	kfree_data(map->extra_ranges,
2031 	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
2032 #endif
2033 
2034 	zfree_id(ZONE_ID_VM_MAP, map);
2035 }
2036 
2037 /*
2038  * Returns pid of the task with the largest number of VM map entries.
2039  * Used in the zone-map-exhaustion jetsam path.
2040  */
2041 pid_t
2042 find_largest_process_vm_map_entries(void)
2043 {
2044 	pid_t victim_pid = -1;
2045 	int max_vm_map_entries = 0;
2046 	task_t task = TASK_NULL;
2047 	queue_head_t *task_list = &tasks;
2048 
2049 	lck_mtx_lock(&tasks_threads_lock);
2050 	queue_iterate(task_list, task, task_t, tasks) {
2051 		if (task == kernel_task || !task->active) {
2052 			continue;
2053 		}
2054 
2055 		vm_map_t task_map = task->map;
2056 		if (task_map != VM_MAP_NULL) {
2057 			int task_vm_map_entries = task_map->hdr.nentries;
2058 			if (task_vm_map_entries > max_vm_map_entries) {
2059 				max_vm_map_entries = task_vm_map_entries;
2060 				victim_pid = pid_from_task(task);
2061 			}
2062 		}
2063 	}
2064 	lck_mtx_unlock(&tasks_threads_lock);
2065 
2066 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2067 	return victim_pid;
2068 }
2069 
2070 
2071 /*
2072  *	vm_map_lookup_entry:	[ internal use only ]
2073  *
2074  *	Calls into the vm map store layer to find the map
2075  *	entry containing (or immediately preceding) the
2076  *	specified address in the given map; the entry is returned
2077  *	in the "entry" parameter.  The boolean
2078  *	result indicates whether the address is
2079  *	actually contained in the map.
2080  */
2081 boolean_t
2082 vm_map_lookup_entry(
2083 	vm_map_t        map,
2084 	vm_map_offset_t address,
2085 	vm_map_entry_t  *entry)         /* OUT */
2086 {
2087 	bool result = false;
2088 
2089 #if CONFIG_KERNEL_TAGGING
2090 	if (VM_KERNEL_ADDRESS(address)) {
2091 		address = vm_memtag_canonicalize_kernel(address);
2092 	}
2093 #endif /* CONFIG_KERNEL_TAGGING */
2094 
2095 #if CONFIG_PROB_GZALLOC
2096 	if (map->pmap == kernel_pmap) {
2097 		assertf(!pgz_owned(address),
2098 		    "it is the responsibility of callers to unguard PGZ addresses");
2099 	}
2100 #endif /* CONFIG_PROB_GZALLOC */
2101 	result = vm_map_store_lookup_entry( map, address, entry );
2102 
2103 	return result;
2104 }
2105 
2106 boolean_t
2107 vm_map_lookup_entry_or_next(
2108 	vm_map_t        map,
2109 	vm_map_offset_t address,
2110 	vm_map_entry_t  *entry)         /* OUT */
2111 {
2112 	if (vm_map_lookup_entry(map, address, entry)) {
2113 		return true;
2114 	}
2115 
2116 	*entry = (*entry)->vme_next;
2117 	return false;
2118 }
2119 
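/*
 * Usage sketch (illustrative): walk every entry intersecting [start, end)
 * regardless of whether "start" falls inside an entry or in a hole:
 *
 *	vm_map_entry_t entry;
 *
 *	(void)vm_map_lookup_entry_or_next(map, start, &entry);
 *	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
 *	    entry = entry->vme_next) {
 *	        // ... process "entry" ...
 *	}
 */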
2120 #if CONFIG_PROB_GZALLOC
2121 boolean_t
2122 vm_map_lookup_entry_allow_pgz(
2123 	vm_map_t        map,
2124 	vm_map_offset_t address,
2125 	vm_map_entry_t  *entry)         /* OUT */
2126 {
2127 #if CONFIG_KERNEL_TAGGING
2128 	if (VM_KERNEL_ADDRESS(address)) {
2129 		address = vm_memtag_canonicalize_kernel(address);
2130 	}
2131 #endif /* CONFIG_KERNEL_TAGGING */
2132 
2133 	return vm_map_store_lookup_entry( map, address, entry );
2134 }
2135 #endif /* CONFIG_PROB_GZALLOC */
2136 
2137 /*
2138  *	Routine:	vm_map_range_invalid_panic
2139  *	Purpose:
2140  *			Panic on detection of an invalid range id.
2141  */
2142 __abortlike
2143 static void
2144 vm_map_range_invalid_panic(
2145 	vm_map_t                map,
2146 	vm_map_range_id_t       range_id)
2147 {
2148 	panic("invalid range ID (%u) for map %p", range_id, map);
2149 }
2150 
2151 /*
2152  *	Routine:	vm_map_get_range
2153  *	Purpose:
2154  *			Adjust bounds based on security policy.
2155  */
2156 static struct mach_vm_range
2157 vm_map_get_range(
2158 	vm_map_t                map,
2159 	vm_map_address_t       *address,
2160 	vm_map_kernel_flags_t  *vmk_flags,
2161 	vm_map_size_t           size,
2162 	bool                   *is_ptr)
2163 {
2164 	struct mach_vm_range effective_range = {};
2165 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2166 
2167 	if (map == kernel_map) {
2168 		effective_range = kmem_ranges[range_id];
2169 
2170 		if (startup_phase >= STARTUP_SUB_KMEM) {
2171 			/*
2172 			 * Hint provided by caller is zeroed as the range is restricted to a
2173 			 * subset of the entire kernel_map VA, which could put the hint outside
2174 			 * the range, causing vm_map_store_find_space to fail.
2175 			 */
2176 			*address = 0ull;
2177 			/*
2178 			 * Ensure that range_id passed in by the caller is within meaningful
2179 			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2180 			 * to fail as the corresponding range is invalid. Range id larger than
2181 			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
2182 			 */
2183 			if ((range_id == KMEM_RANGE_ID_NONE) ||
2184 			    (range_id > KMEM_RANGE_ID_MAX)) {
2185 				vm_map_range_invalid_panic(map, range_id);
2186 			}
2187 
2188 			/*
2189 			 * Pointer ranges use kmem_locate_space to do allocations.
2190 			 *
2191 			 * Non pointer fronts look like [ Small | Large | Permanent ]
2192 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2193 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2194 			 * use the entire range.
2195 			 */
2196 			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2197 				*is_ptr = true;
2198 			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2199 				effective_range = kmem_large_ranges[range_id];
2200 			}
2201 		}
2202 #if CONFIG_MAP_RANGES
2203 	} else if (map->uses_user_ranges) {
2204 		switch (range_id) {
2205 		case UMEM_RANGE_ID_DEFAULT:
2206 			effective_range = map->default_range;
2207 			break;
2208 		case UMEM_RANGE_ID_HEAP:
2209 			effective_range = map->data_range;
2210 			break;
2211 		case UMEM_RANGE_ID_LARGE_FILE:
2212 			if (map->large_file_range.min_address != map->large_file_range.max_address) {
2213 				/* large file range is configured and should be used */
2214 				effective_range = map->large_file_range;
2215 			} else {
2216 				/*
2217 				 * the user asking for this user range might not have the
2218 				 * permissions to use the large file range (i.e., it doesn't
2219 				 * hold the correct entitlement), so we give it the data range
2220 				 * instead
2221 				 */
2222 				effective_range = map->data_range;
2223 			}
2224 			break;
2225 		case UMEM_RANGE_ID_FIXED:
2226 			/*
2227 			 * anywhere allocations with an address in "FIXED"
2228 			 * makes no sense, leave the range empty
2229 			 */
2230 			break;
2231 
2232 		default:
2233 			vm_map_range_invalid_panic(map, range_id);
2234 		}
2235 #endif /* CONFIG_MAP_RANGES */
2236 	} else {
2237 		/*
2238 		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
2239 		 * If the minimum is 0, bump it up by PAGE_SIZE.  We want to
2240 		 * limit allocations of PAGEZERO to explicit requests: its
2241 		 * normal use is to catch dereferences of NULL, and many
2242 		 * applications also treat pointers with a value of 0 as
2243 		 * special, so suddenly having address 0 contain usable
2244 		 * memory would tend to confuse those applications.
2245 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2246 		effective_range.max_address = map->max_offset;
2247 	}
2248 
2249 	return effective_range;
2250 }
2251 
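/*
 * Behavioral summary (no policy implied beyond the code above): for the
 * kernel_map the range comes from kmem_ranges[] (or kmem_large_ranges[]
 * for large non-pointer allocations) once STARTUP_SUB_KMEM has passed;
 * for user maps with ranges enabled it is the per-map range selected by
 * range_id; otherwise it degenerates to
 *
 *	{ MAX(map->min_offset, VM_MAP_PAGE_SIZE(map)), map->max_offset }
 *
 * so page zero is never handed out for "anywhere" allocations unless it
 * is explicitly requested.
 */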
2252 kern_return_t
2253 vm_map_locate_space_anywhere(
2254 	vm_map_t                map,
2255 	vm_map_size_t           size,
2256 	vm_map_offset_t         mask,
2257 	vm_map_kernel_flags_t   vmk_flags,
2258 	vm_map_offset_t        *start_inout,
2259 	vm_map_entry_t         *entry_out)
2260 {
2261 	struct mach_vm_range effective_range = {};
2262 	vm_map_size_t   guard_offset;
2263 	vm_map_offset_t hint, limit;
2264 	vm_map_entry_t  entry;
2265 	bool            is_kmem_ptr_range = false;
2266 
2267 	/*
2268 	 * Only supported by vm_map_enter() with a fixed address.
2269 	 */
2270 	assert(!vmk_flags.vmf_fixed);
2271 	assert(!vmk_flags.vmkf_beyond_max);
2272 
2273 	if (__improbable(map->wait_for_space)) {
2274 		/*
2275 		 * support for "wait_for_space" is minimal,
2276 		 * its only consumer is the ipc_kernel_copy_map.
2277 		 */
2278 		assert(!map->holelistenabled &&
2279 		    !vmk_flags.vmkf_last_free &&
2280 		    !vmk_flags.vmkf_keep_map_locked &&
2281 		    !vmk_flags.vmkf_map_jit &&
2282 		    !vmk_flags.vmf_random_addr &&
2283 		    *start_inout <= map->min_offset);
2284 	} else if (vmk_flags.vmkf_last_free) {
2285 		assert(!vmk_flags.vmkf_map_jit &&
2286 		    !vmk_flags.vmf_random_addr);
2287 	}
2288 
2289 	if (vmk_flags.vmkf_guard_before) {
2290 		guard_offset = VM_MAP_PAGE_SIZE(map);
2291 		assert(size > guard_offset);
2292 		size -= guard_offset;
2293 	} else {
2294 		assert(size != 0);
2295 		guard_offset = 0;
2296 	}
2297 
2298 	if (__improbable(!vm_map_is_map_size_valid(
2299 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
2300 		return KERN_NO_SPACE;
2301 	}
2302 
2303 	/*
2304 	 * Validate range_id from flags and get associated range
2305 	 */
2306 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2307 	    &is_kmem_ptr_range);
2308 
2309 	if (is_kmem_ptr_range) {
2310 		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2311 		           vmk_flags.vmkf_last_free, start_inout, entry_out);
2312 	}
2313 
2314 #if XNU_TARGET_OS_OSX
2315 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2316 		assert(map != kernel_map);
2317 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2318 	}
2319 #endif /* XNU_TARGET_OS_OSX */
2320 
2321 again:
2322 	if (vmk_flags.vmkf_last_free) {
2323 		hint = *start_inout;
2324 
2325 		if (hint == 0 || hint > effective_range.max_address) {
2326 			hint = effective_range.max_address;
2327 		}
2328 		if (hint <= effective_range.min_address) {
2329 			return KERN_NO_SPACE;
2330 		}
2331 		limit = effective_range.min_address;
2332 	} else {
2333 		hint = *start_inout;
2334 
2335 		if (vmk_flags.vmkf_map_jit) {
2336 			if (map->jit_entry_exists &&
2337 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2338 				return KERN_INVALID_ARGUMENT;
2339 			}
2340 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2341 				vmk_flags.vmf_random_addr = true;
2342 			}
2343 		}
2344 
2345 		if (vmk_flags.vmf_random_addr) {
2346 			kern_return_t kr;
2347 
2348 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2349 			if (kr != KERN_SUCCESS) {
2350 				return kr;
2351 			}
2352 		}
2353 #if __x86_64__
2354 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2355 		    !map->disable_vmentry_reuse &&
2356 		    map->vmmap_high_start != 0) {
2357 			hint = map->vmmap_high_start;
2358 		}
2359 #endif /* __x86_64__ */
2360 
2361 		if (hint < effective_range.min_address) {
2362 			hint = effective_range.min_address;
2363 		}
2364 		if (effective_range.max_address <= hint) {
2365 			return KERN_NO_SPACE;
2366 		}
2367 
2368 		limit = effective_range.max_address;
2369 	}
2370 	entry = vm_map_store_find_space(map,
2371 	    hint, limit, vmk_flags.vmkf_last_free,
2372 	    guard_offset, size, mask,
2373 	    start_inout);
2374 
2375 	if (__improbable(entry == NULL)) {
2376 		if (map->wait_for_space &&
2377 		    guard_offset + size <=
2378 		    effective_range.max_address - effective_range.min_address) {
2379 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2380 			vm_map_unlock(map);
2381 			thread_block(THREAD_CONTINUE_NULL);
2382 			vm_map_lock(map);
2383 			goto again;
2384 		}
2385 		return KERN_NO_SPACE;
2386 	}
2387 
2388 	if (entry_out) {
2389 		*entry_out = entry;
2390 	}
2391 	return KERN_SUCCESS;
2392 }
2393 
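/*
 * Caller-side sketch (illustrative): "anywhere" placement takes its hint
 * in *start_inout and returns the chosen address the same way; the map
 * must already be locked by the caller:
 *
 *	vm_map_lock(map);
 *	addr = 0;       // no particular hint
 *	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
 *	    &addr, &entry);
 *	// on KERN_SUCCESS, "addr" is the start of the hole and "entry"
 *	// precedes it; the caller still has to create and link a map entry
 */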
2394 /*!
2395  * @function vm_map_locate_space_fixed()
2396  *
2397  * @brief
2398  * Locate (no reservation) a range in the specified VM map at a fixed address.
2399  *
2400  * @param map           the map to scan for memory, must be locked.
2401  * @param start         the fixed address trying to be reserved
2402  * @param size          the size of the allocation to make.
2403  * @param mask          an alignment mask the allocation must respect,
2404  * @param vmk_flags     the vm map kernel flags to influence this call.
2405  *                      vmk_flags.vmf_anywhere must not be set.
2406  * @param entry_out     the entry right before the hole.
2407  * @param zap_list      a zap list of entries to clean up after the call.
2408  *
2409  * @returns
2410  * - KERN_SUCCESS in case of success and no conflicting entry is found,
2411  *   in which case entry_out is set to the entry before the hole.
2412  *
2413  * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2414  *   in which case entry_out is set to the conflicting entry;
2415  *   callers MUST handle this error explicitly.
2416  *
2417  * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2418  *   would result in a mapping outside of the map.
2419  *
2420  * - KERN_NO_SPACE for various cases of unrecoverable failures.
2421  */
2422 static kern_return_t
2423 vm_map_locate_space_fixed(
2424 	vm_map_t                map,
2425 	vm_map_offset_t         start,
2426 	vm_map_size_t           size,
2427 	vm_map_offset_t         mask,
2428 	vm_map_kernel_flags_t   vmk_flags,
2429 	vm_map_entry_t         *entry_out,
2430 	vm_map_zap_t            zap_list)
2431 {
2432 	vm_map_offset_t effective_min_offset, effective_max_offset;
2433 	vm_map_entry_t  entry;
2434 	vm_map_offset_t end;
2435 
2436 	assert(vmk_flags.vmf_fixed);
2437 
2438 	effective_min_offset = map->min_offset;
2439 	effective_max_offset = map->max_offset;
2440 
2441 	if (vmk_flags.vmkf_beyond_max) {
2442 		/*
2443 		 * Allow an insertion beyond the map's max offset.
2444 		 */
2445 		effective_max_offset = 0x00000000FFFFF000ULL;
2446 		if (vm_map_is_64bit(map)) {
2447 			effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2448 		}
2449 #if XNU_TARGET_OS_OSX
2450 	} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2451 		effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2452 #endif /* XNU_TARGET_OS_OSX */
2453 	}
2454 
2455 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2456 	    !vmk_flags.vmf_overwrite &&
2457 	    map->pmap == kernel_pmap &&
2458 	    vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2459 		/*
2460 		 * Force realloc() to switch to a new allocation,
2461 		 * to prevent 4k-fragmented virtual ranges.
2462 		 */
2463 //		DEBUG4K_ERROR("no realloc in place");
2464 		return KERN_NO_SPACE;
2465 	}
2466 
2467 	/*
2468 	 *	Verify that:
2469 	 *		the address doesn't itself violate
2470 	 *		the mask requirement.
2471 	 */
2472 
2473 	if ((start & mask) != 0) {
2474 		return KERN_NO_SPACE;
2475 	}
2476 
2477 	if (__improbable(!vm_map_is_map_size_valid(
2478 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
2479 		return KERN_NO_SPACE;
2480 	}
2481 
2482 #if CONFIG_MAP_RANGES
2483 	if (map->uses_user_ranges) {
2484 		struct mach_vm_range r;
2485 
2486 		vm_map_user_range_resolve(map, start, 1, &r);
2487 		if (r.max_address == 0) {
2488 			return KERN_INVALID_ADDRESS;
2489 		}
2490 		effective_min_offset = r.min_address;
2491 		effective_max_offset = r.max_address;
2492 	}
2493 #endif /* CONFIG_MAP_RANGES */
2494 
2495 	if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2496 	    (map == kernel_map)) {
2497 		mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2498 		effective_min_offset = r->min_address;
2499 		effective_max_offset = r->max_address;
2500 	}
2501 
2502 	/*
2503 	 *	...	the address is within bounds
2504 	 */
2505 
2506 	end = start + size;
2507 
2508 	if ((start < effective_min_offset) ||
2509 	    (end > effective_max_offset) ||
2510 	    (start >= end)) {
2511 		return KERN_INVALID_ADDRESS;
2512 	}
2513 
2514 	if (vmk_flags.vmf_overwrite) {
2515 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2516 		kern_return_t remove_kr;
2517 
2518 		/*
2519 		 * Fixed mapping and "overwrite" flag: attempt to
2520 		 * remove all existing mappings in the specified
2521 		 * address range, saving them in our "zap_list".
2522 		 *
2523 		 * This avoids releasing the VM map lock in
2524 		 * vm_map_entry_delete() and allows atomicity
2525 		 * when we want to replace some mappings with a new one.
2526 		 * It also allows us to restore the old VM mappings if the
2527 		 * new mapping fails.
2528 		 */
2529 		remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2530 
2531 		if (vmk_flags.vmkf_overwrite_immutable) {
2532 			/* we can overwrite immutable mappings */
2533 			remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2534 		}
2535 		if (vmk_flags.vmkf_remap_prot_copy) {
2536 			remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2537 		}
2538 		remove_kr = vm_map_delete(map, start, end, remove_flags,
2539 		    KMEM_GUARD_NONE, zap_list).kmr_return;
2540 		if (remove_kr) {
2541 			/* XXX FBDP restore zap_list? */
2542 			return remove_kr;
2543 		}
2544 	}
2545 
2546 	/*
2547 	 *	...	the starting address isn't allocated
2548 	 */
2549 
2550 	if (vm_map_lookup_entry(map, start, &entry)) {
2551 		*entry_out = entry;
2552 		return KERN_MEMORY_PRESENT;
2553 	}
2554 
2555 	/*
2556 	 *	...	the next region doesn't overlap the
2557 	 *		end point.
2558 	 */
2559 
2560 	if ((entry->vme_next != vm_map_to_entry(map)) &&
2561 	    (entry->vme_next->vme_start < end)) {
2562 		return KERN_NO_SPACE;
2563 	}
2564 
2565 	*entry_out = entry;
2566 	return KERN_SUCCESS;
2567 }
2568 
2569 /*
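/*
 * Caller-side sketch (illustrative): KERN_MEMORY_PRESENT must be handled
 * explicitly, because "entry_out" then refers to the conflicting entry
 * rather than to the entry preceding a usable hole:
 *
 *	kr = vm_map_locate_space_fixed(map, start, size, mask, vmk_flags,
 *	    &entry, &zap);
 *	if (kr == KERN_MEMORY_PRESENT) {
 *	        // e.g. VM_FLAGS_ALREADY: verify the existing mapping
 *	        // matches what was requested
 *	} else if (kr != KERN_SUCCESS) {
 *	        return kr;
 *	}
 *
 * vm_map_enter() below follows this pattern for VM_FLAGS_ALREADY.
 */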
2570  *	Routine:	vm_map_find_space
2571  *	Purpose:
2572  *		Allocate a range in the specified virtual address map,
2573  *		returning the entry allocated for that range.
2574  *		Used by kmem_alloc, etc.
2575  *
2576  *		The map must NOT be locked. It will be returned locked
2577  *		on KERN_SUCCESS, unlocked on failure.
2578  *
2579  *		If an entry is allocated, the object/offset fields
2580  *		are initialized to zero.
2581  */
2582 kern_return_t
2583 vm_map_find_space(
2584 	vm_map_t                map,
2585 	vm_map_offset_t         hint_address,
2586 	vm_map_size_t           size,
2587 	vm_map_offset_t         mask,
2588 	vm_map_kernel_flags_t   vmk_flags,
2589 	vm_map_entry_t          *o_entry)       /* OUT */
2590 {
2591 	vm_map_entry_t          new_entry, entry;
2592 	kern_return_t           kr;
2593 
2594 	if (size == 0) {
2595 		return KERN_INVALID_ARGUMENT;
2596 	}
2597 
2598 	new_entry = vm_map_entry_create(map);
2599 	new_entry->use_pmap = true;
2600 	new_entry->protection = VM_PROT_DEFAULT;
2601 	new_entry->max_protection = VM_PROT_ALL;
2602 
2603 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2604 		new_entry->map_aligned = true;
2605 	}
2606 	if (vmk_flags.vmf_permanent) {
2607 		new_entry->vme_permanent = true;
2608 	}
2609 
2610 	vm_map_lock(map);
2611 
2612 	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2613 	    &hint_address, &entry);
2614 	if (kr != KERN_SUCCESS) {
2615 		vm_map_unlock(map);
2616 		vm_map_entry_dispose(new_entry);
2617 		return kr;
2618 	}
2619 	new_entry->vme_start = hint_address;
2620 	new_entry->vme_end = hint_address + size;
2621 
2622 	/*
2623 	 *	At this point,
2624 	 *
2625 	 *	- new_entry's "vme_start" and "vme_end" should define
2626 	 *	  the endpoints of the available new range,
2627 	 *
2628 	 *	- and "entry" should refer to the region before
2629 	 *	  the new range,
2630 	 *
2631 	 *	- and the map should still be locked.
2632 	 */
2633 
2634 	assert(page_aligned(new_entry->vme_start));
2635 	assert(page_aligned(new_entry->vme_end));
2636 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2637 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2638 
2639 
2640 	/*
2641 	 *	Insert the new entry into the list
2642 	 */
2643 
2644 	vm_map_store_entry_link(map, entry, new_entry,
2645 	    VM_MAP_KERNEL_FLAGS_NONE);
2646 	map->size += size;
2647 
2648 	/*
2649 	 *	Update the lookup hint
2650 	 */
2651 	SAVE_HINT_MAP_WRITE(map, new_entry);
2652 
2653 	*o_entry = new_entry;
2654 	return KERN_SUCCESS;
2655 }
2656 
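/*
 * Usage sketch (illustrative, in the spirit of the kmem_alloc callers):
 * on success the map is returned locked and the new entry is linked but
 * has no VM object yet, so a caller typically attaches one and unlocks:
 *
 *	kr = vm_map_find_space(map, 0, size, 0, vmk_flags, &entry);
 *	if (kr == KERN_SUCCESS) {
 *	        // ... attach a VM object / offset to "entry" ...
 *	        vm_map_unlock(map);
 *	}
 */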
2657 int vm_map_pmap_enter_print = FALSE;
2658 int vm_map_pmap_enter_enable = FALSE;
2659 
2660 /*
2661  *	Routine:	vm_map_pmap_enter [internal only]
2662  *
2663  *	Description:
2664  *		Force pages from the specified object to be entered into
2665  *		the pmap at the specified address if they are present.
2666  *		As soon as a page not found in the object the scan ends.
2667  *		As soon as a page is not found in the object, the scan ends.
2668  *	Returns:
2669  *		Nothing.
2670  *
2671  *	In/out conditions:
2672  *		The source map should not be locked on entry.
2673  */
2674 __unused static void
2675 vm_map_pmap_enter(
2676 	vm_map_t                map,
2677 	vm_map_offset_t         addr,
2678 	vm_map_offset_t         end_addr,
2679 	vm_object_t             object,
2680 	vm_object_offset_t      offset,
2681 	vm_prot_t               protection)
2682 {
2683 	int                     type_of_fault;
2684 	kern_return_t           kr;
2685 	uint8_t                 object_lock_type = 0;
2686 	struct vm_object_fault_info fault_info = {
2687 		.interruptible = THREAD_UNINT,
2688 	};
2689 
2690 	if (map->pmap == 0) {
2691 		return;
2692 	}
2693 
2694 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2695 
2696 	while (addr < end_addr) {
2697 		vm_page_t       m;
2698 
2699 
2700 		/*
2701 		 * TODO:
2702 		 * From vm_map_enter(), we come into this function without the map
2703 		 * lock held or the object lock held.
2704 		 * We haven't taken a reference on the object either.
2705 		 * We should do a proper lookup on the map to make sure
2706 		 * that things are sane before we go locking objects that
2707 		 * could have been deallocated from under us.
2708 		 */
2709 
2710 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2711 		vm_object_lock(object);
2712 
2713 		m = vm_page_lookup(object, offset);
2714 
2715 		if (m == VM_PAGE_NULL || m->vmp_busy || vm_page_is_fictitious(m) ||
2716 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2717 			vm_object_unlock(object);
2718 			return;
2719 		}
2720 
2721 		if (vm_map_pmap_enter_print) {
2722 			printf("vm_map_pmap_enter:");
2723 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2724 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2725 		}
2726 		type_of_fault = DBG_CACHE_HIT_FAULT;
2727 		kr = vm_fault_enter(m, map->pmap,
2728 		    addr,
2729 		    PAGE_SIZE, 0,
2730 		    protection, protection,
2731 		    VM_PAGE_WIRED(m),
2732 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2733 		    &fault_info,
2734 		    NULL,                  /* need_retry */
2735 		    &type_of_fault,
2736 		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2737 
2738 		vm_object_unlock(object);
2739 
2740 		offset += PAGE_SIZE_64;
2741 		addr += PAGE_SIZE;
2742 	}
2743 }
2744 
2745 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2746 static kern_return_t
2747 vm_map_random_address_for_size(
2748 	vm_map_t                map,
2749 	vm_map_offset_t        *address,
2750 	vm_map_size_t           size,
2751 	vm_map_kernel_flags_t   vmk_flags)
2752 {
2753 	kern_return_t   kr = KERN_SUCCESS;
2754 	int             tries = 0;
2755 	vm_map_offset_t random_addr = 0;
2756 	vm_map_offset_t hole_end;
2757 
2758 	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
2759 	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
2760 	vm_map_size_t   vm_hole_size = 0;
2761 	vm_map_size_t   addr_space_size;
2762 	bool            is_kmem_ptr;
2763 	struct mach_vm_range effective_range;
2764 
2765 	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2766 	    &is_kmem_ptr);
2767 
2768 	addr_space_size = effective_range.max_address - effective_range.min_address;
2769 	if (size >= addr_space_size) {
2770 		return KERN_NO_SPACE;
2771 	}
2772 	addr_space_size -= size;
2773 
2774 	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2775 
2776 	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2777 		if (startup_phase < STARTUP_SUB_ZALLOC) {
2778 			random_addr = (vm_map_offset_t)early_random();
2779 		} else {
2780 			random_addr = (vm_map_offset_t)random();
2781 		}
2782 		random_addr <<= VM_MAP_PAGE_SHIFT(map);
2783 		random_addr = vm_map_trunc_page(
2784 			effective_range.min_address + (random_addr % addr_space_size),
2785 			VM_MAP_PAGE_MASK(map));
2786 
2787 #if CONFIG_PROB_GZALLOC
2788 		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2789 			continue;
2790 		}
2791 #endif /* CONFIG_PROB_GZALLOC */
2792 
2793 		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2794 			if (prev_entry == vm_map_to_entry(map)) {
2795 				next_entry = vm_map_first_entry(map);
2796 			} else {
2797 				next_entry = prev_entry->vme_next;
2798 			}
2799 			if (next_entry == vm_map_to_entry(map)) {
2800 				hole_end = vm_map_max(map);
2801 			} else {
2802 				hole_end = next_entry->vme_start;
2803 			}
2804 			vm_hole_size = hole_end - random_addr;
2805 			if (vm_hole_size >= size) {
2806 				*address = random_addr;
2807 				break;
2808 			}
2809 		}
2810 		tries++;
2811 	}
2812 
2813 	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2814 		kr = KERN_NO_SPACE;
2815 	}
2816 	return kr;
2817 }
2818 
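/*
 * Behavioral note: each attempt draws a page-aligned address within the
 * effective range and accepts it only if the hole starting there is at
 * least "size" bytes long; after MAX_TRIES_TO_GET_RANDOM_ADDRESS failed
 * attempts the routine gives up with KERN_NO_SPACE.
 */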
2819 static boolean_t
2820 vm_memory_malloc_no_cow(
2821 	int alias)
2822 {
2823 	uint64_t alias_mask;
2824 
2825 	if (!malloc_no_cow) {
2826 		return FALSE;
2827 	}
2828 	if (alias > 63) {
2829 		return FALSE;
2830 	}
2831 	alias_mask = 1ULL << alias;
2832 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2833 		return TRUE;
2834 	}
2835 	return FALSE;
2836 }
2837 
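/*
 * Worked example (hypothetical mask value): if the boot-time mask were
 * (1ULL << VM_MEMORY_MALLOC) | (1ULL << VM_MEMORY_MALLOC_SMALL), only
 * those two aliases would return TRUE here; aliases above 63 can never
 * be represented in the 64-bit mask and are always rejected.
 */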
2838 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2839 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2840 /*
2841  *	Routine:	vm_map_enter
2842  *
2843  *	Description:
2844  *		Allocate a range in the specified virtual address map.
2845  *		The resulting range will refer to memory defined by
2846  *		the given memory object and offset into that object.
2847  *
2848  *		Arguments are as defined in the vm_map call.
2849  */
2850 static unsigned int vm_map_enter_restore_successes = 0;
2851 static unsigned int vm_map_enter_restore_failures = 0;
2852 kern_return_t
2853 vm_map_enter(
2854 	vm_map_t                map,
2855 	vm_map_offset_t         *address,       /* IN/OUT */
2856 	vm_map_size_t           size,
2857 	vm_map_offset_t         mask,
2858 	vm_map_kernel_flags_t   vmk_flags,
2859 	vm_object_t             object,
2860 	vm_object_offset_t      offset,
2861 	boolean_t               needs_copy,
2862 	vm_prot_t               cur_protection,
2863 	vm_prot_t               max_protection,
2864 	vm_inherit_t            inheritance)
2865 {
2866 	vm_map_entry_t          entry, new_entry;
2867 	vm_map_offset_t         start, tmp_start, tmp_offset;
2868 	vm_map_offset_t         end, tmp_end;
2869 	vm_map_offset_t         tmp2_start, tmp2_end;
2870 	vm_map_offset_t         step;
2871 	kern_return_t           result = KERN_SUCCESS;
2872 	bool                    map_locked = FALSE;
2873 	bool                    pmap_empty = TRUE;
2874 	bool                    new_mapping_established = FALSE;
2875 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2876 	const bool              anywhere = !vmk_flags.vmf_fixed;
2877 	const bool              purgable = vmk_flags.vmf_purgeable;
2878 	const bool              no_cache = vmk_flags.vmf_no_cache;
2879 	const bool              is_submap = vmk_flags.vmkf_submap;
2880 	const bool              permanent = vmk_flags.vmf_permanent;
2881 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2882 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2883 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2884 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2885 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2886 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2887 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2888 	const vm_tag_t          alias = vmk_flags.vm_tag;
2889 	vm_tag_t                user_alias;
2890 	kern_return_t           kr;
2891 	bool                    clear_map_aligned = FALSE;
2892 	vm_map_size_t           chunk_size = 0;
2893 	vm_object_t             caller_object;
2894 	VM_MAP_ZAP_DECLARE(zap_old_list);
2895 	VM_MAP_ZAP_DECLARE(zap_new_list);
2896 
2897 	caller_object = object;
2898 
2899 	assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
2900 
2901 	if (vmk_flags.vmf_4gb_chunk) {
2902 #if defined(__LP64__)
2903 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2904 #else /* __LP64__ */
2905 		chunk_size = ANON_CHUNK_SIZE;
2906 #endif /* __LP64__ */
2907 	} else {
2908 		chunk_size = ANON_CHUNK_SIZE;
2909 	}
2910 
2911 
2912 
2913 	if (superpage_size) {
2914 		if (object != VM_OBJECT_NULL) {
2915 			/* caller can't provide their own VM object */
2916 			return KERN_INVALID_ARGUMENT;
2917 		}
2918 		switch (superpage_size) {
2919 			/*
2920 			 * Note that the current implementation only supports
2921 			 * a single size for superpages, SUPERPAGE_SIZE, per
2922 			 * architecture. As soon as more sizes are supposed
2923 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2924 			 * with a lookup of the size depending on superpage_size.
2925 			 */
2926 #ifdef __x86_64__
2927 		case SUPERPAGE_SIZE_ANY:
2928 			/* handle it like 2 MB and round up to page size */
2929 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2930 			OS_FALLTHROUGH;
2931 		case SUPERPAGE_SIZE_2MB:
2932 			break;
2933 #endif
2934 		default:
2935 			return KERN_INVALID_ARGUMENT;
2936 		}
2937 		mask = SUPERPAGE_SIZE - 1;
2938 		if (size & (SUPERPAGE_SIZE - 1)) {
2939 			return KERN_INVALID_ARGUMENT;
2940 		}
2941 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2942 	}
2943 
2944 
2945 	if ((cur_protection & VM_PROT_WRITE) &&
2946 	    (cur_protection & VM_PROT_EXECUTE) &&
2947 #if XNU_TARGET_OS_OSX
2948 	    map->pmap != kernel_pmap &&
2949 	    (cs_process_global_enforcement() ||
2950 	    (vmk_flags.vmkf_cs_enforcement_override
2951 	    ? vmk_flags.vmkf_cs_enforcement
2952 	    : (vm_map_cs_enforcement(map)
2953 #if __arm64__
2954 	    || !VM_MAP_IS_EXOTIC(map)
2955 #endif /* __arm64__ */
2956 	    ))) &&
2957 #endif /* XNU_TARGET_OS_OSX */
2958 #if CODE_SIGNING_MONITOR
2959 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2960 #endif
2961 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2962 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2963 	    !entry_for_jit) {
2964 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2965 
2966 		DTRACE_VM3(cs_wx,
2967 		    uint64_t, 0,
2968 		    uint64_t, 0,
2969 		    vm_prot_t, cur_protection);
2970 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2971 		    proc_selfpid(),
2972 		    (get_bsdtask_info(current_task())
2973 		    ? proc_name_address(get_bsdtask_info(current_task()))
2974 		    : "?"),
2975 		    __FUNCTION__,
2976 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2977 		cur_protection &= ~VM_PROT_EXECUTE;
2978 		if (vm_protect_wx_fail) {
2979 			return KERN_PROTECTION_FAILURE;
2980 		}
2981 	}
2982 
2983 	if (entry_for_jit
2984 	    && cur_protection != VM_PROT_ALL) {
2985 		/*
2986 		 * Native macOS processes and all non-macOS processes are
2987 		 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2988 		 * the RWX requirement was not enforced, and thus, we must live
2989 		 * with our sins. We are now dealing with a JIT mapping without
2990 		 * RWX.
2991 		 *
2992 		 * We deal with these by letting the MAP_JIT stick in order
2993 		 * to avoid CS violations when these pages are mapped executable
2994 		 * down the line. In order to appease the page table monitor (you
2995 		 * know what I'm talking about), these pages will end up being
2996 		 * marked as XNU_USER_DEBUG, which will be allowed because we
2997 		 * don't enforce the code signing monitor on macOS systems. If
2998 		 * the user-space application ever changes permissions to RWX,
2999 		 * which they are allowed to since the mapping was originally
3000 		 * created with MAP_JIT, then they'll switch over to using the
3001 		 * XNU_USER_JIT type, and won't be allowed to downgrade any
3002 		 * more after that.
3003 		 *
3004 		 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
3005 		 * strictly disallowed.
3006 		 */
3007 
3008 #if XNU_TARGET_OS_OSX
3009 		/*
3010 		 * Continue to allow non-RWX JIT
3011 		 */
3012 #else
3013 		/* non-macOS: reject JIT regions without RWX */
3014 		DTRACE_VM3(cs_wx,
3015 		    uint64_t, 0,
3016 		    uint64_t, 0,
3017 		    vm_prot_t, cur_protection);
3018 		printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
3019 		    proc_selfpid(),
3020 		    (get_bsdtask_info(current_task())
3021 		    ? proc_name_address(get_bsdtask_info(current_task()))
3022 		    : "?"),
3023 		    __FUNCTION__,
3024 		    cur_protection);
3025 		return KERN_PROTECTION_FAILURE;
3026 #endif
3027 	}
3028 
3029 	/*
3030 	 * If the task has requested executable lockdown,
3031 	 * deny any new executable mapping.
3032 	 */
3033 	if (map->map_disallow_new_exec == TRUE) {
3034 		if (cur_protection & VM_PROT_EXECUTE) {
3035 			return KERN_PROTECTION_FAILURE;
3036 		}
3037 	}
3038 
3039 	if (resilient_codesign) {
3040 		assert(!is_submap);
3041 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3042 		if ((cur_protection | max_protection) & reject_prot) {
3043 			return KERN_PROTECTION_FAILURE;
3044 		}
3045 	}
3046 
3047 	if (resilient_media) {
3048 		assert(!is_submap);
3049 //		assert(!needs_copy);
3050 		if (object != VM_OBJECT_NULL &&
3051 		    !object->internal) {
3052 			/*
3053 			 * This mapping is directly backed by an external
3054 			 * memory manager (e.g. a vnode pager for a file):
3055 			 * we would not have any safe place to inject
3056 			 * a zero-filled page if an actual page is not
3057 			 * available, without possibly impacting the actual
3058 			 * contents of the mapped object (e.g. the file),
3059 			 * so we can't provide any media resiliency here.
3060 			 */
3061 			return KERN_INVALID_ARGUMENT;
3062 		}
3063 	}
3064 
3065 	if (entry_for_tpro) {
3066 		/*
3067 		 * TPRO overrides the effective permissions of the region
3068 		 * and explicitly maps as RW. Ensure we have been passed
3069 		 * the expected permissions. We accept `cur_protections`
3070 		 * RO as that will be handled on fault.
3071 		 */
3072 		if (!(max_protection & VM_PROT_READ) ||
3073 		    !(max_protection & VM_PROT_WRITE) ||
3074 		    !(cur_protection & VM_PROT_READ)) {
3075 			return KERN_PROTECTION_FAILURE;
3076 		}
3077 
3078 		/*
3079 		 * We can now downgrade the cur_protection to RO. This is a mild lie
3080 		 * to the VM layer. But TPRO will be responsible for toggling the
3081 		 * protections between RO/RW
3082 		 */
3083 		cur_protection = VM_PROT_READ;
3084 	}
3085 
3086 	if (is_submap) {
3087 		vm_map_t submap;
3088 		if (purgable) {
3089 			/* submaps can not be purgeable */
3090 			return KERN_INVALID_ARGUMENT;
3091 		}
3092 		if (object == VM_OBJECT_NULL) {
3093 			/* submaps can not be created lazily */
3094 			return KERN_INVALID_ARGUMENT;
3095 		}
3096 		submap = (vm_map_t) object;
3097 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3098 			/* page size mismatch */
3099 			return KERN_INVALID_ARGUMENT;
3100 		}
3101 	}
3102 	if (vmk_flags.vmkf_already) {
3103 		/*
3104 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3105 		 * is already present.  For it to be meaningful, the requested
3106 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3107 		 * we shouldn't try and remove what was mapped there first
3108 		 * (!VM_FLAGS_OVERWRITE).
3109 		 */
3110 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3111 			return KERN_INVALID_ARGUMENT;
3112 		}
3113 	}
3114 
3115 	if (size == 0 ||
3116 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3117 		*address = 0;
3118 		return KERN_INVALID_ARGUMENT;
3119 	}
3120 
3121 	if (map->pmap == kernel_pmap) {
3122 		user_alias = VM_KERN_MEMORY_NONE;
3123 	} else {
3124 		user_alias = alias;
3125 	}
3126 
3127 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3128 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3129 	}
3130 
3131 #define RETURN(value)   { result = value; goto BailOut; }
3132 
3133 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3134 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3135 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3136 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3137 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3138 	}
3139 
3140 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3141 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3142 		/*
3143 		 * In most cases, the caller rounds the size up to the
3144 		 * map's page size.
3145 		 * If we get a size that is explicitly not map-aligned here,
3146 		 * we'll have to respect the caller's wish and mark the
3147 		 * mapping as "not map-aligned" to avoid tripping the
3148 		 * map alignment checks later.
3149 		 */
3150 		clear_map_aligned = TRUE;
3151 	}
3152 	if (!anywhere &&
3153 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3154 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3155 		/*
3156 		 * We've been asked to map at a fixed address and that
3157 		 * address is not aligned to the map's specific alignment.
3158 		 * The caller should know what it's doing (i.e. most likely
3159 		 * mapping some fragmented copy map, transferring memory from
3160 		 * a VM map with a different alignment), so clear map_aligned
3161 		 * for this new VM map entry and proceed.
3162 		 */
3163 		clear_map_aligned = TRUE;
3164 	}
3165 
3166 	/*
3167 	 * Only zero-fill objects are allowed to be purgable.
3168 	 * LP64todo - limit purgable objects to 32-bits for now
3169 	 */
3170 	if (purgable &&
3171 	    (offset != 0 ||
3172 	    (object != VM_OBJECT_NULL &&
3173 	    (object->vo_size != size ||
3174 	    object->purgable == VM_PURGABLE_DENY))
3175 #if __LP64__
3176 	    || size > ANON_MAX_SIZE
3177 #endif
3178 	    )) {
3179 		return KERN_INVALID_ARGUMENT;
3180 	}
3181 
3182 	if (__improbable(!vm_map_is_map_size_valid(
3183 		    map, size, vmk_flags.vmkf_no_soft_limit))) {
3184 		return KERN_NO_SPACE;
3185 	}
3186 
3187 	vm_map_lock(map);
3188 	map_locked = TRUE;
3189 
3190 
3191 	if (anywhere) {
3192 		result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3193 		    address, &entry);
3194 		start = *address;
3195 	} else {
3196 		start = *address;
3197 		result = vm_map_locate_space_fixed(map, start, size, mask,
3198 		    vmk_flags, &entry, &zap_old_list);
3199 	}
3200 
3201 	end = start + size;
3202 
3203 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3204 
3205 	/*
3206 	 * Check if what's already there is what we want.
3207 	 */
3208 	if (result == KERN_MEMORY_PRESENT) {
3209 		assert(!anywhere);
3210 		if (!(vmk_flags.vmkf_already)) {
3211 			RETURN(KERN_NO_SPACE);
3212 		}
3213 		tmp_start = start;
3214 		tmp_offset = offset;
3215 		if (entry->vme_start < start) {
3216 			tmp_start -= start - entry->vme_start;
3217 			tmp_offset -= start - entry->vme_start;
3218 		}
3219 		for (; entry->vme_start < end;
3220 		    entry = entry->vme_next) {
3221 			/*
3222 			 * Check if the mapping's attributes
3223 			 * match the existing map entry.
3224 			 */
3225 			if (entry == vm_map_to_entry(map) ||
3226 			    entry->vme_start != tmp_start ||
3227 			    entry->is_sub_map != is_submap ||
3228 			    VME_OFFSET(entry) != tmp_offset ||
3229 			    entry->needs_copy != needs_copy ||
3230 			    entry->protection != cur_protection ||
3231 			    entry->max_protection != max_protection ||
3232 			    entry->inheritance != inheritance ||
3233 			    entry->iokit_acct != iokit_acct ||
3234 			    VME_ALIAS(entry) != alias) {
3235 				/* not the same mapping ! */
3236 				RETURN(KERN_NO_SPACE);
3237 			}
3238 			/*
3239 			 * Check if the same object is being mapped.
3240 			 */
3241 			if (is_submap) {
3242 				if (VME_SUBMAP(entry) !=
3243 				    (vm_map_t) object) {
3244 					/* not the same submap */
3245 					RETURN(KERN_NO_SPACE);
3246 				}
3247 			} else {
3248 				if (VME_OBJECT(entry) != object) {
3249 					/* not the same VM object... */
3250 					vm_object_t obj2;
3251 
3252 					obj2 = VME_OBJECT(entry);
3253 					if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3254 					    (object == VM_OBJECT_NULL || object->internal)) {
3255 						/*
3256 						 * ... but both are
3257 						 * anonymous memory,
3258 						 * so equivalent.
3259 						 */
3260 					} else {
3261 						RETURN(KERN_NO_SPACE);
3262 					}
3263 				}
3264 			}
3265 
3266 			tmp_offset += entry->vme_end - entry->vme_start;
3267 			tmp_start += entry->vme_end - entry->vme_start;
3268 			if (entry->vme_end >= end) {
3269 				/* reached the end of our mapping */
3270 				break;
3271 			}
3272 		}
3273 		/* it all matches:  let's use what's already there ! */
3274 		RETURN(KERN_MEMORY_PRESENT);
3275 	}
3276 
3277 	if (result != KERN_SUCCESS) {
3278 		goto BailOut;
3279 	}
3280 
3281 
3282 	/*
3283 	 *	At this point,
3284 	 *		"start" and "end" should define the endpoints of the
3285 	 *			available new range, and
3286 	 *		"entry" should refer to the region before the new
3287 	 *			range, and
3288 	 *
3289 	 *		the map should be locked.
3290 	 */
3291 
3292 	/*
3293 	 *	See whether we can avoid creating a new entry (and object) by
3294 	 *	extending one of our neighbors.  [So far, we only attempt to
3295 	 *	extend from below.]  Note that we can never extend/join
3296 	 *	purgable objects because they need to remain distinct
3297 	 *	entities in order to implement their "volatile object"
3298 	 *	semantics.
3299 	 */
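	/*
	 * Sketch of the fast path below: if the entry immediately below ends
	 * exactly at "start", is backed by anonymous memory and matches the
	 * requested attributes, vm_object_coalesce() lets us simply grow
	 * entry->vme_end to "end" instead of allocating a new map entry and
	 * VM object.
	 */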
3300 
3301 	if (purgable ||
3302 	    entry_for_jit ||
3303 	    entry_for_tpro ||
3304 	    vm_memory_malloc_no_cow(user_alias)) {
3305 		if (superpage_size) {
3306 			/*
3307 			 * For "super page" allocations, we will allocate
3308 			 * special physically-contiguous VM objects later on,
3309 			 * so we must not have been given flags instructing us to
3310 			 * create a different kind of special VM object here.
3311 			 */
3312 			RETURN(KERN_INVALID_ARGUMENT);
3313 		}
3314 
3315 		if (object == VM_OBJECT_NULL) {
3316 			assert(!superpage_size);
3317 			object = vm_object_allocate(size, map->serial_id);
3318 			vm_object_lock(object);
3319 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3320 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3321 			if (malloc_no_cow_except_fork &&
3322 			    !purgable &&
3323 			    !entry_for_jit &&
3324 			    !entry_for_tpro &&
3325 			    vm_memory_malloc_no_cow(user_alias)) {
3326 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3327 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3328 			}
3329 			if (entry_for_jit) {
3330 				object->vo_inherit_copy_none = true;
3331 			}
3332 			if (purgable) {
3333 				task_t owner;
3334 				VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3335 				if (map->pmap == kernel_pmap) {
3336 					/*
3337 					 * Purgeable mappings made in a kernel
3338 					 * map are "owned" by the kernel itself
3339 					 * rather than the current user task
3340 					 * because they're likely to be used by
3341 					 * more than this user task (see
3342 					 * execargs_purgeable_allocate(), for
3343 					 * example).
3344 					 */
3345 					owner = kernel_task;
3346 				} else {
3347 					owner = current_task();
3348 				}
3349 				assert(object->vo_owner == NULL);
3350 				assert(object->resident_page_count == 0);
3351 				assert(object->wired_page_count == 0);
3352 				vm_purgeable_nonvolatile_enqueue(object, owner);
3353 			}
3354 			vm_object_unlock(object);
3355 			offset = (vm_object_offset_t)0;
3356 		}
3357 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3358 		/* no coalescing if address space uses sub-pages */
3359 	} else if ((is_submap == FALSE) &&
3360 	    (object == VM_OBJECT_NULL) &&
3361 	    (entry != vm_map_to_entry(map)) &&
3362 	    (entry->vme_end == start) &&
3363 	    (!entry->is_shared) &&
3364 	    (!entry->is_sub_map) &&
3365 	    (!entry->in_transition) &&
3366 	    (!entry->needs_wakeup) &&
3367 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3368 	    (entry->protection == cur_protection) &&
3369 	    (entry->max_protection == max_protection) &&
3370 	    (entry->inheritance == inheritance) &&
3371 	    ((user_alias == VM_MEMORY_REALLOC) ||
3372 	    (VME_ALIAS(entry) == alias)) &&
3373 	    (entry->no_cache == no_cache) &&
3374 	    (entry->vme_permanent == permanent) &&
3375 	    /* no coalescing for immutable executable mappings */
3376 	    !((entry->protection & VM_PROT_EXECUTE) &&
3377 	    entry->vme_permanent) &&
3378 	    (!entry->superpage_size && !superpage_size) &&
3379 	    /*
3380 	     * No coalescing if not map-aligned, to avoid propagating
3381 	     * that condition any further than needed:
3382 	     */
3383 	    (!entry->map_aligned || !clear_map_aligned) &&
3384 	    (!entry->zero_wired_pages) &&
3385 	    (!entry->used_for_jit && !entry_for_jit) &&
3386 #if __arm64e__
3387 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3388 #endif
3389 	    (!entry->csm_associated) &&
3390 	    (entry->iokit_acct == iokit_acct) &&
3391 	    (!entry->vme_resilient_codesign) &&
3392 	    (!entry->vme_resilient_media) &&
3393 	    (!entry->vme_atomic) &&
3394 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3395 
3396 	    ((entry->vme_end - entry->vme_start) + size <=
3397 	    (user_alias == VM_MEMORY_REALLOC ?
3398 	    ANON_CHUNK_SIZE :
3399 	    NO_COALESCE_LIMIT)) &&
3400 
3401 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3402 		if (vm_object_coalesce(VME_OBJECT(entry),
3403 		    VM_OBJECT_NULL,
3404 		    VME_OFFSET(entry),
3405 		    (vm_object_offset_t) 0,
3406 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3407 		    (vm_map_size_t)(end - entry->vme_end))) {
3408 			/*
3409 			 *	Coalesced the two objects - can extend
3410 			 *	the previous map entry to include the
3411 			 *	new range.
3412 			 */
3413 			map->size += (end - entry->vme_end);
3414 			assert(entry->vme_start < end);
3415 			assert(VM_MAP_PAGE_ALIGNED(end,
3416 			    VM_MAP_PAGE_MASK(map)));
3417 			if (__improbable(vm_debug_events)) {
3418 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3419 			}
3420 			entry->vme_end = end;
3421 			if (map->holelistenabled) {
3422 				vm_map_store_update_first_free(map, entry, TRUE);
3423 			} else {
3424 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3425 			}
3426 			new_mapping_established = TRUE;
3427 			RETURN(KERN_SUCCESS);
3428 		}
3429 	}
3430 
3431 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
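	/*
	 * The outer loop below normally makes a single pass over
	 * [start, end); for superpage allocations it instead advances one
	 * SUPERPAGE_SIZE per pass so that each iteration gets its own
	 * physically-contiguous backing object.
	 */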
3432 	new_entry = NULL;
3433 
3434 	if (vmk_flags.vmkf_submap_adjust) {
3435 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3436 		offset = start;
3437 	}
3438 
3439 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3440 		tmp2_end = tmp2_start + step;
3441 		/*
3442 		 *	Create a new entry
3443 		 *
3444 		 * XXX FBDP
3445 		 * The reserved "page zero" in each process's address space can
3446 		 * be arbitrarily large.  Splitting it into separate objects and
3447 		 * therefore different VM map entries serves no purpose and just
3448 		 * slows down operations on the VM map, so let's not split the
3449 		 * allocation into chunks if the max protection is NONE.  That
3450 		 * memory should never be accessible, so it will never get to the
3451 		 * default pager.
3452 		 */
3453 		tmp_start = tmp2_start;
3454 		if (!is_submap &&
3455 		    object == VM_OBJECT_NULL &&
3456 		    size > chunk_size &&
3457 		    max_protection != VM_PROT_NONE &&
3458 		    superpage_size == 0) {
3459 			tmp_end = tmp_start + chunk_size;
3460 		} else {
3461 			tmp_end = tmp2_end;
3462 		}
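		/*
		 * The do/while below walks [tmp2_start, tmp2_end) either in a
		 * single pass or, for large anonymous allocations, in
		 * chunk_size steps with one map entry inserted per pass; e.g.
		 * a request slightly larger than two chunks ends up as three
		 * entries: two full chunks plus the remainder.
		 */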
3463 		do {
3464 			if (!is_submap &&
3465 			    object != VM_OBJECT_NULL &&
3466 			    object->internal &&
3467 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3468 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3469 				DTRACE_VM5(vm_map_enter_overmap,
3470 				    vm_map_t, map,
3471 				    vm_map_address_t, tmp_start,
3472 				    vm_map_address_t, tmp_end,
3473 				    vm_object_offset_t, offset,
3474 				    vm_object_size_t, object->vo_size);
3475 			}
3476 			new_entry = vm_map_entry_insert(map,
3477 			    entry, tmp_start, tmp_end,
3478 			    object, offset, vmk_flags,
3479 			    needs_copy,
3480 			    cur_protection, max_protection,
3481 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3482 			    VM_INHERIT_NONE : inheritance),
3483 			    clear_map_aligned);
3484 
3485 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3486 
3487 			if (resilient_codesign) {
3488 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3489 				if (!((cur_protection | max_protection) & reject_prot)) {
3490 					new_entry->vme_resilient_codesign = TRUE;
3491 				}
3492 			}
3493 
3494 			if (resilient_media &&
3495 			    (object == VM_OBJECT_NULL ||
3496 			    object->internal)) {
3497 				new_entry->vme_resilient_media = TRUE;
3498 			}
3499 
3500 			assert(!new_entry->iokit_acct);
3501 			if (!is_submap &&
3502 			    object != VM_OBJECT_NULL &&
3503 			    object->internal &&
3504 			    (object->purgable != VM_PURGABLE_DENY ||
3505 			    object->vo_ledger_tag)) {
3506 				assert(new_entry->use_pmap);
3507 				assert(!new_entry->iokit_acct);
3508 				/*
3509 				 * Turn off pmap accounting since
3510 				 * purgeable (or tagged) objects have their
3511 				 * own ledgers.
3512 				 */
3513 				new_entry->use_pmap = FALSE;
3514 			} else if (!is_submap &&
3515 			    iokit_acct &&
3516 			    object != VM_OBJECT_NULL &&
3517 			    object->internal) {
3518 				/* alternate accounting */
3519 				assert(!new_entry->iokit_acct);
3520 				assert(new_entry->use_pmap);
3521 				new_entry->iokit_acct = TRUE;
3522 				new_entry->use_pmap = FALSE;
3523 				DTRACE_VM4(
3524 					vm_map_iokit_mapped_region,
3525 					vm_map_t, map,
3526 					vm_map_offset_t, new_entry->vme_start,
3527 					vm_map_offset_t, new_entry->vme_end,
3528 					int, VME_ALIAS(new_entry));
3529 				vm_map_iokit_mapped_region(
3530 					map,
3531 					(new_entry->vme_end -
3532 					new_entry->vme_start));
3533 			} else if (!is_submap) {
3534 				assert(!new_entry->iokit_acct);
3535 				assert(new_entry->use_pmap);
3536 			}
3537 
3538 			if (is_submap) {
3539 				vm_map_t        submap;
3540 				boolean_t       submap_is_64bit;
3541 				boolean_t       use_pmap;
3542 
3543 				assert(new_entry->is_sub_map);
3544 				assert(!new_entry->use_pmap);
3545 				assert(!new_entry->iokit_acct);
3546 				submap = (vm_map_t) object;
3547 				submap_is_64bit = vm_map_is_64bit(submap);
3548 				use_pmap = vmk_flags.vmkf_nested_pmap;
3549 #ifndef NO_NESTED_PMAP
3550 				if (use_pmap && submap->pmap == NULL) {
3551 					ledger_t ledger = map->pmap->ledger;
3552 					/* we need a sub pmap to nest... */
3553 					submap->pmap = pmap_create_options(ledger, 0,
3554 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3555 					if (submap->pmap == NULL) {
3556 						/* let's proceed without nesting... */
3557 					}
3558 #if defined(__arm64__)
3559 					else {
3560 						pmap_set_nested(submap->pmap);
3561 					}
3562 #endif
3563 				}
3564 				if (use_pmap && submap->pmap != NULL) {
3565 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3566 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3567 						kr = KERN_FAILURE;
3568 					} else {
3569 						kr = pmap_nest(map->pmap,
3570 						    submap->pmap,
3571 						    tmp_start,
3572 						    tmp_end - tmp_start);
3573 					}
3574 					if (kr != KERN_SUCCESS) {
3575 						printf("vm_map_enter: "
3576 						    "pmap_nest(0x%llx,0x%llx) "
3577 						    "error 0x%x\n",
3578 						    (long long)tmp_start,
3579 						    (long long)tmp_end,
3580 						    kr);
3581 					} else {
3582 						/* we're now nested ! */
3583 						new_entry->use_pmap = TRUE;
3584 						pmap_empty = FALSE;
3585 					}
3586 				}
3587 #endif /* NO_NESTED_PMAP */
3588 			}
3589 			entry = new_entry;
3590 
3591 			if (superpage_size) {
3592 				vm_page_t pages, m;
3593 				vm_object_t sp_object;
3594 				vm_object_offset_t sp_offset;
3595 
3596 				assert(object == VM_OBJECT_NULL);
3597 				VME_OFFSET_SET(entry, 0);
3598 
3599 				/* allocate one superpage */
3600 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3601 				if (kr != KERN_SUCCESS) {
3602 					/* deallocate whole range... */
3603 					new_mapping_established = TRUE;
3604 					/* ... but only up to "tmp_end" */
3605 					size -= end - tmp_end;
3606 					RETURN(kr);
3607 				}
3608 
3609 				/* create one vm_object per superpage */
3610 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start), map->serial_id);
3611 				vm_object_lock(sp_object);
3612 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3613 				VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3614 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3615 				VME_OBJECT_SET(entry, sp_object, false, 0);
3616 				assert(entry->use_pmap);
3617 
3618 				/* enter the base pages into the object */
3619 				for (sp_offset = 0;
3620 				    sp_offset < SUPERPAGE_SIZE;
3621 				    sp_offset += PAGE_SIZE) {
3622 					m = pages;
3623 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3624 					pages = NEXT_PAGE(m);
3625 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3626 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3627 				}
3628 				vm_object_unlock(sp_object);
3629 			}
3630 		} while (tmp_end != tmp2_end &&
3631 		    (tmp_start = tmp_end) &&
3632 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3633 		    tmp_end + chunk_size : tmp2_end));
3634 	}
3635 
3636 	new_mapping_established = TRUE;
3637 
3638 
3639 BailOut:
3640 	assert(map_locked == TRUE);
3641 
3642 	/*
3643 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3644 	 * If we have identified and possibly established the new mapping(s),
3645 	 * make sure we did not go beyond the address space limit.
3646 	 */
3647 	if (result == KERN_SUCCESS) {
3648 		if (map->size_limit != RLIM_INFINITY &&
3649 		    map->size > map->size_limit) {
3650 			/*
3651 			 * Establishing the requested mappings would exceed
3652 			 * the process's RLIMIT_AS limit: fail with
3653 			 * KERN_NO_SPACE.
3654 			 */
3655 			result = KERN_NO_SPACE;
3656 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3657 			    proc_selfpid(),
3658 			    (get_bsdtask_info(current_task())
3659 			    ? proc_name_address(get_bsdtask_info(current_task()))
3660 			    : "?"),
3661 			    __FUNCTION__,
3662 			    (uint64_t) map->size,
3663 			    (uint64_t) map->size_limit);
3664 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3665 			    vm_map_size_t, map->size,
3666 			    uint64_t, map->size_limit);
3667 			vm_map_enter_RLIMIT_AS_count++;
3668 		} else if (map->data_limit != RLIM_INFINITY &&
3669 		    map->size > map->data_limit) {
3670 			/*
3671 			 * Establishing the requested mappings would exceed
3672 			 * the process's RLIMIT_DATA limit: fail with
3673 			 * KERN_NO_SPACE.
3674 			 */
3675 			result = KERN_NO_SPACE;
3676 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3677 			    proc_selfpid(),
3678 			    (get_bsdtask_info(current_task())
3679 			    ? proc_name_address(get_bsdtask_info(current_task()))
3680 			    : "?"),
3681 			    __FUNCTION__,
3682 			    (uint64_t) map->size,
3683 			    (uint64_t) map->data_limit);
3684 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3685 			    vm_map_size_t, map->size,
3686 			    uint64_t, map->data_limit);
3687 			vm_map_enter_RLIMIT_DATA_count++;
3688 		}
3689 	}
3690 
3691 	if (result == KERN_SUCCESS) {
3692 		vm_prot_t pager_prot;
3693 		memory_object_t pager;
3694 
3695 #if DEBUG
3696 		if (pmap_empty &&
3697 		    !(vmk_flags.vmkf_no_pmap_check)) {
3698 			assert(pmap_is_empty(map->pmap,
3699 			    *address,
3700 			    *address + size));
3701 		}
3702 #endif /* DEBUG */
3703 
3704 		/*
3705 		 * For "named" VM objects, let the pager know that the
3706 		 * memory object is being mapped.  Some pagers need to keep
3707 		 * track of this, to know when they can reclaim the memory
3708 		 * object, for example.
3709 		 * VM calls memory_object_map() for each mapping (specifying
3710 		 * the protection of each mapping) and calls
3711 		 * memory_object_last_unmap() when all the mappings are gone.
3712 		 */
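		/*
		 * A minimal sketch of that protocol from the pager's side
		 * (names as used above):
		 *
		 *	memory_object_map(pager, prot);        one call per mapping
		 *	...
		 *	memory_object_last_unmap(pager);       once all mappings are gone
		 *
		 * so a pager can keep the memory object around while it is
		 * mapped and reclaim it afterwards.
		 */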
3713 		pager_prot = max_protection;
3714 		if (needs_copy) {
3715 			/*
3716 			 * Copy-On-Write mapping: won't modify
3717 			 * the memory object.
3718 			 */
3719 			pager_prot &= ~VM_PROT_WRITE;
3720 		}
3721 		if (!is_submap &&
3722 		    object != VM_OBJECT_NULL &&
3723 		    object->named &&
3724 		    object->pager != MEMORY_OBJECT_NULL) {
3725 			vm_object_lock(object);
3726 			pager = object->pager;
3727 			if (object->named &&
3728 			    pager != MEMORY_OBJECT_NULL) {
3729 				assert(object->pager_ready);
3730 				vm_object_mapping_wait(object, THREAD_UNINT);
3731 				/* object might have lost its pager while waiting */
3732 				pager = object->pager;
3733 				if (object->named && pager != MEMORY_OBJECT_NULL) {
3734 					vm_object_mapping_begin(object);
3735 					vm_object_unlock(object);
3736 
3737 					kr = memory_object_map(pager, pager_prot);
3738 					assert(kr == KERN_SUCCESS);
3739 
3740 					vm_object_lock(object);
3741 					vm_object_mapping_end(object);
3742 				}
3743 			}
3744 			vm_object_unlock(object);
3745 		}
3746 	}
3747 
3748 	assert(map_locked == TRUE);
3749 
3750 	if (new_mapping_established) {
3751 		/*
3752 		 * If we release the map lock for any reason below,
3753 		 * another thread could deallocate our new mapping,
3754 		 * releasing the caller's reference on "caller_object",
3755 		 * which was transferred to the mapping.
3756 		 * If this was the only reference, the object could be
3757 		 * destroyed.
3758 		 *
3759 		 * We need to take an extra reference on "caller_object"
3760 		 * to keep it alive if we need to return the caller's
3761 		 * reference to the caller in case of failure.
3762 		 */
3763 		if (is_submap) {
3764 			vm_map_reference((vm_map_t)caller_object);
3765 		} else {
3766 			vm_object_reference(caller_object);
3767 		}
3768 	}
3769 
3770 	if (!keep_map_locked) {
3771 		vm_map_unlock(map);
3772 		map_locked = FALSE;
3773 		entry = VM_MAP_ENTRY_NULL;
3774 		new_entry = VM_MAP_ENTRY_NULL;
3775 	}
3776 
3777 	/*
3778 	 * We can't hold the map lock if we enter this block.
3779 	 */
3780 
3781 	if (result == KERN_SUCCESS) {
3782 		/*	Wire down the new entry if the user
3783 		 *	requested all new map entries be wired.
3784 		 */
3785 		if ((map->wiring_required) || (superpage_size)) {
3786 			assert(!keep_map_locked);
3787 			pmap_empty = FALSE; /* pmap won't be empty */
3788 			kr = vm_map_wire_nested(map, start, end,
3789 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3790 			    TRUE, PMAP_NULL, 0, NULL);
3791 			result = kr;
3792 		}
3793 
3794 	}
3795 
3796 	if (result != KERN_SUCCESS) {
3797 		if (new_mapping_established) {
3798 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3799 
3800 			/*
3801 			 * We have to get rid of the new mappings since we
3802 			 * won't make them available to the user.
3803 			 * Try to do that atomically, to minimize the risk
3804 			 * that someone else creates new mappings in that range.
3805 			 */
3806 			if (!map_locked) {
3807 				vm_map_lock(map);
3808 				map_locked = TRUE;
3809 			}
3810 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3811 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3812 			if (permanent) {
3813 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3814 			}
3815 			(void) vm_map_delete(map,
3816 			    *address, *address + size,
3817 			    remove_flags,
3818 			    KMEM_GUARD_NONE, &zap_new_list);
3819 		}
3820 
3821 		if (vm_map_zap_first_entry(&zap_old_list)) {
3822 			vm_map_entry_t entry1, entry2;
3823 
3824 			/*
3825 			 * The new mapping failed.  Attempt to restore
3826 			 * the old mappings, saved in the "zap_old_map".
3827 			 */
3828 			if (!map_locked) {
3829 				vm_map_lock(map);
3830 				map_locked = TRUE;
3831 			}
3832 
3833 			/* first check if the coast is still clear */
3834 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3835 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
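			/*
			 * If neither endpoint of the saved range hits an
			 * existing entry and both lookups land on the same
			 * preceding entry, the whole range is still a hole
			 * and the saved entries can simply be re-linked
			 * after "entry1" below.
			 */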
3836 
3837 			if (vm_map_lookup_entry(map, start, &entry1) ||
3838 			    vm_map_lookup_entry(map, end, &entry2) ||
3839 			    entry1 != entry2) {
3840 				/*
3841 				 * Part of that range has already been
3842 				 * re-mapped:  we can't restore the old
3843 				 * mappings...
3844 				 */
3845 				vm_map_enter_restore_failures++;
3846 			} else {
3847 				/*
3848 				 * Transfer the saved map entries from
3849 				 * "zap_old_map" to the original "map",
3850 				 * inserting them all after "entry1".
3851 				 */
3852 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3853 					vm_map_size_t entry_size;
3854 
3855 					entry_size = (entry2->vme_end -
3856 					    entry2->vme_start);
3857 					vm_map_store_entry_link(map, entry1, entry2,
3858 					    VM_MAP_KERNEL_FLAGS_NONE);
3859 					map->size += entry_size;
3860 					entry1 = entry2;
3861 				}
3862 				if (map->wiring_required) {
3863 					/*
3864 					 * XXX TODO: we should rewire the
3865 					 * old pages here...
3866 					 */
3867 				}
3868 				vm_map_enter_restore_successes++;
3869 			}
3870 		}
3871 	}
3872 
3873 	/*
3874 	 * The caller is responsible for releasing the lock if it requested to
3875 	 * keep the map locked.
3876 	 */
3877 	if (map_locked && !keep_map_locked) {
3878 		vm_map_unlock(map);
3879 	}
3880 
3881 	vm_map_zap_dispose(&zap_old_list);
3882 	vm_map_zap_dispose(&zap_new_list);
3883 
3884 	if (new_mapping_established) {
3885 		/*
3886 		 * The caller had a reference on "caller_object" and we
3887 		 * transferred that reference to the mapping.
3888 		 * We also took an extra reference on "caller_object" to keep
3889 		 * it alive while the map was unlocked.
3890 		 */
3891 		if (result == KERN_SUCCESS) {
3892 			/*
3893 			 * On success, the caller's reference on the object gets
3894 			 * transferred to the mapping.
3895 			 * Release our extra reference.
3896 			 */
3897 			if (is_submap) {
3898 				vm_map_deallocate((vm_map_t)caller_object);
3899 			} else {
3900 				vm_object_deallocate(caller_object);
3901 			}
3902 		} else {
3903 			/*
3904 			 * On error, the caller expects to still have a
3905 			 * reference on the object it gave us.
3906 			 * Let's use our extra reference for that.
3907 			 */
3908 		}
3909 	}
3910 
3911 	return result;
3912 
3913 #undef  RETURN
3914 }
3915 
3916 /*
3917  * Counters for the prefault optimization.
3918  */
3919 int64_t vm_prefault_nb_pages = 0;
3920 int64_t vm_prefault_nb_bailout = 0;
3921 
3922 static kern_return_t
3923 vm_map_enter_adjust_offset(
3924 	vm_object_offset_t *obj_offs,
3925 	vm_object_offset_t *obj_end,
3926 	vm_object_offset_t  quantity)
3927 {
3928 	if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3929 	    os_add_overflow(*obj_end, quantity, obj_end) ||
3930 	    vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3931 		return KERN_INVALID_ARGUMENT;
3932 	}
3933 
3934 	return KERN_SUCCESS;
3935 }
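
/*
 * Illustrative use of the helper above (it mirrors the callers further
 * down): shift both the start and the end of the object range by the same
 * amount, failing cleanly instead of wrapping on overflow:
 *
 *	kr = vm_map_enter_adjust_offset(&obj_offs, &obj_end,
 *	    named_entry->data_offset);
 *	if (__improbable(kr)) {
 *		return kr;
 *	}
 */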
3936 
3937 static __attribute__((always_inline, warn_unused_result))
3938 kern_return_t
3939 vm_map_enter_mem_object_sanitize(
3940 	vm_map_t                target_map,
3941 	vm_map_offset_ut        address_u,
3942 	vm_map_size_ut          initial_size_u,
3943 	vm_map_offset_ut        mask_u,
3944 	vm_object_offset_ut     offset_u,
3945 	vm_prot_ut              cur_protection_u,
3946 	vm_prot_ut              max_protection_u,
3947 	vm_inherit_ut           inheritance_u,
3948 	vm_map_kernel_flags_t   vmk_flags,
3949 	ipc_port_t              port,
3950 	vm_map_address_t       *map_addr,
3951 	vm_map_size_t          *map_size,
3952 	vm_map_offset_t        *mask,
3953 	vm_object_offset_t     *obj_offs,
3954 	vm_object_offset_t     *obj_end,
3955 	vm_object_size_t       *obj_size,
3956 	vm_prot_t              *cur_protection,
3957 	vm_prot_t              *max_protection,
3958 	vm_inherit_t           *inheritance)
3959 {
3960 	kern_return_t           result;
3961 
3962 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3963 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3964 	    VM_PROT_IS_MASK, cur_protection,
3965 	    max_protection);
3966 	if (__improbable(result != KERN_SUCCESS)) {
3967 		return result;
3968 	}
3969 
3970 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3971 	    inheritance);
3972 	if (__improbable(result != KERN_SUCCESS)) {
3973 		return result;
3974 	}
3975 
3976 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3977 	if (__improbable(result != KERN_SUCCESS)) {
3978 		return result;
3979 	}
3980 
3981 	if (vmk_flags.vmf_fixed) {
3982 		vm_map_address_t        map_end;
3983 
3984 		result = vm_sanitize_addr_size(address_u, initial_size_u,
3985 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3986 		    target_map,
3987 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3988 		    map_addr, &map_end, map_size);
3989 		if (__improbable(result != KERN_SUCCESS)) {
3990 			return result;
3991 		}
3992 	} else {
3993 		*map_addr = vm_sanitize_addr(target_map, address_u);
3994 		result = vm_sanitize_size(0, initial_size_u,
3995 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3996 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3997 		if (__improbable(result != KERN_SUCCESS)) {
3998 			return result;
3999 		}
4000 	}
4001 
4002 	*obj_size = vm_object_round_page(*map_size);
4003 	if (__improbable(*obj_size == 0)) {
4004 		return KERN_INVALID_ARGUMENT;
4005 	}
4006 
4007 	if (IP_VALID(port)) {
4008 		result = vm_sanitize_addr_size(offset_u, *obj_size,
4009 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
4010 		    PAGE_MASK,
4011 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4012 		    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4013 		    obj_offs, obj_end, obj_size);
4014 		if (__improbable(result != KERN_SUCCESS)) {
4015 			return result;
4016 		}
4017 	} else {
4018 		*obj_offs = 0;
4019 		*obj_end  = *obj_size;
4020 	}
4021 
4022 	return KERN_SUCCESS;
4023 }
4024 
4025 kern_return_t
4026 vm_map_enter_mem_object(
4027 	vm_map_t                target_map,
4028 	vm_map_offset_ut       *address_u,
4029 	vm_map_size_ut          initial_size_u,
4030 	vm_map_offset_ut        mask_u,
4031 	vm_map_kernel_flags_t   vmk_flags,
4032 	ipc_port_t              port,
4033 	vm_object_offset_ut     offset_u,
4034 	boolean_t               copy,
4035 	vm_prot_ut              cur_protection_u,
4036 	vm_prot_ut              max_protection_u,
4037 	vm_inherit_ut           inheritance_u,
4038 	upl_page_list_ptr_t     page_list,
4039 	unsigned int            page_list_count)
4040 {
4041 	vm_map_offset_t         mask;
4042 	vm_prot_t               cur_protection;
4043 	vm_prot_t               max_protection;
4044 	vm_inherit_t            inheritance;
4045 	vm_map_address_t        map_addr, map_mask;
4046 	vm_map_size_t           map_size;
4047 	vm_object_t             object = VM_OBJECT_NULL;
4048 	vm_object_offset_t      obj_offs, obj_end;
4049 	vm_object_size_t        obj_size;
4050 	kern_return_t           result;
4051 	boolean_t               mask_cur_protection, mask_max_protection;
4052 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
4053 	vm_map_offset_t         offset_in_mapping = 0;
4054 
4055 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4056 		/* XXX TODO4K prefaulting depends on page size... */
4057 		try_prefault = FALSE;
4058 	}
4059 
4060 	/*
4061 	 * Check arguments for validity
4062 	 */
4063 	if ((target_map == VM_MAP_NULL) ||
4064 	    (try_prefault && (copy || !page_list))) {
4065 		return KERN_INVALID_ARGUMENT;
4066 	}
4067 
4068 	map_mask = vm_map_page_mask(target_map);
4069 
4070 	/*
4071 	 * Sanitize any input parameters that are addr/size/prot/inherit
4072 	 */
4073 	result = vm_map_enter_mem_object_sanitize(
4074 		target_map,
4075 		*address_u,
4076 		initial_size_u,
4077 		mask_u,
4078 		offset_u,
4079 		cur_protection_u,
4080 		max_protection_u,
4081 		inheritance_u,
4082 		vmk_flags,
4083 		port,
4084 		&map_addr,
4085 		&map_size,
4086 		&mask,
4087 		&obj_offs,
4088 		&obj_end,
4089 		&obj_size,
4090 		&cur_protection,
4091 		&max_protection,
4092 		&inheritance);
4093 	if (__improbable(result != KERN_SUCCESS)) {
4094 		return vm_sanitize_get_kr(result);
4095 	}
4096 
4097 	assertf(vmk_flags.__vmkf_unused2 == 0, "vmk_flags unused2=0x%llx\n", vmk_flags.__vmkf_unused2);
4098 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4099 
4100 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4101 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4102 	cur_protection &= ~VM_PROT_IS_MASK;
4103 	max_protection &= ~VM_PROT_IS_MASK;
4104 
4105 #if __arm64__
4106 	if (cur_protection & VM_PROT_EXECUTE) {
4107 		cur_protection |= VM_PROT_READ;
4108 	}
4109 #endif /* __arm64__ */
4110 
4111 	/*
4112 	 * Find the vm object (if any) corresponding to this port.
4113 	 */
4114 	if (!IP_VALID(port)) {
4115 		object = VM_OBJECT_NULL;
4116 		copy = FALSE;
4117 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4118 		vm_named_entry_t        named_entry;
4119 		vm_object_size_t        initial_size;
4120 
4121 		named_entry = mach_memory_entry_from_port(port);
4122 
4123 		if (vmk_flags.vmf_return_data_addr ||
4124 		    vmk_flags.vmf_return_4k_data_addr) {
4125 			result = vm_map_enter_adjust_offset(&obj_offs,
4126 			    &obj_end, named_entry->data_offset);
4127 			if (__improbable(result)) {
4128 				return result;
4129 			}
4130 		}
4131 
4132 		/* a few checks to make sure user is obeying rules */
4133 		if (mask_max_protection) {
4134 			max_protection &= named_entry->protection;
4135 		}
4136 		if (mask_cur_protection) {
4137 			cur_protection &= named_entry->protection;
4138 		}
4139 		if ((named_entry->protection & max_protection) !=
4140 		    max_protection) {
4141 			return KERN_INVALID_RIGHT;
4142 		}
4143 		if ((named_entry->protection & cur_protection) !=
4144 		    cur_protection) {
4145 			return KERN_INVALID_RIGHT;
4146 		}
4147 
4148 		/*
4149 		 * unwrap is safe because we know obj_size is at least as large as
4150 		 * initial_size and doesn't overflow
4151 		 */
4152 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4153 		if (named_entry->size < obj_offs + initial_size) {
4154 			return KERN_INVALID_ARGUMENT;
4155 		}
4156 
4157 		/* for a vm_map_copy, we can only map it whole */
4158 		if (named_entry->is_copy &&
4159 		    (obj_size != named_entry->size) &&
4160 		    (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4161 			/* XXX FBDP use the rounded size... */
4162 			obj_end += named_entry->size - obj_size;
4163 			obj_size = named_entry->size;
4164 		}
4165 
4166 		if (named_entry->offset) {
4167 			/*
4168 			 * the caller's "offset" parameter is relative to the start of
4169 			 * the named entry, which itself begins at an offset within the object
4170 			 *
4171 			 * Because we checked above that
4172 			 *   obj_offs + obj_size < named_entry_size
4173 			 * these overflow checks should be redundant...
4174 			 */
4175 			result = vm_map_enter_adjust_offset(&obj_offs,
4176 			    &obj_end, named_entry->offset);
4177 			if (__improbable(result)) {
4178 				return result;
4179 			}
4180 		}
4181 
4182 		if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4183 			/*
4184 			 * Let's not map more than requested;
4185 			 * vm_map_enter() will handle this "not map-aligned"
4186 			 * case.
4187 			 */
4188 			map_size = obj_size;
4189 		}
4190 
4191 		named_entry_lock(named_entry);
4192 
4193 		// rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4194 		assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4195 
4196 		if (named_entry->is_sub_map) {
4197 			vm_map_t                submap;
4198 
4199 			assert(!named_entry->is_copy);
4200 			assert(!named_entry->is_object);
4201 
4202 			if (vmk_flags.vmf_return_data_addr ||
4203 			    vmk_flags.vmf_return_4k_data_addr) {
4204 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4205 			}
4206 
4207 			submap = named_entry->backing.map;
4208 			vm_map_reference(submap);
4209 			named_entry_unlock(named_entry);
4210 
4211 			vmk_flags.vmkf_submap = TRUE;
4212 			result = vm_map_enter(target_map,
4213 			    &map_addr,
4214 			    map_size,
4215 			    mask,
4216 			    vmk_flags,
4217 			    (vm_object_t)(uintptr_t) submap,
4218 			    obj_offs,
4219 			    copy,
4220 			    cur_protection,
4221 			    max_protection,
4222 			    inheritance);
4223 			if (result != KERN_SUCCESS) {
4224 				vm_map_deallocate(submap);
4225 				return result;
4226 			}
4227 			/*
4228 			 * No need to lock "submap" just to check its
4229 			 * "mapped" flag: that flag is never reset
4230 			 * once it's been set and if we race, we'll
4231 			 * just end up setting it twice, which is OK.
4232 			 */
4233 			if (submap->mapped_in_other_pmaps == FALSE &&
4234 			    vm_map_pmap(submap) != PMAP_NULL &&
4235 			    vm_map_pmap(submap) !=
4236 			    vm_map_pmap(target_map)) {
4237 				/*
4238 				 * This submap is being mapped in a map
4239 				 * that uses a different pmap.
4240 				 * Set its "mapped_in_other_pmaps" flag
4241 				 * to indicate that we now need to
4242 				 * remove mappings from all pmaps rather
4243 				 * than just the submap's pmap.
4244 				 */
4245 				vm_map_lock(submap);
4246 				submap->mapped_in_other_pmaps = TRUE;
4247 				vm_map_unlock(submap);
4248 			}
4249 			goto out;
4250 		}
4251 
4252 		if (named_entry->is_copy) {
4253 			kern_return_t   kr;
4254 			vm_map_copy_t   copy_map;
4255 			vm_map_entry_t  copy_entry;
4256 			vm_map_offset_t copy_addr;
4257 			vm_map_copy_t   target_copy_map;
4258 			vm_map_offset_t overmap_start, overmap_end;
4259 			vm_map_offset_t trimmed_start;
4260 			vm_map_size_t   target_size;
4261 
4262 			assert(!named_entry->is_object);
4263 			assert(!named_entry->is_sub_map);
4264 
4265 			int allowed_flags = VM_FLAGS_FIXED |
4266 			    VM_FLAGS_ANYWHERE |
4267 			    VM_FLAGS_OVERWRITE |
4268 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4269 			    VM_FLAGS_RETURN_DATA_ADDR;
4270 
4271 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags, allowed_flags)) {
4272 				named_entry_unlock(named_entry);
4273 				return KERN_INVALID_ARGUMENT;
4274 			}
4275 
4276 			copy_map = named_entry->backing.copy;
4277 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4278 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4279 				/* unsupported type; should not happen */
4280 				printf("vm_map_enter_mem_object: "
4281 				    "memory_entry->backing.copy "
4282 				    "unsupported type 0x%x\n",
4283 				    copy_map->type);
4284 				named_entry_unlock(named_entry);
4285 				return KERN_INVALID_ARGUMENT;
4286 			}
4287 
4288 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4289 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4290 			}
4291 
4292 			if (vmk_flags.vmf_return_data_addr ||
4293 			    vmk_flags.vmf_return_4k_data_addr) {
4294 				offset_in_mapping = obj_offs & map_mask;
4295 				if (vmk_flags.vmf_return_4k_data_addr) {
4296 					offset_in_mapping &= ~((signed)(0xFFF));
4297 				}
4298 			}
4299 
4300 			target_copy_map = VM_MAP_COPY_NULL;
4301 			target_size = copy_map->size;
4302 			overmap_start = 0;
4303 			overmap_end = 0;
4304 			trimmed_start = 0;
4305 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4306 				DEBUG4K_ADJUST("adjusting...\n");
4307 				kr = vm_map_copy_adjust_to_target(
4308 					copy_map,
4309 					obj_offs,
4310 					initial_size,
4311 					target_map,
4312 					copy,
4313 					&target_copy_map,
4314 					&overmap_start,
4315 					&overmap_end,
4316 					&trimmed_start);
4317 				if (kr != KERN_SUCCESS) {
4318 					named_entry_unlock(named_entry);
4319 					return kr;
4320 				}
4321 				target_size = target_copy_map->size;
4322 			} else {
4323 				/*
4324 				 * Assert that the vm_map_copy is coming from the right
4325 				 * zone and hasn't been forged
4326 				 */
4327 				vm_map_copy_require(copy_map);
4328 				target_copy_map = copy_map;
4329 			}
4330 
4331 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4332 
4333 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4334 			    (VM_FLAGS_FIXED |
4335 			    VM_FLAGS_ANYWHERE |
4336 			    VM_FLAGS_OVERWRITE |
4337 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4338 			    VM_FLAGS_RETURN_DATA_ADDR));
4339 
4340 			/* reserve a contiguous range */
4341 			kr = vm_map_enter(target_map,
4342 			    &map_addr,
4343 			    vm_map_round_page(target_size, map_mask),
4344 			    mask,
4345 			    rsv_flags,
4346 			    VM_OBJECT_NULL,
4347 			    0,
4348 			    FALSE,               /* copy */
4349 			    cur_protection,
4350 			    max_protection,
4351 			    inheritance);
4352 			if (kr != KERN_SUCCESS) {
4353 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4354 				if (target_copy_map != copy_map) {
4355 					vm_map_copy_discard(target_copy_map);
4356 					target_copy_map = VM_MAP_COPY_NULL;
4357 				}
4358 				named_entry_unlock(named_entry);
4359 				return kr;
4360 			}
4361 
4362 			copy_addr = map_addr;
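			/*
			 * Strategy sketch for the loop below: the range was
			 * just reserved in one piece with no backing object;
			 * each entry of the vm_map_copy is now over-mapped
			 * (vmf_fixed + vmf_overwrite) at increasing
			 * "copy_addr" inside that reservation, presumably so
			 * the addresses cannot be claimed by another thread
			 * in between.
			 */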
4363 
4364 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4365 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4366 			    copy_entry = copy_entry->vme_next) {
4367 				vm_map_t                copy_submap = VM_MAP_NULL;
4368 				vm_object_t             copy_object = VM_OBJECT_NULL;
4369 				vm_map_size_t           copy_size;
4370 				vm_object_offset_t      copy_offset;
4371 				boolean_t               do_copy = false;
4372 
4373 				if (copy_entry->is_sub_map) {
4374 					copy_submap = VME_SUBMAP(copy_entry);
4375 					copy_object = (vm_object_t)copy_submap;
4376 				} else {
4377 					copy_object = VME_OBJECT(copy_entry);
4378 				}
4379 				copy_offset = VME_OFFSET(copy_entry);
4380 				copy_size = (copy_entry->vme_end -
4381 				    copy_entry->vme_start);
4382 
4383 				/* sanity check */
4384 				if ((copy_addr + copy_size) >
4385 				    (map_addr +
4386 				    overmap_start + overmap_end +
4387 				    named_entry->size /* XXX full size */)) {
4388 					/* over-mapping too much !? */
4389 					kr = KERN_INVALID_ARGUMENT;
4390 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4391 					/* abort */
4392 					break;
4393 				}
4394 
4395 				/* take a reference on the object */
4396 				if (copy_entry->is_sub_map) {
4397 					vm_map_reference(copy_submap);
4398 				} else {
4399 					if (!copy &&
4400 					    copy_object != VM_OBJECT_NULL &&
4401 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4402 						bool is_writable;
4403 
4404 						/*
4405 						 * We need to resolve our side of this
4406 						 * "symmetric" copy-on-write now; we
4407 						 * need a new object to map and share,
4408 						 * instead of the current one which
4409 						 * might still be shared with the
4410 						 * original mapping.
4411 						 *
4412 						 * Note: A "vm_map_copy_t" does not
4413 						 * have a lock but we're protected by
4414 						 * the named entry's lock here.
4415 						 */
4416 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4417 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4418 						assert(copy_object != VME_OBJECT(copy_entry));
4419 						is_writable = false;
4420 						if (copy_entry->protection & VM_PROT_WRITE) {
4421 							is_writable = true;
4422 #if __arm64e__
4423 						} else if (copy_entry->used_for_tpro) {
4424 							is_writable = true;
4425 #endif /* __arm64e__ */
4426 						}
4427 						if (!copy_entry->needs_copy && is_writable) {
4428 							vm_prot_t prot;
4429 
4430 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4431 							vm_object_pmap_protect(copy_object,
4432 							    copy_offset,
4433 							    copy_size,
4434 							    PMAP_NULL,
4435 							    PAGE_SIZE,
4436 							    0,
4437 							    prot);
4438 						}
4439 						copy_entry->needs_copy = FALSE;
4440 						copy_entry->is_shared = TRUE;
4441 						copy_object = VME_OBJECT(copy_entry);
4442 						copy_offset = VME_OFFSET(copy_entry);
4443 						vm_object_lock(copy_object);
4444 						/* we're about to make a shared mapping of this object */
4445 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4446 						VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4447 						vm_object_unlock(copy_object);
4448 					}
4449 
4450 					if (copy_object != VM_OBJECT_NULL &&
4451 					    copy_object->named &&
4452 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4453 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4454 						memory_object_t pager;
4455 						vm_prot_t       pager_prot;
4456 
4457 						/*
4458 						 * For "named" VM objects, let the pager know that the
4459 						 * memory object is being mapped.  Some pagers need to keep
4460 						 * track of this, to know when they can reclaim the memory
4461 						 * object, for example.
4462 						 * VM calls memory_object_map() for each mapping (specifying
4463 						 * the protection of each mapping) and calls
4464 						 * memory_object_last_unmap() when all the mappings are gone.
4465 						 */
4466 						pager_prot = max_protection;
4467 						if (copy) {
4468 							/*
4469 							 * Copy-On-Write mapping: won't modify the
4470 							 * memory object.
4471 							 */
4472 							pager_prot &= ~VM_PROT_WRITE;
4473 						}
4474 						vm_object_lock(copy_object);
4475 						pager = copy_object->pager;
4476 						if (copy_object->named &&
4477 						    pager != MEMORY_OBJECT_NULL &&
4478 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4479 							assert(copy_object->pager_ready);
4480 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4481 							/*
4482 							 * Object might have lost its pager
4483 							 * while waiting.
4484 							 */
4485 							pager = copy_object->pager;
4486 							if (copy_object->named &&
4487 							    pager != MEMORY_OBJECT_NULL) {
4488 								vm_object_mapping_begin(copy_object);
4489 								vm_object_unlock(copy_object);
4490 
4491 								kr = memory_object_map(pager, pager_prot);
4492 								assert(kr == KERN_SUCCESS);
4493 
4494 								vm_object_lock(copy_object);
4495 								vm_object_mapping_end(copy_object);
4496 							}
4497 						}
4498 						vm_object_unlock(copy_object);
4499 					}
4500 
4501 					/*
4502 					 *	Perform the copy if requested
4503 					 */
4504 
4505 					if (copy && copy_object != VM_OBJECT_NULL) {
4506 						vm_object_t             new_object;
4507 						vm_object_offset_t      new_offset;
4508 
4509 						result = vm_object_copy_strategically(copy_object, copy_offset,
4510 						    copy_size,
4511 						    false,                                   /* forking */
4512 						    &new_object, &new_offset,
4513 						    &do_copy);
4514 
4515 
4516 						if (result == KERN_MEMORY_RESTART_COPY) {
4517 							boolean_t success;
4518 							boolean_t src_needs_copy;
4519 
4520 							/*
4521 							 * XXX
4522 							 * We currently ignore src_needs_copy.
4523 							 * This really is the issue of how to make
4524 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4525 							 * non-kernel users to use. Solution forthcoming.
4526 							 * In the meantime, since we don't allow non-kernel
4527 							 * memory managers to specify symmetric copy,
4528 							 * we won't run into problems here.
4529 							 */
4530 							new_object = copy_object;
4531 							new_offset = copy_offset;
4532 							success = vm_object_copy_quickly(new_object,
4533 							    new_offset,
4534 							    copy_size,
4535 							    &src_needs_copy,
4536 							    &do_copy);
4537 							assert(success);
4538 							result = KERN_SUCCESS;
4539 						}
4540 						if (result != KERN_SUCCESS) {
4541 							kr = result;
4542 							break;
4543 						}
4544 
4545 						copy_object = new_object;
4546 						copy_offset = new_offset;
4547 						/*
4548 						 * No extra object reference for the mapping:
4549 						 * the mapping should be the only thing keeping
4550 						 * this new object alive.
4551 						 */
4552 					} else {
4553 						/*
4554 						 * We already have the right object
4555 						 * to map.
4556 						 */
4557 						copy_object = VME_OBJECT(copy_entry);
4558 						/* take an extra ref for the mapping below */
4559 						vm_object_reference(copy_object);
4560 					}
4561 				}
4562 
4563 				/*
4564 				 * If the caller does not want a specific
4565 				 * tag for this new mapping:  use
4566 				 * the tag of the original mapping.
4567 				 */
4568 				vm_map_kernel_flags_t vmk_remap_flags = {
4569 					.vmkf_submap = copy_entry->is_sub_map,
4570 				};
4571 
4572 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4573 				    vm_map_kernel_flags_vmflags(vmk_flags),
4574 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4575 
4576 				/* over-map the object into destination */
4577 				vmk_remap_flags.vmf_fixed = true;
4578 				vmk_remap_flags.vmf_overwrite = true;
4579 
4580 				if (!copy && !copy_entry->is_sub_map) {
4581 					/*
4582 					 * copy-on-write should have been
4583 					 * resolved at this point, or we would
4584 					 * end up sharing instead of copying.
4585 					 */
4586 					assert(!copy_entry->needs_copy);
4587 				}
4588 #if XNU_TARGET_OS_OSX
4589 				if (copy_entry->used_for_jit) {
4590 					vmk_remap_flags.vmkf_map_jit = TRUE;
4591 				}
4592 #endif /* XNU_TARGET_OS_OSX */
4593 
4594 				kr = vm_map_enter(target_map,
4595 				    &copy_addr,
4596 				    copy_size,
4597 				    (vm_map_offset_t) 0,
4598 				    vmk_remap_flags,
4599 				    copy_object,
4600 				    copy_offset,
4601 				    ((copy_object == NULL)
4602 				    ? FALSE
4603 				    : (copy || copy_entry->needs_copy)),
4604 				    cur_protection,
4605 				    max_protection,
4606 				    inheritance);
4607 				if (kr != KERN_SUCCESS) {
4608 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4609 					if (copy_entry->is_sub_map) {
4610 						vm_map_deallocate(copy_submap);
4611 					} else {
4612 						vm_object_deallocate(copy_object);
4613 					}
4614 					/* abort */
4615 					break;
4616 				}
4617 
4618 				/* next mapping */
4619 				copy_addr += copy_size;
4620 			}
4621 
4622 			named_entry_unlock(named_entry);
4623 			if (target_copy_map != copy_map) {
4624 				vm_map_copy_discard(target_copy_map);
4625 				target_copy_map = VM_MAP_COPY_NULL;
4626 			}
4627 
4628 			if (kr == KERN_SUCCESS) {
4629 				if (overmap_start) {
4630 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4631 				}
4632 				offset_in_mapping += overmap_start;
4633 			} else if (!vmk_flags.vmf_overwrite) {
4634 				/* deallocate the contiguous range */
4635 				vm_map_remove(target_map, map_addr,
4636 				    map_addr + map_size);
4637 			}
4638 			result = kr;
4639 			goto out;
4640 		}
4641 
4642 		if (named_entry->is_object) {
4643 			unsigned int    access;
4644 			unsigned int    wimg_mode;
4645 
4646 			assert(!named_entry->is_copy);
4647 			assert(!named_entry->is_sub_map);
4648 
4649 			/* we are mapping a VM object */
4650 
4651 			access = named_entry->access;
4652 
4653 			if (vmk_flags.vmf_return_data_addr ||
4654 			    vmk_flags.vmf_return_4k_data_addr) {
4655 				offset_in_mapping = obj_offs & map_mask;
4656 				if (vmk_flags.vmf_return_4k_data_addr) {
4657 					offset_in_mapping &= ~((signed)(0xFFF));
4658 				}
4659 				obj_offs -= offset_in_mapping;
4660 				map_size  = vm_map_round_page(initial_size +
4661 				    offset_in_mapping, map_mask);
4662 			}
4663 
4664 			object = vm_named_entry_to_vm_object(named_entry);
4665 			assert(object != VM_OBJECT_NULL);
4666 			vm_object_lock(object);
4667 			named_entry_unlock(named_entry);
4668 
4669 			wimg_mode = object->wimg_bits;
4670 			vm_prot_to_wimg(access, &wimg_mode);
4671 			if (object->wimg_bits != wimg_mode) {
4672 				vm_object_change_wimg_mode(object, wimg_mode);
4673 			}
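			/*
			 * "access" presumably carries the caller-requested
			 * cache attributes for this named entry;
			 * vm_prot_to_wimg() folds them into WIMG bits and the
			 * object is only retargeted when the bits actually
			 * change.
			 */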
4674 
4675 			vm_object_reference_locked(object);
4676 			vm_object_unlock(object);
4677 		} else {
4678 			panic("invalid VM named entry %p", named_entry);
4679 		}
4680 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4681 		/*
4682 		 * JMM - This is temporary until we unify named entries
4683 		 * and raw memory objects.
4684 		 *
4685 		 * Detected fake ip_kotype for a memory object.  In
4686 		 * this case, the port isn't really a port at all, but
4687 		 * instead is just a raw memory object.
4688 		 */
4689 		if (vmk_flags.vmf_return_data_addr ||
4690 		    vmk_flags.vmf_return_4k_data_addr) {
4691 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4692 		}
4693 
4694 		object = memory_object_to_vm_object((memory_object_t)port);
4695 		if (object == VM_OBJECT_NULL) {
4696 			return KERN_INVALID_OBJECT;
4697 		}
4698 		vm_object_reference(object);
4699 
4700 		/* wait for object (if any) to be ready */
4701 		if (object != VM_OBJECT_NULL) {
4702 			if (is_kernel_object(object)) {
4703 				printf("Warning: Attempt to map kernel object"
4704 				    " by a non-private kernel entity\n");
4705 				return KERN_INVALID_OBJECT;
4706 			}
4707 			if (!object->pager_ready) {
4708 				vm_object_lock(object);
4709 
4710 				while (!object->pager_ready) {
4711 					vm_object_sleep(object,
4712 					    VM_OBJECT_EVENT_PAGER_READY,
4713 					    THREAD_UNINT,
4714 					    LCK_SLEEP_EXCLUSIVE);
4715 				}
4716 				vm_object_unlock(object);
4717 			}
4718 		}
4719 	} else {
4720 		return KERN_INVALID_OBJECT;
4721 	}
4722 
4723 	if (object != VM_OBJECT_NULL &&
4724 	    object->named &&
4725 	    object->pager != MEMORY_OBJECT_NULL &&
4726 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4727 		memory_object_t pager;
4728 		vm_prot_t       pager_prot;
4729 		kern_return_t   kr;
4730 
4731 		/*
4732 		 * For "named" VM objects, let the pager know that the
4733 		 * memory object is being mapped.  Some pagers need to keep
4734 		 * track of this, to know when they can reclaim the memory
4735 		 * object, for example.
4736 		 * VM calls memory_object_map() for each mapping (specifying
4737 		 * the protection of each mapping) and calls
4738 		 * memory_object_last_unmap() when all the mappings are gone.
4739 		 */
4740 		pager_prot = max_protection;
4741 		if (copy) {
4742 			/*
4743 			 * Copy-On-Write mapping: won't modify the
4744 			 * memory object.
4745 			 */
4746 			pager_prot &= ~VM_PROT_WRITE;
4747 		}
4748 		vm_object_lock(object);
4749 		pager = object->pager;
4750 		if (object->named &&
4751 		    pager != MEMORY_OBJECT_NULL &&
4752 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4753 			assert(object->pager_ready);
4754 			vm_object_mapping_wait(object, THREAD_UNINT);
4755 			/* object might have lost its pager while waiting */
4756 			pager = object->pager;
4757 			if (object->named && pager != MEMORY_OBJECT_NULL) {
4758 				vm_object_mapping_begin(object);
4759 				vm_object_unlock(object);
4760 
4761 				kr = memory_object_map(pager, pager_prot);
4762 				assert(kr == KERN_SUCCESS);
4763 
4764 				vm_object_lock(object);
4765 				vm_object_mapping_end(object);
4766 			}
4767 		}
4768 		vm_object_unlock(object);
4769 	}
4770 
4771 	/*
4772 	 *	Perform the copy if requested
4773 	 */
4774 
4775 	if (copy) {
4776 		vm_object_t             new_object;
4777 		vm_object_offset_t      new_offset;
4778 
4779 		result = vm_object_copy_strategically(object,
4780 		    obj_offs,
4781 		    map_size,
4782 		    false,                                   /* forking */
4783 		    &new_object, &new_offset,
4784 		    &copy);
4785 
4786 
4787 		if (result == KERN_MEMORY_RESTART_COPY) {
4788 			boolean_t success;
4789 			boolean_t src_needs_copy;
4790 
4791 			/*
4792 			 * XXX
4793 			 * We currently ignore src_needs_copy.
4794 			 * This really is the issue of how to make
4795 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4796 			 * non-kernel users to use. Solution forthcoming.
4797 			 * In the meantime, since we don't allow non-kernel
4798 			 * memory managers to specify symmetric copy,
4799 			 * we won't run into problems here.
4800 			 */
4801 			new_object = object;
4802 			new_offset = obj_offs;
4803 			success = vm_object_copy_quickly(new_object,
4804 			    new_offset,
4805 			    map_size,
4806 			    &src_needs_copy,
4807 			    &copy);
4808 			assert(success);
4809 			result = KERN_SUCCESS;
4810 		}
4811 		/*
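		/*
		 * Sketch of the fallback above: KERN_MEMORY_RESTART_COPY
		 * indicates the strategic copy could not be completed as
		 * such, so the code retries with vm_object_copy_quickly() on
		 * the original object/offset and treats that as success.
		 */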
4812 		 *	Throw away the reference to the
4813 		 *	original object, as it won't be mapped.
4814 		 */
4815 
4816 		vm_object_deallocate(object);
4817 
4818 		if (result != KERN_SUCCESS) {
4819 			return result;
4820 		}
4821 
4822 		object   = new_object;
4823 		obj_offs = new_offset;
4824 	}
4825 
4826 	/*
4827 	 * If non-kernel users want to try to prefault pages, the mapping and the
4828 	 * prefault need to be atomic.
4829 	 */
4830 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4831 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
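	/*
	 * For a non-kernel target map the prefault must happen atomically
	 * with the mapping, so vm_map_enter() is asked to return with the
	 * map still locked (vmkf_keep_map_locked) and the lock is dropped
	 * after the prefault loop below; kernel maps prefault without
	 * PMAP_OPTIONS_NOWAIT instead.
	 */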
4832 
4833 	result = vm_map_enter(target_map,
4834 	    &map_addr, map_size,
4835 	    (vm_map_offset_t)mask,
4836 	    vmk_flags,
4837 	    object, obj_offs,
4838 	    copy,
4839 	    cur_protection, max_protection,
4840 	    inheritance);
4841 	if (result != KERN_SUCCESS) {
4842 		vm_object_deallocate(object);
4843 	}
4844 
4845 	/*
4846 	 * Try to prefault, and do not forget to release the vm map lock.
4847 	 */
4848 	if (result == KERN_SUCCESS && try_prefault) {
4849 		mach_vm_address_t va = map_addr;
4850 		kern_return_t kr = KERN_SUCCESS;
4851 		unsigned int i = 0;
4852 		int pmap_options;
4853 
4854 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4855 
4856 		for (i = 0; i < page_list_count; ++i) {
4857 			if (!UPL_VALID_PAGE(page_list, i)) {
4858 				if (kernel_prefault) {
4859 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4860 					result = KERN_MEMORY_ERROR;
4861 					break;
4862 				}
4863 			} else {
4864 				/*
4865 				 * If this call fails, we should stop trying to
4866 				 * optimize; the remaining calls are likely to
4867 				 * fail too.
4868 				 *
4869 				 * We are not going to report an error for such
4870 				 * a failure, though: this is an optimization,
4871 				 * not something critical.
4872 				 */
4873 				kr = pmap_enter_object_options_check(target_map->pmap,
4874 				    va, 0, object, UPL_PHYS_PAGE(page_list, i),
4875 				    cur_protection, VM_PROT_NONE,
4876 				    TRUE, pmap_options);
4877 				if (kr != KERN_SUCCESS) {
4878 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4879 					if (kernel_prefault) {
4880 						result = kr;
4881 					}
4882 					break;
4883 				}
4884 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4885 			}
4886 
4887 			/* Next virtual address */
4888 			va += PAGE_SIZE;
4889 		}
4890 		if (vmk_flags.vmkf_keep_map_locked) {
4891 			vm_map_unlock(target_map);
4892 		}
4893 	}
4894 
4895 out:
4896 	if (result == KERN_SUCCESS) {
4897 #if KASAN
4898 		if (target_map->pmap == kernel_pmap) {
4899 			kasan_notify_address(map_addr, map_size);
4900 		}
4901 #endif
4902 		*address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4903 	}
4904 	return result;
4905 }
4906 
4907 kern_return_t
4908 vm_map_enter_mem_object_prefault(
4909 	vm_map_t                target_map,
4910 	vm_map_offset_ut       *address,
4911 	vm_map_size_ut          initial_size,
4912 	vm_map_offset_ut        mask,
4913 	vm_map_kernel_flags_t   vmk_flags,
4914 	ipc_port_t              port,
4915 	vm_object_offset_ut     offset,
4916 	vm_prot_ut              cur_protection,
4917 	vm_prot_ut              max_protection,
4918 	upl_page_list_ptr_t     page_list,
4919 	unsigned int            page_list_count)
4920 {
4921 	/* range_id is set by vm_map_enter_mem_object */
4922 	return vm_map_enter_mem_object(target_map,
4923 	           address,
4924 	           initial_size,
4925 	           mask,
4926 	           vmk_flags,
4927 	           port,
4928 	           offset,
4929 	           FALSE,
4930 	           cur_protection,
4931 	           max_protection,
4932 	           VM_INHERIT_DEFAULT,
4933 	           page_list,
4934 	           page_list_count);
4935 }
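/*
 * Illustrative sketch (editorial comment only, not part of the build):
 * how a kernel caller might use vm_map_enter_mem_object_prefault() to
 * map a memory-entry port and prefault the physical pages described by
 * a UPL page list.  "map", "mem_entry_port", "page_list", "page_count",
 * "size_u", "mask_u", "offset_u", "vmk_flags", "cur_prot_u" and
 * "max_prot_u" are hypothetical placeholders, not identifiers defined
 * in this file.
 *
 *	vm_map_offset_ut addr_u = vm_sanitize_wrap_addr(0);
 *	kern_return_t kr;
 *
 *	kr = vm_map_enter_mem_object_prefault(map, &addr_u, size_u, mask_u,
 *	    vmk_flags, mem_entry_port, offset_u,
 *	    cur_prot_u, max_prot_u, page_list, page_count);
 *
 * On success, the valid pages in "page_list" have already been entered
 * in the target pmap: prefaulting is best effort for user maps (the
 * prefault loop above bails out on the first pmap_enter failure) and
 * mandatory for kernel maps, where a failure is reported to the caller.
 */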
4936 
4937 static __attribute__((always_inline, warn_unused_result))
4938 kern_return_t
4939 vm_map_enter_mem_object_control_sanitize(
4940 	vm_map_t                target_map,
4941 	vm_map_offset_ut        address_u,
4942 	vm_map_size_ut          initial_size_u,
4943 	vm_map_offset_ut        mask_u,
4944 	vm_object_offset_ut     offset_u,
4945 	vm_prot_ut              cur_protection_u,
4946 	vm_prot_ut              max_protection_u,
4947 	vm_inherit_ut           inheritance_u,
4948 	vm_map_kernel_flags_t   vmk_flags,
4949 	vm_map_address_t       *map_addr,
4950 	vm_map_size_t          *map_size,
4951 	vm_map_offset_t        *mask,
4952 	vm_object_offset_t     *obj_offs,
4953 	vm_object_offset_t     *obj_end,
4954 	vm_object_size_t       *obj_size,
4955 	vm_prot_t              *cur_protection,
4956 	vm_prot_t              *max_protection,
4957 	vm_inherit_t           *inheritance)
4958 {
4959 	kern_return_t           kr;
4960 
4961 	kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4962 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4963 	    cur_protection, max_protection);
4964 	if (__improbable(kr != KERN_SUCCESS)) {
4965 		return kr;
4966 	}
4967 
4968 	kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4969 	    inheritance);
4970 	if (__improbable(kr != KERN_SUCCESS)) {
4971 		return kr;
4972 	}
4973 
4974 	kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4975 	if (__improbable(kr != KERN_SUCCESS)) {
4976 		return kr;
4977 	}
4978 	/*
4979 	 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4980 	 * pages).
4981 	 * We keep unaligned values for now. The call we eventually make to
4982 	 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4983 	 * target_map pages or kernel pages. But this isn't enough to guarantee
4984 	 * kernel space alignment.
4985 	 */
4986 	kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4987 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4988 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4989 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4990 	    obj_offs, obj_end, obj_size);
4991 	if (__improbable(kr != KERN_SUCCESS)) {
4992 		return kr;
4993 	}
4994 
4995 	/*
4996 	 * There is no vm_sanitize_addr_size variant that also adjusts for
4997 	 * a separate offset. Rather than create one for this one-off issue,
4998 	 * we sanitize map_addr and map_size individually, relying on
4999 	 * vm_sanitize_size to incorporate the offset. Then, we perform the
5000 	 * overflow check manually below.
5001 	 */
5002 	*map_addr = vm_sanitize_addr(target_map, address_u);
5003 	kr = vm_sanitize_size(offset_u, initial_size_u,
5004 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
5005 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
5006 	if (__improbable(kr != KERN_SUCCESS)) {
5007 		return kr;
5008 	}
5009 
5010 	/*
5011 	 * Ensure arithmetic doesn't overflow in target_map space.
5012 	 * The computation of map_size above accounts for the possibility that
5013 	 * offset_u might be unaligned in target_map space.
5014 	 */
5015 	if (vmk_flags.vmf_fixed) {
5016 		vm_map_address_t map_end;
5017 
5018 		if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
5019 			return KERN_INVALID_ARGUMENT;
5020 		}
5021 	}
5022 
5023 	return KERN_SUCCESS;
5024 }
5025 
5026 kern_return_t
5027 vm_map_enter_mem_object_control(
5028 	vm_map_t                target_map,
5029 	vm_map_offset_ut       *address_u,
5030 	vm_map_size_ut          initial_size_u,
5031 	vm_map_offset_ut        mask_u,
5032 	vm_map_kernel_flags_t   vmk_flags,
5033 	memory_object_control_t control,
5034 	vm_object_offset_ut     offset_u,
5035 	boolean_t               needs_copy,
5036 	vm_prot_ut              cur_protection_u,
5037 	vm_prot_ut              max_protection_u,
5038 	vm_inherit_ut           inheritance_u)
5039 {
5040 	vm_map_offset_t         mask;
5041 	vm_prot_t               cur_protection;
5042 	vm_prot_t               max_protection;
5043 	vm_inherit_t            inheritance;
5044 	vm_map_address_t        map_addr;
5045 	vm_map_size_t           map_size;
5046 	vm_object_t             object;
5047 	vm_object_offset_t      obj_offs, obj_end;
5048 	vm_object_size_t        obj_size;
5049 	kern_return_t           result;
5050 	memory_object_t         pager;
5051 	vm_prot_t               pager_prot;
5052 	kern_return_t           kr;
5053 
5054 	/*
5055 	 * Check arguments for validity
5056 	 */
5057 	if (target_map == VM_MAP_NULL) {
5058 		return KERN_INVALID_ARGUMENT;
5059 	}
5060 
5061 	/*
5062 	 * We only support vmf_return_data_addr-like behavior.
5063 	 */
5064 	vmk_flags.vmf_return_data_addr = true;
5065 
5066 	/*
5067 	 * Sanitize any input parameters that are addr/size/prot/inherit
5068 	 */
5069 	kr = vm_map_enter_mem_object_control_sanitize(target_map,
5070 	    *address_u,
5071 	    initial_size_u,
5072 	    mask_u,
5073 	    offset_u,
5074 	    cur_protection_u,
5075 	    max_protection_u,
5076 	    inheritance_u,
5077 	    vmk_flags,
5078 	    &map_addr,
5079 	    &map_size,
5080 	    &mask,
5081 	    &obj_offs,
5082 	    &obj_end,
5083 	    &obj_size,
5084 	    &cur_protection,
5085 	    &max_protection,
5086 	    &inheritance);
5087 	if (__improbable(kr != KERN_SUCCESS)) {
5088 		return vm_sanitize_get_kr(kr);
5089 	}
5090 
5091 	object = memory_object_control_to_vm_object(control);
5092 
5093 	if (object == VM_OBJECT_NULL) {
5094 		return KERN_INVALID_OBJECT;
5095 	}
5096 
5097 	if (is_kernel_object(object)) {
5098 		printf("Warning: Attempt to map kernel object"
5099 		    " by a non-private kernel entity\n");
5100 		return KERN_INVALID_OBJECT;
5101 	}
5102 
5103 	vm_object_lock(object);
5104 	os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);
5105 
5106 
5107 	/*
5108 	 * For "named" VM objects, let the pager know that the
5109 	 * memory object is being mapped.  Some pagers need to keep
5110 	 * track of this, to know when they can reclaim the memory
5111 	 * object, for example.
5112 	 * VM calls memory_object_map() for each mapping (specifying
5113 	 * the protection of each mapping) and calls
5114 	 * memory_object_last_unmap() when all the mappings are gone.
5115 	 */
5116 	pager_prot = max_protection;
5117 	if (needs_copy) {
5118 		pager_prot &= ~VM_PROT_WRITE;
5119 	}
5120 	pager = object->pager;
5121 	if (object->named &&
5122 	    pager != MEMORY_OBJECT_NULL &&
5123 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5124 		assert(object->pager_ready);
5125 		vm_object_mapping_wait(object, THREAD_UNINT);
5126 		/* object might have lost its pager while waiting */
5127 		pager = object->pager;
5128 		if (object->named && pager != MEMORY_OBJECT_NULL) {
5129 			vm_object_mapping_begin(object);
5130 			vm_object_unlock(object);
5131 
5132 			kr = memory_object_map(pager, pager_prot);
5133 			assert(kr == KERN_SUCCESS);
5134 
5135 			vm_object_lock(object);
5136 			vm_object_mapping_end(object);
5137 		}
5138 	}
5139 	vm_object_unlock(object);
5140 
5141 	/*
5142 	 *	Perform the copy if requested
5143 	 */
5144 
5145 	if (needs_copy) {
5146 		vm_object_t             new_object;
5147 		vm_object_offset_t      new_offset;
5148 
5149 		result = vm_object_copy_strategically(object, obj_offs, obj_size,
5150 		    false,                                   /* forking */
5151 		    &new_object, &new_offset,
5152 		    &needs_copy);
5153 
5154 
5155 		if (result == KERN_MEMORY_RESTART_COPY) {
5156 			boolean_t success;
5157 			boolean_t src_needs_copy;
5158 
5159 			/*
5160 			 * XXX
5161 			 * We currently ignore src_needs_copy.
5162 			 * This really is the issue of how to make
5163 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5164 			 * non-kernel users to use. Solution forthcoming.
5165 			 * In the meantime, since we don't allow non-kernel
5166 			 * memory managers to specify symmetric copy,
5167 			 * we won't run into problems here.
5168 			 */
5169 			new_object = object;
5170 			new_offset = obj_offs;
5171 			success = vm_object_copy_quickly(new_object,
5172 			    new_offset, obj_size,
5173 			    &src_needs_copy,
5174 			    &needs_copy);
5175 			assert(success);
5176 			result = KERN_SUCCESS;
5177 		}
5178 		/*
5179 		 *	Throw away the reference to the
5180 		 *	original object, as it won't be mapped.
5181 		 */
5182 
5183 		vm_object_deallocate(object);
5184 
5185 		if (result != KERN_SUCCESS) {
5186 			return result;
5187 		}
5188 
5189 		object   = new_object;
5190 		obj_offs = new_offset;
5191 	}
5192 
5193 	result = vm_map_enter(target_map,
5194 	    &map_addr, map_size,
5195 	    (vm_map_offset_t)mask,
5196 	    vmk_flags,
5197 	    object,
5198 	    obj_offs,
5199 	    needs_copy,
5200 	    cur_protection, max_protection,
5201 	    inheritance);
5202 
5203 	if (result == KERN_SUCCESS) {
5204 		*address_u = vm_sanitize_wrap_addr(
5205 			map_addr + (obj_offs & vm_map_page_mask(target_map)));
5206 	} else {
5207 		vm_object_deallocate(object);
5208 	}
5209 
5210 	return result;
5211 }
5212 
5213 
5214 /* Not used without nested pmaps */
5215 #ifndef NO_NESTED_PMAP
5216 /*
5217  * Clip and unnest a portion of a nested submap mapping.
5218  */
5219 
5220 
5221 static void
5222 vm_map_clip_unnest(
5223 	vm_map_t        map,
5224 	vm_map_entry_t  entry,
5225 	vm_map_offset_t start_unnest,
5226 	vm_map_offset_t end_unnest)
5227 {
5228 	vm_map_offset_t old_start_unnest = start_unnest;
5229 	vm_map_offset_t old_end_unnest = end_unnest;
5230 
5231 	assert(entry->is_sub_map);
5232 	assert(VME_SUBMAP(entry) != NULL);
5233 	assert(entry->use_pmap);
5234 
5235 	/*
5236 	 * Query the platform for the optimal unnest range.
5237 	 * DRK: There's some duplication of effort here, since
5238 	 * callers may have adjusted the range to some extent. This
5239 	 * routine was introduced to support 1GiB subtree nesting
5240 	 * for x86 platforms, which can also nest on 2MiB boundaries
5241 	 * depending on size/alignment.
5242 	 */
5243 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5244 		assert(VME_SUBMAP(entry)->is_nested_map);
5245 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5246 		log_unnest_badness(map,
5247 		    old_start_unnest,
5248 		    old_end_unnest,
5249 		    VME_SUBMAP(entry)->is_nested_map,
5250 		    (entry->vme_start +
5251 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5252 		    VME_OFFSET(entry)));
5253 	}
5254 
5255 	if (entry->vme_start > start_unnest ||
5256 	    entry->vme_end < end_unnest) {
5257 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5258 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5259 		    (long long)start_unnest, (long long)end_unnest,
5260 		    (long long)entry->vme_start, (long long)entry->vme_end);
5261 	}
5262 
5263 	if (start_unnest > entry->vme_start) {
5264 		_vm_map_clip_start(&map->hdr,
5265 		    entry,
5266 		    start_unnest);
5267 		if (map->holelistenabled) {
5268 			vm_map_store_update_first_free(map, NULL, FALSE);
5269 		} else {
5270 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5271 		}
5272 	}
5273 	if (entry->vme_end > end_unnest) {
5274 		_vm_map_clip_end(&map->hdr,
5275 		    entry,
5276 		    end_unnest);
5277 		if (map->holelistenabled) {
5278 			vm_map_store_update_first_free(map, NULL, FALSE);
5279 		} else {
5280 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5281 		}
5282 	}
5283 
5284 	pmap_unnest(map->pmap,
5285 	    entry->vme_start,
5286 	    entry->vme_end - entry->vme_start);
5287 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5288 		/* clean up parent map/maps */
5289 		vm_map_submap_pmap_clean(
5290 			map, entry->vme_start,
5291 			entry->vme_end,
5292 			VME_SUBMAP(entry),
5293 			VME_OFFSET(entry));
5294 	}
5295 	entry->use_pmap = FALSE;
5296 	if ((map->pmap != kernel_pmap) &&
5297 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5298 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5299 	}
5300 }
5301 #endif  /* NO_NESTED_PMAP */
5302 
5303 __abortlike
5304 static void
5305 __vm_map_clip_atomic_entry_panic(
5306 	vm_map_t        map,
5307 	vm_map_entry_t  entry,
5308 	vm_map_offset_t where)
5309 {
5310 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5311 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5312 	    (uint64_t)entry->vme_start,
5313 	    (uint64_t)entry->vme_end,
5314 	    (uint64_t)where);
5315 }
5316 
5317 /*
5318  *	vm_map_clip_start:	[ internal use only ]
5319  *
5320  *	Asserts that the given entry begins at or after
5321  *	the specified address; if necessary,
5322  *	it splits the entry into two.
5323  */
5324 void
5325 vm_map_clip_start(
5326 	vm_map_t        map,
5327 	vm_map_entry_t  entry,
5328 	vm_map_offset_t startaddr)
5329 {
5330 #ifndef NO_NESTED_PMAP
5331 	if (entry->is_sub_map &&
5332 	    entry->use_pmap &&
5333 	    startaddr >= entry->vme_start) {
5334 		vm_map_offset_t start_unnest, end_unnest;
5335 
5336 		/*
5337 		 * Make sure "startaddr" is no longer in a nested range
5338 		 * before we clip.  Unnest only the minimum range the platform
5339 		 * can handle.
5340 		 * vm_map_clip_unnest may perform additional adjustments to
5341 		 * the unnest range.
5342 		 */
5343 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5344 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5345 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5346 	}
5347 #endif /* NO_NESTED_PMAP */
5348 	if (startaddr > entry->vme_start) {
5349 		if (!entry->is_sub_map &&
5350 		    VME_OBJECT(entry) &&
5351 		    VME_OBJECT(entry)->phys_contiguous) {
5352 			pmap_remove(map->pmap,
5353 			    (addr64_t)(entry->vme_start),
5354 			    (addr64_t)(entry->vme_end));
5355 		}
5356 		if (entry->vme_atomic) {
5357 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5358 		}
5359 
5360 		DTRACE_VM5(
5361 			vm_map_clip_start,
5362 			vm_map_t, map,
5363 			vm_map_offset_t, entry->vme_start,
5364 			vm_map_offset_t, entry->vme_end,
5365 			vm_map_offset_t, startaddr,
5366 			int, VME_ALIAS(entry));
5367 
5368 		_vm_map_clip_start(&map->hdr, entry, startaddr);
5369 		if (map->holelistenabled) {
5370 			vm_map_store_update_first_free(map, NULL, FALSE);
5371 		} else {
5372 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5373 		}
5374 	}
5375 }
5376 
5377 
5378 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5379 	MACRO_BEGIN \
5380 	if ((startaddr) > (entry)->vme_start) \
5381 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5382 	MACRO_END
5383 
5384 /*
5385  *	This routine is called only when it is known that
5386  *	the entry must be split.
5387  */
5388 static void
5389 _vm_map_clip_start(
5390 	struct vm_map_header    *map_header,
5391 	vm_map_entry_t          entry,
5392 	vm_map_offset_t         start)
5393 {
5394 	vm_map_entry_t  new_entry;
5395 
5396 	/*
5397 	 *	Split off the front portion --
5398 	 *	note that we must insert the new
5399 	 *	entry BEFORE this one, so that
5400 	 *	this entry has the specified starting
5401 	 *	address.
5402 	 */
5403 
5404 	if (entry->map_aligned) {
5405 		assert(VM_MAP_PAGE_ALIGNED(start,
5406 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5407 	}
5408 
5409 	new_entry = _vm_map_entry_create(map_header);
5410 	vm_map_entry_copy_full(new_entry, entry);
5411 
5412 	new_entry->vme_end = start;
5413 	assert(new_entry->vme_start < new_entry->vme_end);
5414 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5415 	if (__improbable(start >= entry->vme_end)) {
5416 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5417 	}
5418 	assert(start < entry->vme_end);
5419 	entry->vme_start = start;
5420 
5421 #if VM_BTLOG_TAGS
5422 	if (new_entry->vme_kernel_object) {
5423 		btref_retain(new_entry->vme_tag_btref);
5424 	}
5425 #endif /* VM_BTLOG_TAGS */
5426 
5427 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5428 
5429 	if (entry->is_sub_map) {
5430 		vm_map_reference(VME_SUBMAP(new_entry));
5431 	} else {
5432 		vm_object_reference(VME_OBJECT(new_entry));
5433 	}
5434 }
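/*
 * Editorial sketch of what _vm_map_clip_start() does, with hypothetical
 * numbers.  Given an entry covering [0x4000, 0xC000) and a clip address
 * of 0x6000, the front portion is split off into a new entry inserted
 * before the original, and the original is advanced so that it starts
 * exactly at the clip address:
 *
 *	before:  entry      [0x4000 ---------------------- 0xC000)
 *	after:   new_entry  [0x4000 ---- 0x6000)
 *	         entry                   [0x6000 ---------- 0xC000)
 *
 * VME_OFFSET(entry) is bumped by (0x6000 - 0x4000) so that the surviving
 * entry still maps the same portion of its object or submap, and the new
 * entry takes its own reference on that object or submap.
 */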
5435 
5436 
5437 /*
5438  *	vm_map_clip_end:	[ internal use only ]
5439  *
5440  *	Asserts that the given entry ends at or before
5441  *	the specified address; if necessary,
5442  *	it splits the entry into two.
5443  */
5444 void
5445 vm_map_clip_end(
5446 	vm_map_t        map,
5447 	vm_map_entry_t  entry,
5448 	vm_map_offset_t endaddr)
5449 {
5450 	if (endaddr > entry->vme_end) {
5451 		/*
5452 		 * Within the scope of this clipping, limit "endaddr" to
5453 		 * the end of this map entry...
5454 		 */
5455 		endaddr = entry->vme_end;
5456 	}
5457 #ifndef NO_NESTED_PMAP
5458 	if (entry->is_sub_map && entry->use_pmap) {
5459 		vm_map_offset_t start_unnest, end_unnest;
5460 
5461 		/*
5462 		 * Make sure the range between the start of this entry and
5463 		 * the new "endaddr" is no longer nested before we clip.
5464 		 * Unnest only the minimum range the platform can handle.
5465 		 * vm_map_clip_unnest may perform additional adjustments to
5466 		 * the unnest range.
5467 		 */
5468 		start_unnest = entry->vme_start;
5469 		end_unnest =
5470 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5471 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5472 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5473 	}
5474 #endif /* NO_NESTED_PMAP */
5475 	if (endaddr < entry->vme_end) {
5476 		if (!entry->is_sub_map &&
5477 		    VME_OBJECT(entry) &&
5478 		    VME_OBJECT(entry)->phys_contiguous) {
5479 			pmap_remove(map->pmap,
5480 			    (addr64_t)(entry->vme_start),
5481 			    (addr64_t)(entry->vme_end));
5482 		}
5483 		if (entry->vme_atomic) {
5484 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5485 		}
5486 		DTRACE_VM5(
5487 			vm_map_clip_end,
5488 			vm_map_t, map,
5489 			vm_map_offset_t, entry->vme_start,
5490 			vm_map_offset_t, entry->vme_end,
5491 			vm_map_offset_t, endaddr,
5492 			int, VME_ALIAS(entry));
5493 
5494 		_vm_map_clip_end(&map->hdr, entry, endaddr);
5495 		if (map->holelistenabled) {
5496 			vm_map_store_update_first_free(map, NULL, FALSE);
5497 		} else {
5498 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5499 		}
5500 	}
5501 }
5502 
5503 
5504 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5505 	MACRO_BEGIN \
5506 	if ((endaddr) < (entry)->vme_end) \
5507 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5508 	MACRO_END
5509 
5510 /*
5511  *	This routine is called only when it is known that
5512  *	the entry must be split.
5513  */
5514 static void
5515 _vm_map_clip_end(
5516 	struct vm_map_header    *map_header,
5517 	vm_map_entry_t          entry,
5518 	vm_map_offset_t         end)
5519 {
5520 	vm_map_entry_t  new_entry;
5521 
5522 	/*
5523 	 *	Create a new entry and insert it
5524 	 *	AFTER the specified entry
5525 	 */
5526 
5527 	if (entry->map_aligned) {
5528 		assert(VM_MAP_PAGE_ALIGNED(end,
5529 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5530 	}
5531 
5532 	new_entry = _vm_map_entry_create(map_header);
5533 	vm_map_entry_copy_full(new_entry, entry);
5534 
5535 	if (__improbable(end <= entry->vme_start)) {
5536 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5537 	}
5538 	assert(entry->vme_start < end);
5539 	new_entry->vme_start = entry->vme_end = end;
5540 	VME_OFFSET_SET(new_entry,
5541 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5542 	assert(new_entry->vme_start < new_entry->vme_end);
5543 
5544 #if VM_BTLOG_TAGS
5545 	if (new_entry->vme_kernel_object) {
5546 		btref_retain(new_entry->vme_tag_btref);
5547 	}
5548 #endif /* VM_BTLOG_TAGS */
5549 
5550 	_vm_map_store_entry_link(map_header, entry, new_entry);
5551 
5552 	if (entry->is_sub_map) {
5553 		vm_map_reference(VME_SUBMAP(new_entry));
5554 	} else {
5555 		vm_object_reference(VME_OBJECT(new_entry));
5556 	}
5557 }
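/*
 * Editorial sketch of what _vm_map_clip_end() does, with hypothetical
 * numbers.  Given an entry covering [0x4000, 0xC000) and a clip address
 * of 0xA000, the tail portion is split off into a new entry inserted
 * after the original, which now ends exactly at the clip address:
 *
 *	before:  entry      [0x4000 ---------------------- 0xC000)
 *	after:   entry      [0x4000 ------------ 0xA000)
 *	         new_entry                       [0xA000 -- 0xC000)
 *
 * The new entry's VME_OFFSET is advanced by (0xA000 - 0x4000), and the
 * new entry takes its own reference on the underlying object or submap.
 */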
5558 
5559 
5560 /*
5561  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5562  *
5563  *	Asserts that the starting and ending region
5564  *	addresses fall within the valid range of the map.
5565  */
5566 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5567 	MACRO_BEGIN                             \
5568 	if (start < vm_map_min(map))            \
5569 	        start = vm_map_min(map);        \
5570 	if (end > vm_map_max(map))              \
5571 	        end = vm_map_max(map);          \
5572 	if (start > end)                        \
5573 	        start = end;                    \
5574 	MACRO_END
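/*
 * Editorial worked example (hypothetical values): for a map whose valid
 * range is [0x100000, 0x700000), VM_MAP_RANGE_CHECK clamps as follows:
 *
 *	start = 0x080000, end = 0x200000  ->  start = 0x100000, end = 0x200000
 *	start = 0x600000, end = 0x800000  ->  start = 0x600000, end = 0x700000
 *	start = 0x750000, end = 0x760000  ->  start = end = 0x700000 (empty)
 *
 * The macro silently clamps rather than failing; callers that need a
 * hard error use vm_map_range_check() below or the sanitizers instead.
 */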
5575 
5576 /*
5577  *	vm_map_range_check:	[ internal use only ]
5578  *
5579  *	Check that the region defined by the specified start and
5580  *	end addresses is wholly contained within a single map
5581  *	entry or set of adjacent map entries of the specified map,
5582  *	i.e. the specified region contains no unmapped space.
5583  *	If any or all of the region is unmapped, FALSE is returned.
5584  *	Otherwise, TRUE is returned and if the output argument 'entry'
5585  *	is not NULL it points to the map entry containing the start
5586  *	of the region.
5587  *
5588  *	The map is locked for reading on entry and is left locked.
5589  */
5590 static boolean_t
5591 vm_map_range_check(
5592 	vm_map_t                map,
5593 	vm_map_offset_t         start,
5594 	vm_map_offset_t         end,
5595 	vm_map_entry_t          *entry)
5596 {
5597 	vm_map_entry_t          cur;
5598 	vm_map_offset_t         prev;
5599 
5600 	/*
5601 	 *      Basic sanity checks first
5602 	 */
5603 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5604 		return FALSE;
5605 	}
5606 
5607 	/*
5608 	 *      Check first if the region starts within a valid
5609 	 *	mapping for the map.
5610 	 */
5611 	if (!vm_map_lookup_entry(map, start, &cur)) {
5612 		return FALSE;
5613 	}
5614 
5615 	/*
5616 	 *	Optimize for the case that the region is contained
5617 	 *	in a single map entry.
5618 	 */
5619 	if (entry != (vm_map_entry_t *) NULL) {
5620 		*entry = cur;
5621 	}
5622 	if (end <= cur->vme_end) {
5623 		return TRUE;
5624 	}
5625 
5626 	/*
5627 	 *      If the region is not wholly contained within a
5628 	 *      single entry, walk the entries looking for holes.
5629 	 */
5630 	prev = cur->vme_end;
5631 	cur = cur->vme_next;
5632 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5633 		if (end <= cur->vme_end) {
5634 			return TRUE;
5635 		}
5636 		prev = cur->vme_end;
5637 		cur = cur->vme_next;
5638 	}
5639 	return FALSE;
5640 }
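/*
 * Editorial sketch (hypothetical layout): vm_map_range_check() succeeds
 * only when the requested range is covered by contiguous entries.  For a
 * map containing entries [0x1000, 0x3000) and [0x3000, 0x6000), with
 * unmapped space starting at 0x6000:
 *
 *	range [0x1000, 0x2000)  ->  TRUE   (single entry)
 *	range [0x2000, 0x5000)  ->  TRUE   (two adjacent entries)
 *	range [0x5000, 0x7000)  ->  FALSE  (runs into the hole)
 *	range [0x7000, 0x7800)  ->  FALSE  (starts in unmapped space)
 */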
5641 
5642 static __attribute__((always_inline, warn_unused_result))
5643 kern_return_t
5644 vm_map_protect_sanitize(
5645 	vm_map_t                map,
5646 	vm_map_offset_ut        start_u,
5647 	vm_map_offset_ut        end_u,
5648 	vm_prot_ut              new_prot_u,
5649 	vm_map_offset_t        *start,
5650 	vm_map_offset_t        *end,
5651 	vm_prot_t              *new_prot)
5652 {
5653 	kern_return_t           kr;
5654 	vm_map_size_t           size;
5655 
5656 	kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5657 	    map, VM_PROT_COPY, new_prot);
5658 	if (__improbable(kr != KERN_SUCCESS)) {
5659 		return kr;
5660 	}
5661 
5662 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5663 	    map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
5664 	if (__improbable(kr != KERN_SUCCESS)) {
5665 		return kr;
5666 	}
5667 
5668 	return KERN_SUCCESS;
5669 }
5670 
5671 /*
5672  *	vm_map_protect:
5673  *
5674  *	Sets the protection of the specified address
5675  *	region in the target map.  If "set_max" is
5676  *	specified, the maximum protection is to be set;
5677  *	otherwise, only the current protection is affected.
5678  */
5679 kern_return_t
5680 vm_map_protect(
5681 	vm_map_t                map,
5682 	vm_map_offset_ut        start_u,
5683 	vm_map_offset_ut        end_u,
5684 	boolean_t               set_max,
5685 	vm_prot_ut              new_prot_u)
5686 {
5687 	vm_map_entry_t                  current;
5688 	vm_map_offset_t                 prev;
5689 	vm_map_entry_t                  entry;
5690 	vm_prot_t                       new_prot;
5691 	vm_prot_t                       new_max;
5692 	int                             pmap_options = 0;
5693 	kern_return_t                   kr;
5694 	vm_map_offset_t                 start, original_start;
5695 	vm_map_offset_t                 end;
5696 
5697 	kr = vm_map_protect_sanitize(map,
5698 	    start_u,
5699 	    end_u,
5700 	    new_prot_u,
5701 	    &start,
5702 	    &end,
5703 	    &new_prot);
5704 	if (__improbable(kr != KERN_SUCCESS)) {
5705 		return vm_sanitize_get_kr(kr);
5706 	}
5707 	original_start = start;
5708 
5709 	if (new_prot & VM_PROT_COPY) {
5710 		vm_map_offset_t         new_start;
5711 		vm_prot_t               cur_prot, max_prot;
5712 		vm_map_kernel_flags_t   kflags;
5713 
5714 		/* LP64todo - see below */
5715 		if (start >= map->max_offset) {
5716 			return KERN_INVALID_ADDRESS;
5717 		}
5718 
5719 		if ((new_prot & VM_PROT_ALLEXEC) &&
5720 		    map->pmap != kernel_pmap &&
5721 		    (vm_map_cs_enforcement(map)
5722 #if XNU_TARGET_OS_OSX && __arm64__
5723 		    || !VM_MAP_IS_EXOTIC(map)
5724 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5725 		    ) &&
5726 		    VM_MAP_POLICY_WX_FAIL(map)) {
5727 			DTRACE_VM3(cs_wx,
5728 			    uint64_t, (uint64_t) start,
5729 			    uint64_t, (uint64_t) end,
5730 			    vm_prot_t, new_prot);
5731 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5732 			    proc_selfpid(),
5733 			    (get_bsdtask_info(current_task())
5734 			    ? proc_name_address(get_bsdtask_info(current_task()))
5735 			    : "?"),
5736 			    __FUNCTION__, __LINE__,
5737 #if DEVELOPMENT || DEBUG
5738 			    (uint64_t)start,
5739 			    (uint64_t)end,
5740 #else /* DEVELOPMENT || DEBUG */
5741 			    (uint64_t)0,
5742 			    (uint64_t)0,
5743 #endif /* DEVELOPMENT || DEBUG */
5744 			    new_prot);
5745 			return KERN_PROTECTION_FAILURE;
5746 		}
5747 
5748 		/*
5749 		 * Let vm_map_remap_extract() know that it will need to:
5750 		 * + make a copy of the mapping
5751 		 * + add VM_PROT_WRITE to the max protections
5752 		 * + remove any protections that are no longer allowed from the
5753 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
5754 		 *   example).
5755 		 * Note that "max_prot" is an IN/OUT parameter only for this
5756 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
5757 		 * only.
5758 		 */
5759 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5760 		cur_prot = VM_PROT_NONE;
5761 		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5762 		kflags.vmkf_remap_prot_copy = true;
5763 		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5764 		new_start = start;
5765 		kr = vm_map_remap(map,
5766 		    vm_sanitize_wrap_addr_ref(&new_start),
5767 		    end - start,
5768 		    0, /* mask */
5769 		    kflags,
5770 		    map,
5771 		    start,
5772 		    TRUE, /* copy-on-write remapping! */
5773 		    vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5774 		    vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5775 		    VM_INHERIT_DEFAULT);
5776 		if (kr != KERN_SUCCESS) {
5777 			return kr;
5778 		}
5779 		new_prot &= ~VM_PROT_COPY;
5780 	}
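	/*
	 * Editorial note with hypothetical values: a vm_protect(...,
	 * VM_PROT_COPY | VM_PROT_READ | VM_PROT_WRITE) request reaches this
	 * point with new_prot = (VM_PROT_READ | VM_PROT_WRITE | VM_PROT_COPY).
	 * The remap above replaces the range with a copy-on-write copy of
	 * itself, using max_prot = (VM_PROT_READ | VM_PROT_WRITE) as the
	 * IN/OUT ceiling, and once that succeeds VM_PROT_COPY is stripped so
	 * that the loop below applies a plain read/write protection to the
	 * freshly remapped entries.
	 */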
5781 
5782 	vm_map_lock(map);
5783 restart_after_unlock:
5784 
5785 	/* LP64todo - remove this check when vm_map_commpage64()
5786 	 * no longer has to stuff in a map_entry for the commpage
5787 	 * above the map's max_offset.
5788 	 */
5789 	if (start >= map->max_offset) {
5790 		vm_map_unlock(map);
5791 		return KERN_INVALID_ADDRESS;
5792 	}
5793 
5794 	while (1) {
5795 		/*
5796 		 *      Lookup the entry.  If it doesn't start in a valid
5797 		 *	entry, return an error.
5798 		 */
5799 		if (!vm_map_lookup_entry(map, start, &entry)) {
5800 			vm_map_unlock(map);
5801 			return KERN_INVALID_ADDRESS;
5802 		}
5803 
5804 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5805 			start = SUPERPAGE_ROUND_DOWN(start);
5806 			continue;
5807 		}
5808 		break;
5809 	}
5810 	if (entry->superpage_size) {
5811 		end = SUPERPAGE_ROUND_UP(end);
5812 	}
5813 
5814 	/*
5815 	 *	Make a first pass to check for protection and address
5816 	 *	violations.
5817 	 */
5818 
5819 	current = entry;
5820 	prev = current->vme_start;
5821 	while ((current != vm_map_to_entry(map)) &&
5822 	    (current->vme_start < end)) {
5823 		/*
5824 		 * If there is a hole, return an error.
5825 		 */
5826 		if (current->vme_start != prev) {
5827 			vm_map_unlock(map);
5828 			return KERN_INVALID_ADDRESS;
5829 		}
5830 
5831 		new_max = current->max_protection;
5832 
5833 #if defined(__x86_64__)
5834 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5835 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5836 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5837 		}
5838 #elif CODE_SIGNING_MONITOR
5839 		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5840 			new_max |= VM_PROT_EXECUTE;
5841 		}
5842 #endif
5843 		if ((new_prot & new_max) != new_prot) {
5844 			vm_map_unlock(map);
5845 			return KERN_PROTECTION_FAILURE;
5846 		}
5847 
5848 		if (current->used_for_jit &&
5849 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5850 			vm_map_unlock(map);
5851 			return KERN_PROTECTION_FAILURE;
5852 		}
5853 
5854 #if __arm64e__
5855 		/* Disallow protecting hw assisted TPRO mappings */
5856 		if (current->used_for_tpro) {
5857 			vm_map_unlock(map);
5858 			return KERN_PROTECTION_FAILURE;
5859 		}
5860 #endif /* __arm64e__ */
5861 
5862 
5863 		if ((new_prot & VM_PROT_WRITE) &&
5864 		    (new_prot & VM_PROT_ALLEXEC) &&
5865 #if XNU_TARGET_OS_OSX
5866 		    map->pmap != kernel_pmap &&
5867 		    (vm_map_cs_enforcement(map)
5868 #if __arm64__
5869 		    || !VM_MAP_IS_EXOTIC(map)
5870 #endif /* __arm64__ */
5871 		    ) &&
5872 #endif /* XNU_TARGET_OS_OSX */
5873 #if CODE_SIGNING_MONITOR
5874 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5875 #endif
5876 		    !(current->used_for_jit)) {
5877 			DTRACE_VM3(cs_wx,
5878 			    uint64_t, (uint64_t) current->vme_start,
5879 			    uint64_t, (uint64_t) current->vme_end,
5880 			    vm_prot_t, new_prot);
5881 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5882 			    proc_selfpid(),
5883 			    (get_bsdtask_info(current_task())
5884 			    ? proc_name_address(get_bsdtask_info(current_task()))
5885 			    : "?"),
5886 			    __FUNCTION__, __LINE__,
5887 #if DEVELOPMENT || DEBUG
5888 			    (uint64_t)current->vme_start,
5889 			    (uint64_t)current->vme_end,
5890 #else /* DEVELOPMENT || DEBUG */
5891 			    (uint64_t)0,
5892 			    (uint64_t)0,
5893 #endif /* DEVELOPMENT || DEBUG */
5894 			    new_prot);
5895 			new_prot &= ~VM_PROT_ALLEXEC;
5896 			if (VM_MAP_POLICY_WX_FAIL(map)) {
5897 				vm_map_unlock(map);
5898 				return KERN_PROTECTION_FAILURE;
5899 			}
5900 		}
5901 
5902 		/*
5903 		 * If the task has requested executable lockdown,
5904 		 * deny both:
5905 		 * - adding executable protections OR
5906 		 * - adding write protections to an existing executable mapping.
5907 		 */
5908 		if (map->map_disallow_new_exec == TRUE) {
5909 			if ((new_prot & VM_PROT_ALLEXEC) ||
5910 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5911 				vm_map_unlock(map);
5912 				return KERN_PROTECTION_FAILURE;
5913 			}
5914 		}
5915 
5916 		prev = current->vme_end;
5917 		current = current->vme_next;
5918 	}
5919 
5920 #if __arm64__
5921 	if (end > prev &&
5922 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5923 		vm_map_entry_t prev_entry;
5924 
5925 		prev_entry = current->vme_prev;
5926 		if (prev_entry != vm_map_to_entry(map) &&
5927 		    !prev_entry->map_aligned &&
5928 		    (vm_map_round_page(prev_entry->vme_end,
5929 		    VM_MAP_PAGE_MASK(map))
5930 		    == end)) {
5931 			/*
5932 			 * The last entry in our range is not "map-aligned"
5933 			 * but it would have reached all the way to "end"
5934 			 * if it had been map-aligned, so this is not really
5935 			 * a hole in the range and we can proceed.
5936 			 */
5937 			prev = end;
5938 		}
5939 	}
5940 #endif /* __arm64__ */
5941 
5942 	if (end > prev) {
5943 		vm_map_unlock(map);
5944 		return KERN_INVALID_ADDRESS;
5945 	}
5946 
5947 	/*
5948 	 *	Go back and fix up protections.
5949 	 *	Clip to start here if the range starts within
5950 	 *	the entry.
5951 	 */
5952 
5953 	current = entry;
5954 	if (current != vm_map_to_entry(map)) {
5955 		/* clip and unnest if necessary */
5956 		vm_map_clip_start(map, current, start);
5957 	}
5958 
5959 	while ((current != vm_map_to_entry(map)) &&
5960 	    (current->vme_start < end)) {
5961 		vm_prot_t       old_prot;
5962 
5963 		if (current->in_transition) {
5964 			wait_result_t wait_result;
5965 			vm_map_offset_t current_start;
5966 
5967 			/*
5968 			 * Another thread is wiring/unwiring this entry.
5969 			 * Let the other thread know we are waiting.
5970 			 */
5971 			current_start = current->vme_start;
5972 			current->needs_wakeup = true;
5973 			/* wait for the other thread to be done */
5974 			wait_result = vm_map_entry_wait(map, TH_UNINT);
5975 			/*
5976 			 * We unlocked the map, so anything could have changed in the
5977 			 * range and we need to re-check from "current_start" to "end".
5978 			 * Our entries might no longer be valid.
5979 			 */
5980 			current = NULL;
5981 			entry = NULL;
5982 			/*
5983 			 * Re-lookup and re-clip "current_start".
5984 			 * If it's no longer mapped, start with the next mapping.
5985 			 */
5986 			vm_map_lookup_entry_or_next(map, current_start, &current);
5987 			if (current != vm_map_to_entry(map)) {
5988 				vm_map_clip_start(map, current, current_start);
5989 			}
5990 			/* restart from this point */
5991 			start = current_start;
5992 			goto restart_after_unlock;
5993 		}
5994 
5995 		vm_map_clip_end(map, current, end);
5996 
5997 #if DEVELOPMENT || DEBUG
5998 		if (current->csm_associated && vm_log_xnu_user_debug) {
5999 			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
6000 			    proc_selfpid(),
6001 			    (get_bsdtask_info(current_task())
6002 			    ? proc_name_address(get_bsdtask_info(current_task()))
6003 			    : "?"),
6004 			    __FUNCTION__,
6005 			    (uint64_t)start,
6006 			    (uint64_t)end,
6007 			    new_prot,
6008 			    map, current,
6009 			    current->vme_start,
6010 			    current->vme_end,
6011 			    current->protection,
6012 			    current->max_protection);
6013 		}
6014 #endif /* DEVELOPMENT || DEBUG */
6015 
6016 		if (current->is_sub_map) {
6017 			/* clipping did unnest if needed */
6018 			assert(!current->use_pmap);
6019 		}
6020 
6021 		old_prot = current->protection;
6022 
6023 		if (set_max) {
6024 			current->max_protection = new_prot;
6025 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6026 			current->protection = (new_prot & old_prot);
6027 		} else {
6028 			current->protection = new_prot;
6029 		}
6030 
6031 #if CODE_SIGNING_MONITOR
6032 		if (/* a !csm_associated mapping becoming executable */
6033 			((!current->csm_associated &&
6034 			!(old_prot & VM_PROT_EXECUTE) &&
6035 			(current->protection & VM_PROT_EXECUTE))
6036 			||
6037 			/* a csm_associated mapping becoming writable */
6038 			(current->csm_associated &&
6039 			!(old_prot & VM_PROT_WRITE) &&
6040 			(current->protection & VM_PROT_WRITE)))) {
6041 			/*
6042 			 * This mapping has not already been marked as
6043 			 * "user_debug" and it is either:
6044 			 * 1. not code-signing-monitored and becoming executable
6045 			 * 2. code-signing-monitored and becoming writable,
6046 			 * so inform the CodeSigningMonitor and mark the
6047 			 * mapping as "user_debug" if appropriate.
6048 			 */
6049 			vm_map_kernel_flags_t vmk_flags;
6050 			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6051 			/* pretend it's a vm_protect(VM_PROT_COPY)... */
6052 			vmk_flags.vmkf_remap_prot_copy = true;
6053 			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6054 #if DEVELOPMENT || DEBUG
6055 			if (vm_log_xnu_user_debug) {
6056 				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6057 				    proc_selfpid(),
6058 				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6059 				    __FUNCTION__, __LINE__,
6060 				    map, current,
6061 				    current->vme_start, current->vme_end,
6062 				    old_prot, current->protection,
6063 				    kr, current->vme_xnu_user_debug);
6064 			}
6065 #endif /* DEVELOPMENT || DEBUG */
6066 		}
6067 #endif /* CODE_SIGNING_MONITOR */
6068 
6069 		/*
6070 		 *	Update physical map if necessary.
6071 		 *	If the request is to turn off write protection,
6072 		 *	we won't do it for real (in pmap). This is because
6073 		 *	it would cause copy-on-write to fail.  We've already
6074 		 *	set the new protection in the map, so if a
6075 		 *	write-protect fault occurs, it will be fixed up
6076 		 *	properly, COW or not.
6077 		 */
6078 		if (current->protection != old_prot) {
6079 			/* Look one level in: we support nested pmaps */
6080 			/* from mapped submaps which are direct entries */
6081 			/* in our map. */
6082 
6083 			vm_prot_t prot;
6084 
6085 			prot = current->protection;
6086 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6087 				prot &= ~VM_PROT_WRITE;
6088 			} else {
6089 				assert(!VME_OBJECT(current)->code_signed);
6090 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6091 				if (prot & VM_PROT_WRITE) {
6092 					/*
6093 					 * For write requests on the
6094 					 * compressor, we will ask the
6095 					 * pmap layer to prevent us from
6096 					 * taking a write fault when we
6097 					 * attempt to access the mapping
6098 					 * next.
6099 					 */
6100 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6101 				}
6102 			}
6103 
6104 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6105 				prot |= VM_PROT_EXECUTE;
6106 			}
6107 
6108 #if DEVELOPMENT || DEBUG
6109 			if (!(old_prot & VM_PROT_EXECUTE) &&
6110 			    (prot & VM_PROT_EXECUTE) &&
6111 			    panic_on_unsigned_execute &&
6112 			    (proc_selfcsflags() & CS_KILL)) {
6113 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6114 			}
6115 #endif /* DEVELOPMENT || DEBUG */
6116 
6117 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6118 				if (current->wired_count) {
6119 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6120 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6121 				}
6122 
6123 				/* If the pmap layer cares about this
6124 				 * protection type, force a fault for
6125 				 * each page so that vm_fault will
6126 				 * repopulate the page with the full
6127 				 * set of protections.
6128 				 */
6129 				/*
6130 				 * TODO: We don't seem to need this,
6131 				 * but this is due to an internal
6132 				 * implementation detail of
6133 				 * pmap_protect.  Do we want to rely
6134 				 * on this?
6135 				 */
6136 				prot = VM_PROT_NONE;
6137 			}
6138 
6139 			if (current->is_sub_map && current->use_pmap) {
6140 				pmap_protect(VME_SUBMAP(current)->pmap,
6141 				    current->vme_start,
6142 				    current->vme_end,
6143 				    prot);
6144 			} else {
6145 				pmap_protect_options(map->pmap,
6146 				    current->vme_start,
6147 				    current->vme_end,
6148 				    prot,
6149 				    pmap_options,
6150 				    NULL);
6151 			}
6152 		}
6153 		current = current->vme_next;
6154 	}
6155 
6156 	if (entry == VM_MAP_ENTRY_NULL) {
6157 		/*
6158 		 * Re-lookup the original start of our range.
6159 		 * If it's no longer mapped, start with the next mapping.
6160 		 */
6161 		vm_map_lookup_entry_or_next(map, original_start, &entry);
6162 	}
6163 	current = entry;
6164 	while ((current != vm_map_to_entry(map)) &&
6165 	    (current->vme_start <= end)) {
6166 		vm_map_simplify_entry(map, current);
6167 		current = current->vme_next;
6168 	}
6169 
6170 	vm_map_unlock(map);
6171 	return KERN_SUCCESS;
6172 }
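/*
 * Illustrative sketch (editorial comment only, not part of the build):
 * making a hypothetical range of a task's map read-only.  "task_map",
 * "start", "end" and the wrapped protection value "new_prot_u"
 * (carrying VM_PROT_READ) are placeholders.
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_protect(task_map,
 *	    vm_sanitize_wrap_addr(start),
 *	    vm_sanitize_wrap_addr(end),
 *	    FALSE,
 *	    new_prot_u);
 *
 * With set_max == FALSE only the current protection changes; passing
 * TRUE instead lowers the maximum protection and masks the current
 * protection down to (new_prot & old_prot), as done in the loop above.
 */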
6173 
6174 static __attribute__((always_inline, warn_unused_result))
6175 kern_return_t
6176 vm_map_inherit_sanitize(
6177 	vm_map_t                        map,
6178 	vm_map_offset_ut                start_u,
6179 	vm_map_offset_ut                end_u,
6180 	vm_inherit_ut                   new_inheritance_u,
6181 	vm_map_offset_t                *start,
6182 	vm_map_offset_t                *end,
6183 	vm_inherit_t                   *new_inheritance)
6184 {
6185 	kern_return_t   kr;
6186 	vm_map_size_t   size;
6187 
6188 	kr = vm_sanitize_inherit(new_inheritance_u,
6189 	    VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6190 	if (__improbable(kr != KERN_SUCCESS)) {
6191 		return kr;
6192 	}
6193 
6194 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6195 	    map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
6196 	if (__improbable(kr != KERN_SUCCESS)) {
6197 		return kr;
6198 	}
6199 
6200 	return KERN_SUCCESS;
6201 }
6202 
6203 /*
6204  *	vm_map_inherit:
6205  *
6206  *	Sets the inheritance of the specified address
6207  *	range in the target map.  Inheritance
6208  *	affects how the map will be shared with
6209  *	child maps at the time of vm_map_fork.
6210  */
6211 kern_return_t
6212 vm_map_inherit(
6213 	vm_map_t                        map,
6214 	vm_map_offset_ut                start_u,
6215 	vm_map_offset_ut                end_u,
6216 	vm_inherit_ut                   new_inheritance_u)
6217 {
6218 	vm_map_entry_t  entry;
6219 	vm_map_entry_t  temp_entry;
6220 	kern_return_t   kr;
6221 	vm_map_offset_t start;
6222 	vm_map_offset_t end;
6223 	vm_inherit_t    new_inheritance;
6224 
6225 	kr = vm_map_inherit_sanitize(map,
6226 	    start_u,
6227 	    end_u,
6228 	    new_inheritance_u,
6229 	    &start,
6230 	    &end,
6231 	    &new_inheritance);
6232 	if (__improbable(kr != KERN_SUCCESS)) {
6233 		return vm_sanitize_get_kr(kr);
6234 	}
6235 
6236 	vm_map_lock(map);
6237 
6238 	VM_MAP_RANGE_CHECK(map, start, end);
6239 
6240 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6241 		entry = temp_entry;
6242 	} else {
6243 		temp_entry = temp_entry->vme_next;
6244 		entry = temp_entry;
6245 	}
6246 
6247 	/* first check entire range for entries which can't support the */
6248 	/* given inheritance. */
6249 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6250 		if (entry->is_sub_map) {
6251 			if (new_inheritance == VM_INHERIT_COPY) {
6252 				vm_map_unlock(map);
6253 				return KERN_INVALID_ARGUMENT;
6254 			}
6255 		}
6256 
6257 		entry = entry->vme_next;
6258 	}
6259 
6260 	entry = temp_entry;
6261 	if (entry != vm_map_to_entry(map)) {
6262 		/* clip and unnest if necessary */
6263 		vm_map_clip_start(map, entry, start);
6264 	}
6265 
6266 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6267 		vm_map_clip_end(map, entry, end);
6268 		if (entry->is_sub_map) {
6269 			/* clip did unnest if needed */
6270 			assert(!entry->use_pmap);
6271 		}
6272 
6273 		entry->inheritance = new_inheritance;
6274 
6275 		entry = entry->vme_next;
6276 	}
6277 
6278 	vm_map_unlock(map);
6279 	return KERN_SUCCESS;
6280 }
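/*
 * Illustrative sketch (editorial comment only, not part of the build):
 * marking a hypothetical range so that it does not appear in children
 * created by vm_map_fork().  "task_map", "start", "end" and the wrapped
 * inheritance value "inherit_u" (carrying VM_INHERIT_NONE) are
 * placeholders.
 *
 *	kr = vm_map_inherit(task_map,
 *	    vm_sanitize_wrap_addr(start),
 *	    vm_sanitize_wrap_addr(end),
 *	    inherit_u);
 *
 * VM_INHERIT_SHARE keeps the range shared with the child,
 * VM_INHERIT_COPY gives the child a copy-on-write copy (and is rejected
 * above for submap entries), and VM_INHERIT_NONE leaves the range
 * unmapped in the child.
 */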
6281 
6282 /*
6283  * Update the accounting for the amount of wired memory in this map.  If the user has
6284  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6285  */
6286 
6287 static kern_return_t
6288 add_wire_counts(
6289 	vm_map_t        map,
6290 	vm_map_entry_t  entry,
6291 	boolean_t       user_wire)
6292 {
6293 	vm_map_size_t   size;
6294 
6295 	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6296 
6297 	if (user_wire) {
6298 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6299 
6300 		/*
6301 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6302 		 * this map entry.
6303 		 */
6304 
6305 		if (entry->user_wired_count == 0) {
6306 			size = entry->vme_end - entry->vme_start;
6307 
6308 			/*
6309 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6310 			 * exceeding the user wire limits.  There is a per-map limit, which is the smaller of the
6311 			 * process's rlimit and the global vm_per_task_user_wire_limit that caps it.  There is also
6312 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6313 			 * limit, then we fail.
6314 			 */
6315 
6316 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6317 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6318 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6319 #if DEVELOPMENT || DEBUG
6320 					if (panic_on_mlock_failure) {
6321 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6322 					}
6323 #endif /* DEVELOPMENT || DEBUG */
6324 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6325 				} else {
6326 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6327 #if DEVELOPMENT || DEBUG
6328 					if (panic_on_mlock_failure) {
6329 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6330 					}
6331 #endif /* DEVELOPMENT || DEBUG */
6332 				}
6333 				return KERN_RESOURCE_SHORTAGE;
6334 			}
6335 
6336 			/*
6337 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6338 			 * the total that has been wired in the map.
6339 			 */
6340 
6341 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6342 				return KERN_FAILURE;
6343 			}
6344 
6345 			entry->wired_count++;
6346 			map->user_wire_size += size;
6347 		}
6348 
6349 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6350 			return KERN_FAILURE;
6351 		}
6352 
6353 		entry->user_wired_count++;
6354 	} else {
6355 		/*
6356 		 * The kernel's wiring the memory.  Just bump the count and continue.
6357 		 */
6358 
6359 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6360 			panic("vm_map_wire: too many wirings");
6361 		}
6362 
6363 		entry->wired_count++;
6364 	}
6365 
6366 	if (first_wire) {
6367 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6368 	}
6369 
6370 	return KERN_SUCCESS;
6371 }
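/*
 * Editorial worked example with hypothetical numbers: suppose
 * vm_per_task_user_wire_limit is 64 MB, the process rlimit allows 32 MB,
 * the map already accounts for user_wire_size = 30 MB, and the entry
 * being wired is 4 MB.  The per-map check is
 *
 *	size + map->user_wire_size  >  MIN(user_wire_limit, vm_per_task_user_wire_limit)
 *	4 MB + 30 MB = 34 MB        >  MIN(32 MB, 64 MB) = 32 MB
 *
 * so the request fails with KERN_RESOURCE_SHORTAGE even though the
 * system-wide check against vm_global_user_wire_limit would have passed.
 * Kernel wirings skip these checks entirely and only panic if
 * wired_count would exceed MAX_WIRE_COUNT.
 */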
6372 
6373 /*
6374  * Update the memory wiring accounting now that the given map entry is being unwired.
6375  */
6376 
6377 static void
6378 subtract_wire_counts(
6379 	vm_map_t        map,
6380 	vm_map_entry_t  entry,
6381 	boolean_t       user_wire)
6382 {
6383 	if (user_wire) {
6384 		/*
6385 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6386 		 */
6387 
6388 		if (entry->user_wired_count == 1) {
6389 			/*
6390 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6391 			 * user wired memory for this map.
6392 			 */
6393 
6394 			assert(entry->wired_count >= 1);
6395 			entry->wired_count--;
6396 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6397 		}
6398 
6399 		assert(entry->user_wired_count >= 1);
6400 		entry->user_wired_count--;
6401 	} else {
6402 		/*
6403 		 * The kernel is unwiring the memory.   Just update the count.
6404 		 */
6405 
6406 		assert(entry->wired_count >= 1);
6407 		entry->wired_count--;
6408 	}
6409 
6410 	vme_btref_consider_and_put(entry);
6411 }
6412 
6413 int cs_executable_wire = 0;
6414 
6415 static kern_return_t
6416 vm_map_wire_nested(
6417 	vm_map_t                map,
6418 	vm_map_offset_t         start,
6419 	vm_map_offset_t         end,
6420 	vm_prot_t               caller_prot,
6421 	vm_tag_t                tag,
6422 	boolean_t               user_wire,
6423 	pmap_t                  map_pmap,
6424 	vm_map_offset_t         pmap_addr,
6425 	ppnum_t                *physpage_p)
6426 {
6427 	vm_map_entry_t          entry;
6428 	vm_prot_t               access_type;
6429 	struct vm_map_entry     *first_entry, tmp_entry;
6430 	vm_map_t                real_map;
6431 	vm_map_offset_t         s, e;
6432 	kern_return_t           rc;
6433 	boolean_t               need_wakeup;
6434 	boolean_t               main_map = FALSE;
6435 	wait_interrupt_t        interruptible_state;
6436 	thread_t                cur_thread;
6437 	unsigned int            last_timestamp;
6438 	vm_map_size_t           size;
6439 	boolean_t               wire_and_extract;
6440 	vm_prot_t               extra_prots;
6441 
6442 	extra_prots = VM_PROT_COPY;
6443 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6444 #if XNU_TARGET_OS_OSX
6445 	if (map->pmap == kernel_pmap ||
6446 	    !vm_map_cs_enforcement(map)) {
6447 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6448 	}
6449 #endif /* XNU_TARGET_OS_OSX */
6450 #if CODE_SIGNING_MONITOR
6451 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6452 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6453 	}
6454 #endif /* CODE_SIGNING_MONITOR */
6455 
6456 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6457 
6458 	wire_and_extract = FALSE;
6459 	if (physpage_p != NULL) {
6460 		/*
6461 		 * The caller wants the physical page number of the
6462 		 * wired page.  We return only one physical page number
6463 		 * so this works for only one page at a time.
6464 		 *
6465 		 * The only caller (vm_map_wire_and_extract)
6466 		 * guarantees it.
6467 		 */
6468 		assert(end - start == VM_MAP_PAGE_SIZE(map));
6469 		wire_and_extract = TRUE;
6470 		*physpage_p = 0;
6471 	}
6472 
6473 	VM_MAP_RANGE_CHECK(map, start, end);
6474 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6475 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6476 	if (start == end) {
6477 		/* We wired what the caller asked for, zero pages */
6478 		return KERN_SUCCESS;
6479 	}
6480 
6481 	vm_map_lock(map);
6482 	if (map_pmap == NULL) {
6483 		main_map = TRUE;
6484 	}
6485 	last_timestamp = map->timestamp;
6486 
6487 	need_wakeup = FALSE;
6488 	cur_thread = current_thread();
6489 
6490 	s = start;
6491 	rc = KERN_SUCCESS;
6492 
6493 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6494 		entry = first_entry;
6495 		/*
6496 		 * vm_map_clip_start will be done later.
6497 		 * We don't want to unnest any nested submaps here !
6498 		 */
6499 	} else {
6500 		/* Start address is not in map */
6501 		rc = KERN_INVALID_ADDRESS;
6502 		goto done;
6503 	}
6504 
6505 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6506 		/*
6507 		 * At this point, we have wired from "start" to "s".
6508 		 * We still need to wire from "s" to "end".
6509 		 *
6510 		 * "entry" hasn't been clipped, so it could start before "s"
6511 		 * and/or end after "end".
6512 		 */
6513 
6514 		/* "e" is how far we want to wire in this entry */
6515 		e = entry->vme_end;
6516 		if (e > end) {
6517 			e = end;
6518 		}
6519 
6520 		/*
6521 		 * If another thread is wiring/unwiring this entry then
6522 		 * block after informing other thread to wake us up.
6523 		 */
6524 		if (entry->in_transition) {
6525 			wait_result_t wait_result;
6526 
6527 			/*
6528 			 * We have not clipped the entry.  Make sure that
6529 			 * the start address is in range so that the lookup
6530 			 * below will succeed.
6531 			 * "s" is the current starting point: we've already
6532 			 * wired from "start" to "s" and we still have
6533 			 * to wire from "s" to "end".
6534 			 */
6535 
6536 			entry->needs_wakeup = TRUE;
6537 
6538 			/*
6539 			 * wake up anybody waiting on entries that we have
6540 			 * already wired.
6541 			 */
6542 			if (need_wakeup) {
6543 				vm_map_entry_wakeup(map);
6544 				need_wakeup = FALSE;
6545 			}
6546 			/*
6547 			 * User wiring is interruptible
6548 			 */
6549 			wait_result = vm_map_entry_wait(map,
6550 			    (user_wire) ? THREAD_ABORTSAFE :
6551 			    THREAD_UNINT);
6552 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6553 				/*
6554 				 * undo the wirings we have done so far
6555 				 * We do not clear the needs_wakeup flag,
6556 				 * because we cannot tell if we were the
6557 				 * only one waiting.
6558 				 */
6559 				rc = KERN_FAILURE;
6560 				goto done;
6561 			}
6562 
6563 			/*
6564 			 * Cannot avoid a lookup here. reset timestamp.
6565 			 */
6566 			last_timestamp = map->timestamp;
6567 
6568 			/*
6569 			 * The entry could have been clipped, look it up again.
6570 			 * Worst that can happen is it may not exist anymore.
6571 			 */
6572 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6573 				/*
6574 				 * User: undo everything up to the previous
6575 				 * entry.  let vm_map_unwire worry about
6576 				 * checking the validity of the range.
6577 				 */
6578 				rc = KERN_FAILURE;
6579 				goto done;
6580 			}
6581 			entry = first_entry;
6582 			continue;
6583 		}
6584 
6585 		if (entry->is_sub_map) {
6586 			vm_map_offset_t sub_start;
6587 			vm_map_offset_t sub_end;
6588 			vm_map_offset_t local_start;
6589 			vm_map_offset_t local_end;
6590 			pmap_t          pmap;
6591 			vm_map_t        sub_map = VM_MAP_NULL;
6592 
6593 			if (wire_and_extract) {
6594 				/*
6595 				 * Wiring would result in copy-on-write
6596 				 * which would not be compatible with
6597 				 * the sharing we have with the original
6598 				 * provider of this memory.
6599 				 */
6600 				rc = KERN_INVALID_ARGUMENT;
6601 				goto done;
6602 			}
6603 
6604 			vm_map_clip_start(map, entry, s);
6605 			vm_map_clip_end(map, entry, end);
6606 
6607 			sub_start = VME_OFFSET(entry);
6608 			sub_end = entry->vme_end;
6609 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6610 
6611 			local_end = entry->vme_end;
6612 			if (map_pmap == NULL) {
6613 				vm_object_t             object;
6614 				vm_object_offset_t      offset;
6615 				vm_prot_t               prot;
6616 				boolean_t               wired;
6617 				vm_map_entry_t          local_entry;
6618 				vm_map_version_t         version;
6619 				vm_map_t                lookup_map;
6620 
6621 				if (entry->use_pmap) {
6622 					pmap = VME_SUBMAP(entry)->pmap;
6623 					/* ppc implementation requires that */
6624 					/* submap's pmap address ranges line */
6625 					/* up with the parent map */
6626 #ifdef notdef
6627 					pmap_addr = sub_start;
6628 #endif
6629 					pmap_addr = s;
6630 				} else {
6631 					pmap = map->pmap;
6632 					pmap_addr = s;
6633 				}
6634 
6635 				if (entry->wired_count) {
6636 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6637 						goto done;
6638 					}
6639 
6640 					/*
6641 					 * The map was not unlocked:
6642 					 * no need to goto re-lookup.
6643 					 * Just go directly to next entry.
6644 					 */
6645 					entry = entry->vme_next;
6646 					s = entry->vme_start;
6647 					continue;
6648 				}
6649 
6650 				/* call vm_map_lookup_and_lock_object to */
6651 				/* cause any needs_copy to be */
6652 				/* evaluated */
6653 				local_start = entry->vme_start;
6654 				lookup_map = map;
6655 				vm_map_lock_write_to_read(map);
6656 				rc = vm_map_lookup_and_lock_object(
6657 					&lookup_map, local_start,
6658 					(access_type | extra_prots),
6659 					OBJECT_LOCK_EXCLUSIVE,
6660 					&version, &object,
6661 					&offset, &prot, &wired,
6662 					NULL,
6663 					&real_map, NULL);
6664 				if (rc != KERN_SUCCESS) {
6665 					vm_map_unlock_read(lookup_map);
6666 					assert(map_pmap == NULL);
6667 					vm_map_unwire_nested(map, start,
6668 					    s, user_wire, PMAP_NULL, 0);
6669 					return rc;
6670 				}
6671 				vm_object_unlock(object);
6672 				if (real_map != lookup_map) {
6673 					vm_map_unlock(real_map);
6674 				}
6675 				vm_map_unlock_read(lookup_map);
6676 				vm_map_lock(map);
6677 
6678 				/* we unlocked, so must re-lookup */
6679 				if (!vm_map_lookup_entry(map,
6680 				    local_start,
6681 				    &local_entry)) {
6682 					rc = KERN_FAILURE;
6683 					goto done;
6684 				}
6685 
6686 				/*
6687 				 * entry could have been "simplified",
6688 				 * so re-clip
6689 				 */
6690 				entry = local_entry;
6691 				assert(s == local_start);
6692 				vm_map_clip_start(map, entry, s);
6693 				vm_map_clip_end(map, entry, end);
6694 				/* re-compute "e" */
6695 				e = entry->vme_end;
6696 				if (e > end) {
6697 					e = end;
6698 				}
6699 
6700 				/* did we have a change of type? */
6701 				if (!entry->is_sub_map) {
6702 					last_timestamp = map->timestamp;
6703 					continue;
6704 				}
6705 			} else {
6706 				local_start = entry->vme_start;
6707 				pmap = map_pmap;
6708 			}
6709 
6710 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6711 				goto done;
6712 			}
6713 
6714 			entry->in_transition = TRUE;
6715 
6716 			sub_map = VME_SUBMAP(entry);
6717 			vm_map_reference(sub_map);
6718 			vm_map_unlock(map);
6719 			rc = vm_map_wire_nested(sub_map,
6720 			    sub_start, sub_end,
6721 			    caller_prot, tag,
6722 			    user_wire, pmap, pmap_addr,
6723 			    NULL);
6724 			vm_map_deallocate(sub_map);
6725 			sub_map = VM_MAP_NULL;
6726 			vm_map_lock(map);
6727 
6728 			/*
6729 			 * Find the entry again.  It could have been clipped
6730 			 * after we unlocked the map.
6731 			 */
6732 			if (!vm_map_lookup_entry(map, local_start,
6733 			    &first_entry)) {
6734 				panic("vm_map_wire: re-lookup failed");
6735 			}
6736 			entry = first_entry;
6737 
6738 			assert(local_start == s);
6739 			/* re-compute "e" */
6740 			e = entry->vme_end;
6741 			if (e > end) {
6742 				e = end;
6743 			}
6744 
6745 			last_timestamp = map->timestamp;
6746 			while ((entry != vm_map_to_entry(map)) &&
6747 			    (entry->vme_start < e)) {
6748 				assert(entry->in_transition);
6749 				entry->in_transition = FALSE;
6750 				if (entry->needs_wakeup) {
6751 					entry->needs_wakeup = FALSE;
6752 					need_wakeup = TRUE;
6753 				}
6754 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6755 					subtract_wire_counts(map, entry, user_wire);
6756 				}
6757 				entry = entry->vme_next;
6758 			}
6759 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6760 				goto done;
6761 			}
6762 
6763 			/* no need to relookup again */
6764 			s = entry->vme_start;
6765 			continue;
6766 		}
6767 
6768 		/*
6769 		 * If this entry is already wired then increment
6770 		 * the appropriate wire reference count.
6771 		 */
6772 		if (entry->wired_count) {
6773 			if ((entry->protection & access_type) != access_type) {
6774 				/* found a protection problem */
6775 
6776 				/*
6777 				 * XXX FBDP
6778 				 * We should always return an error
6779 				 * in this case but since we didn't
6780 				 * enforce it before, let's do
6781 				 * it only for the new "wire_and_extract"
6782 				 * code path for now...
6783 				 */
6784 				if (wire_and_extract) {
6785 					rc = KERN_PROTECTION_FAILURE;
6786 					goto done;
6787 				}
6788 			}
6789 
6790 			/*
6791 			 * entry is already wired down, get our reference
6792 			 * after clipping to our range.
6793 			 */
6794 			vm_map_clip_start(map, entry, s);
6795 			vm_map_clip_end(map, entry, end);
6796 
6797 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6798 				goto done;
6799 			}
6800 
6801 			if (wire_and_extract) {
6802 				vm_object_t             object;
6803 				vm_object_offset_t      offset;
6804 				vm_page_t               m;
6805 
6806 				/*
6807 				 * We don't have to "wire" the page again
6808 				 * but we still have to "extract" its
6809 				 * physical page number, after some sanity
6810 				 * checks.
6811 				 */
6812 				assert((entry->vme_end - entry->vme_start)
6813 				    == PAGE_SIZE);
6814 				assert(!entry->needs_copy);
6815 				assert(!entry->is_sub_map);
6816 				assert(VME_OBJECT(entry));
6817 				if (((entry->vme_end - entry->vme_start)
6818 				    != PAGE_SIZE) ||
6819 				    entry->needs_copy ||
6820 				    entry->is_sub_map ||
6821 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6822 					rc = KERN_INVALID_ARGUMENT;
6823 					goto done;
6824 				}
6825 
6826 				object = VME_OBJECT(entry);
6827 				offset = VME_OFFSET(entry);
6828 				/* need exclusive lock to update m->dirty */
6829 				if (entry->protection & VM_PROT_WRITE) {
6830 					vm_object_lock(object);
6831 				} else {
6832 					vm_object_lock_shared(object);
6833 				}
6834 				m = vm_page_lookup(object, offset);
6835 				assert(m != VM_PAGE_NULL);
6836 				assert(VM_PAGE_WIRED(m));
6837 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6838 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6839 					if (entry->protection & VM_PROT_WRITE) {
6840 						vm_object_lock_assert_exclusive(
6841 							object);
6842 						m->vmp_dirty = TRUE;
6843 					}
6844 				} else {
6845 					/* not already wired !? */
6846 					*physpage_p = 0;
6847 				}
6848 				vm_object_unlock(object);
6849 			}
6850 
6851 			/* map was not unlocked: no need to relookup */
6852 			entry = entry->vme_next;
6853 			s = entry->vme_start;
6854 			continue;
6855 		}
6856 
6857 		/*
6858 		 * Unwired entry or wire request transmitted via submap
6859 		 */
6860 
6861 		/*
6862 		 * Wiring would copy the pages to the shadow object.
6863 		 * The shadow object would not be code-signed so
6864 		 * attempting to execute code from these copied pages
6865 		 * would trigger a code-signing violation.
6866 		 */
6867 
6868 		if ((entry->protection & VM_PROT_EXECUTE)
6869 #if XNU_TARGET_OS_OSX
6870 		    &&
6871 		    map->pmap != kernel_pmap &&
6872 		    (vm_map_cs_enforcement(map)
6873 #if __arm64__
6874 		    || !VM_MAP_IS_EXOTIC(map)
6875 #endif /* __arm64__ */
6876 		    )
6877 #endif /* XNU_TARGET_OS_OSX */
6878 #if CODE_SIGNING_MONITOR
6879 		    &&
6880 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6881 #endif
6882 		    ) {
6883 #if MACH_ASSERT
6884 			printf("pid %d[%s] wiring executable range from "
6885 			    "0x%llx to 0x%llx: rejected to preserve "
6886 			    "code-signing\n",
6887 			    proc_selfpid(),
6888 			    (get_bsdtask_info(current_task())
6889 			    ? proc_name_address(get_bsdtask_info(current_task()))
6890 			    : "?"),
6891 			    (uint64_t) entry->vme_start,
6892 			    (uint64_t) entry->vme_end);
6893 #endif /* MACH_ASSERT */
6894 			DTRACE_VM2(cs_executable_wire,
6895 			    uint64_t, (uint64_t)entry->vme_start,
6896 			    uint64_t, (uint64_t)entry->vme_end);
6897 			cs_executable_wire++;
6898 			rc = KERN_PROTECTION_FAILURE;
6899 			goto done;
6900 		}
6901 
6902 		/*
6903 		 * Perform actions of vm_map_lookup that need the write
6904 		 * lock on the map: create a shadow object for a
6905 		 * copy-on-write region, or an object for a zero-fill
6906 		 * region.
6907 		 */
6908 		size = entry->vme_end - entry->vme_start;
6909 		/*
6910 		 * If wiring a copy-on-write page, we need to copy it now
6911 		 * even if we're only (currently) requesting read access.
6912 		 * This is aggressive, but once it's wired we can't move it.
6913 		 */
6914 		if (entry->needs_copy) {
6915 			if (wire_and_extract) {
6916 				/*
6917 				 * We're supposed to share with the original
6918 				 * provider so should not be "needs_copy"
6919 				 */
6920 				rc = KERN_INVALID_ARGUMENT;
6921 				goto done;
6922 			}
6923 
6924 			VME_OBJECT_SHADOW(entry, size,
6925 			    vm_map_always_shadow(map));
6926 			entry->needs_copy = FALSE;
6927 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6928 			if (wire_and_extract) {
6929 				/*
6930 				 * We're supposed to share with the original
6931 				 * provider so should already have an object.
6932 				 */
6933 				rc = KERN_INVALID_ARGUMENT;
6934 				goto done;
6935 			}
6936 			VME_OBJECT_SET(entry, vm_object_allocate(size, map->serial_id), false, 0);
6937 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6938 			assert(entry->use_pmap);
6939 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6940 			if (wire_and_extract) {
6941 				/*
6942 				 * We're supposed to share with the original
6943 				 * provider so should not be COPY_SYMMETRIC.
6944 				 */
6945 				rc = KERN_INVALID_ARGUMENT;
6946 				goto done;
6947 			}
6948 			/*
6949 			 * Force an unrequested "copy-on-write" but only for
6950 			 * the range we're wiring.
6951 			 */
6952 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6953 			vm_map_clip_start(map, entry, s);
6954 			vm_map_clip_end(map, entry, end);
6955 			/* recompute "size" */
6956 			size = entry->vme_end - entry->vme_start;
6957 			/* make a shadow object */
6958 			vm_object_t orig_object;
6959 			vm_object_offset_t orig_offset;
6960 			orig_object = VME_OBJECT(entry);
6961 			orig_offset = VME_OFFSET(entry);
6962 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6963 			if (VME_OBJECT(entry) != orig_object) {
6964 				/*
6965 				 * This mapping has not been shared (or it would be
6966 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6967 				 * not been copied-on-write (or it would be marked
6968 				 * as "needs_copy" and would have been handled above
6969 				 * and also already write-protected).
6970 				 * We still need to write-protect here to prevent
6971 				 * other threads from modifying these pages while
6972 				 * we're in the process of copying and wiring
6973 				 * the copied pages.
6974 				 * Since the mapping is neither shared nor COWed,
6975 				 * we only need to write-protect the PTEs for this
6976 				 * mapping.
6977 				 */
6978 				vm_object_pmap_protect(orig_object,
6979 				    orig_offset,
6980 				    size,
6981 				    map->pmap,
6982 				    VM_MAP_PAGE_SIZE(map),
6983 				    entry->vme_start,
6984 				    entry->protection & ~VM_PROT_WRITE);
6985 			}
6986 		}
6987 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6988 			/*
6989 			 * Make the object COPY_DELAY to get a stable object
6990 			 * to wire.
6991 			 * That should avoid creating long shadow chains while
6992 			 * wiring/unwiring the same range repeatedly.
6993 			 * That also prevents part of the object from being
6994 			 * wired while another part is "needs_copy", which
6995 			 * could result in conflicting rules wrt copy-on-write.
6996 			 */
6997 			vm_object_t object;
6998 
6999 			object = VME_OBJECT(entry);
7000 			vm_object_lock(object);
7001 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7002 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7003 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7004 				    object, (uint64_t)object->vo_size,
7005 				    entry,
7006 				    (uint64_t)entry->vme_start,
7007 				    (uint64_t)entry->vme_end,
7008 				    (uint64_t)VME_OFFSET(entry),
7009 				    (uint64_t)size);
7010 				assertf(os_ref_get_count_raw(&object->ref_count) == 1,
7011 				    "object %p ref_count %d\n",
7012 				    object, os_ref_get_count_raw(&object->ref_count));
7013 				assertf(!entry->needs_copy,
7014 				    "entry %p\n", entry);
7015 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7016 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
7017 			}
7018 			vm_object_unlock(object);
7019 		}
7020 
7021 		vm_map_clip_start(map, entry, s);
7022 		vm_map_clip_end(map, entry, end);
7023 
7024 		/* re-compute "e" */
7025 		e = entry->vme_end;
7026 		if (e > end) {
7027 			e = end;
7028 		}
7029 
7030 		/*
7031 		 * Check for holes and protection mismatch.
7032 		 * Holes: Next entry should be contiguous unless this
7033 		 *	  is the end of the region.
7034 		 * Protection: Access requested must be allowed, unless
7035 		 *	wiring is by protection class
7036 		 */
7037 		if ((entry->vme_end < end) &&
7038 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7039 		    (entry->vme_next->vme_start > entry->vme_end))) {
7040 			/* found a hole */
7041 			rc = KERN_INVALID_ADDRESS;
7042 			goto done;
7043 		}
7044 		if ((entry->protection & access_type) != access_type) {
7045 			/* found a protection problem */
7046 			rc = KERN_PROTECTION_FAILURE;
7047 			goto done;
7048 		}
7049 
7050 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7051 
7052 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7053 			goto done;
7054 		}
7055 
7056 		entry->in_transition = TRUE;
7057 
7058 		/*
7059 		 * This entry might get split once we unlock the map.
7060 		 * In vm_fault_wire(), we need the current range as
7061 		 * defined by this entry.  In order for this to work
7062 		 * along with a simultaneous clip operation, we make a
7063 		 * temporary copy of this entry and use that for the
7064 		 * wiring.  Note that the underlying objects do not
7065 		 * change during a clip.
7066 		 */
7067 		tmp_entry = *entry;
7068 
7069 		/*
7070 		 * The in_transition state guarantees that the entry
7071 		 * (or entries for this range, if a split occurred) will be
7072 		 * there when the map lock is acquired for the second time.
7073 		 */
7074 		vm_map_unlock(map);
7075 
7076 		if (!user_wire && cur_thread != THREAD_NULL) {
7077 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
7078 		} else {
7079 			interruptible_state = THREAD_UNINT;
7080 		}
7081 
7082 		if (map_pmap) {
7083 			rc = vm_fault_wire(map,
7084 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7085 			    physpage_p);
7086 		} else {
7087 			rc = vm_fault_wire(map,
7088 			    &tmp_entry, caller_prot, tag, map->pmap,
7089 			    tmp_entry.vme_start,
7090 			    physpage_p);
7091 		}
7092 
7093 		if (!user_wire && cur_thread != THREAD_NULL) {
7094 			thread_interrupt_level(interruptible_state);
7095 		}
7096 
7097 		vm_map_lock(map);
7098 
7099 		if (last_timestamp + 1 != map->timestamp) {
7100 			/*
7101 			 * Find the entry again.  It could have been clipped
7102 			 * after we unlocked the map.
7103 			 */
7104 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7105 			    &first_entry)) {
7106 				panic("vm_map_wire: re-lookup failed");
7107 			}
7108 
7109 			entry = first_entry;
7110 		}
7111 
7112 		last_timestamp = map->timestamp;
7113 
7114 		while ((entry != vm_map_to_entry(map)) &&
7115 		    (entry->vme_start < tmp_entry.vme_end)) {
7116 			assert(entry->in_transition);
7117 			entry->in_transition = FALSE;
7118 			if (entry->needs_wakeup) {
7119 				entry->needs_wakeup = FALSE;
7120 				need_wakeup = TRUE;
7121 			}
7122 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7123 				subtract_wire_counts(map, entry, user_wire);
7124 			}
7125 			entry = entry->vme_next;
7126 		}
7127 
7128 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7129 			goto done;
7130 		}
7131 
7132 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7133 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7134 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7135 			/* found a "new" hole */
7136 			s = tmp_entry.vme_end;
7137 			rc = KERN_INVALID_ADDRESS;
7138 			goto done;
7139 		}
7140 
7141 		s = entry->vme_start;
7142 	} /* end while loop through map entries */
7143 
7144 done:
7145 	if (rc == KERN_SUCCESS) {
7146 		/* repair any damage we may have made to the VM map */
7147 		vm_map_simplify_range(map, start, end);
7148 	}
7149 
7150 	vm_map_unlock(map);
7151 
7152 	/*
7153 	 * wake up anybody waiting on entries we wired.
7154 	 */
7155 	if (need_wakeup) {
7156 		vm_map_entry_wakeup(map);
7157 	}
7158 
7159 	if (rc != KERN_SUCCESS) {
7160 		/* undo what has been wired so far */
7161 		vm_map_unwire_nested(map, start, s, user_wire,
7162 		    map_pmap, pmap_addr);
7163 		if (physpage_p) {
7164 			*physpage_p = 0;
7165 		}
7166 	}
7167 
7168 	return rc;
7169 }
7170 
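/*
 * Editor's note (not part of the original source): the wiring code above
 * repeatedly uses the same "mark in_transition, snapshot the entry, drop the
 * map lock, then re-validate by timestamp" protocol.  The fragment below is
 * a minimal distillation of that pattern for reference; the variable names
 * match the locals of vm_map_wire_nested() and it is not meant to compile
 * on its own.
 */
#if 0	/* illustrative fragment only */
	entry->in_transition = TRUE;	/* entry survives even if clipped */
	tmp_entry = *entry;		/* private snapshot of the range to wire */
	vm_map_unlock(map);

	/* ... fault/wire the pages without holding the map lock ... */

	vm_map_lock(map);
	if (last_timestamp + 1 != map->timestamp) {
		/* someone else modified the map: find our range again */
		if (!vm_map_lookup_entry(map, tmp_entry.vme_start, &first_entry)) {
			panic("vm_map_wire: re-lookup failed");
		}
		entry = first_entry;
	}
	last_timestamp = map->timestamp;
#endif
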
7171 static __attribute__((always_inline, warn_unused_result))
7172 kern_return_t
7173 vm_map_wire_sanitize(
7174 	vm_map_t                map,
7175 	vm_map_offset_ut        start_u,
7176 	vm_map_offset_ut        end_u,
7177 	vm_prot_ut              prot_u,
7178 	vm_sanitize_caller_t    vm_sanitize_caller,
7179 	vm_map_offset_t        *start,
7180 	vm_map_offset_t        *end,
7181 	vm_map_size_t          *size,
7182 	vm_prot_t              *prot)
7183 {
7184 	kern_return_t   kr;
7185 
7186 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7187 	    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7188 	    size);
7189 	if (__improbable(kr != KERN_SUCCESS)) {
7190 		return kr;
7191 	}
7192 
7193 	kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7194 	if (__improbable(kr != KERN_SUCCESS)) {
7195 		return kr;
7196 	}
7197 
7198 	return KERN_SUCCESS;
7199 }
7200 
7201 /*
7202  * Validation function for vm_map_wire_nested().
7203  */
7204 kern_return_t
7205 vm_map_wire_impl(
7206 	vm_map_t                map,
7207 	vm_map_offset_ut        start_u,
7208 	vm_map_offset_ut        end_u,
7209 	vm_prot_ut              prot_u,
7210 	vm_tag_t                tag,
7211 	boolean_t               user_wire,
7212 	ppnum_t                *physpage_p,
7213 	vm_sanitize_caller_t    vm_sanitize_caller)
7214 {
7215 	vm_map_offset_t start, end;
7216 	vm_map_size_t   size;
7217 	vm_prot_t       prot;
7218 	kern_return_t   kr;
7219 
7220 	/*
7221 	 * Sanitize any input parameters that are addr/size/prot/inherit
7222 	 */
7223 	kr = vm_map_wire_sanitize(map,
7224 	    start_u,
7225 	    end_u,
7226 	    prot_u,
7227 	    vm_sanitize_caller,
7228 	    &start,
7229 	    &end,
7230 	    &size,
7231 	    &prot);
7232 	if (__improbable(kr != KERN_SUCCESS)) {
7233 		if (physpage_p) {
7234 			*physpage_p = 0;
7235 		}
7236 		return vm_sanitize_get_kr(kr);
7237 	}
7238 
7239 	return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7240 	           PMAP_NULL, 0, physpage_p);
7241 }
7242 
7243 kern_return_t
7244 vm_map_wire_external(
7245 	vm_map_t                map,
7246 	vm_map_offset_ut        start_u,
7247 	vm_map_offset_ut        end_u,
7248 	vm_prot_ut              prot_u,
7249 	boolean_t               user_wire)
7250 {
7251 	vm_tag_t tag = vm_tag_bt();
7252 
7253 	return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7254 }
7255 
7256 kern_return_t
7257 vm_map_wire_kernel(
7258 	vm_map_t                map,
7259 	vm_map_offset_ut        start_u,
7260 	vm_map_offset_ut        end_u,
7261 	vm_prot_ut              prot_u,
7262 	vm_tag_t                tag,
7263 	boolean_t               user_wire)
7264 {
7265 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7266 	           user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7267 }
7268 
7269 #if XNU_PLATFORM_MacOSX
7270 
7271 kern_return_t
7272 vm_map_wire_and_extract(
7273 	vm_map_t                map,
7274 	vm_map_offset_ut        start_u,
7275 	vm_prot_ut              prot_u,
7276 	boolean_t               user_wire,
7277 	ppnum_t                *physpage_p)
7278 {
7279 	vm_tag_t         tag    = vm_tag_bt();
7280 	vm_map_size_ut   size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7281 	vm_map_offset_ut end_u  = vm_sanitize_compute_ut_end(start_u, size_u);
7282 
7283 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7284 	           user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7285 }
7286 
7287 #endif /* XNU_PLATFORM_MacOSX */
7288 
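/*
 * Editor's sketch (not part of the original source): how a kernel caller
 * holding plain, already-validated values might package them into the
 * "unsafe" wrapped types that vm_map_wire_kernel() expects.
 * vm_sanitize_wrap_size() and vm_sanitize_compute_ut_end() appear in
 * vm_map_wire_and_extract() above; vm_sanitize_wrap_addr() and
 * vm_sanitize_wrap_prot() are assumed to be the matching wrappers and may
 * differ in the actual vm_sanitize interface.  The tag is illustrative.
 */
#if 0	/* illustrative only */
static kern_return_t
example_wire_kernel_range(
	vm_map_t        map,
	vm_map_offset_t addr,
	vm_map_size_t   size)
{
	vm_map_offset_ut start_u = vm_sanitize_wrap_addr(addr);	/* assumed wrapper */
	vm_map_size_ut   size_u  = vm_sanitize_wrap_size(size);
	vm_map_offset_ut end_u   = vm_sanitize_compute_ut_end(start_u, size_u);
	vm_prot_ut       prot_u  = vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE);	/* assumed wrapper */

	/* kernel wiring: counts go on wired_count, not user_wired_count */
	return vm_map_wire_kernel(map, start_u, end_u, prot_u,
	           VM_KERN_MEMORY_OSFMK, FALSE);
}
#endif
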
7289 static kern_return_t
7290 vm_map_unwire_nested(
7291 	vm_map_t                map,
7292 	vm_map_offset_t         start,
7293 	vm_map_offset_t         end,
7294 	boolean_t               user_wire,
7295 	pmap_t                  map_pmap,
7296 	vm_map_offset_t         pmap_addr)
7297 {
7298 	vm_map_entry_t          entry;
7299 	struct vm_map_entry     *first_entry, tmp_entry;
7300 	boolean_t               need_wakeup;
7301 	boolean_t               main_map = FALSE;
7302 	unsigned int            last_timestamp;
7303 
7304 	VM_MAP_RANGE_CHECK(map, start, end);
7305 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7306 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7307 
7308 	if (start == end) {
7309 		/* We unwired what the caller asked for: zero pages */
7310 		return KERN_SUCCESS;
7311 	}
7312 
7313 	vm_map_lock(map);
7314 	if (map_pmap == NULL) {
7315 		main_map = TRUE;
7316 	}
7317 	last_timestamp = map->timestamp;
7318 
7319 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7320 		entry = first_entry;
7321 		/*
7322 		 * vm_map_clip_start will be done later.
7323 		 * We don't want to unnest any nested submaps here!
7324 		 */
7325 	} else {
7326 		if (!user_wire) {
7327 			panic("vm_map_unwire: start not found");
7328 		}
7329 		/*	Start address is not in map. */
7330 		vm_map_unlock(map);
7331 		return KERN_INVALID_ADDRESS;
7332 	}
7333 
7334 	if (entry->superpage_size) {
7335 		/* superpages are always wired */
7336 		vm_map_unlock(map);
7337 		return KERN_INVALID_ADDRESS;
7338 	}
7339 
7340 	need_wakeup = FALSE;
7341 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7342 		if (entry->in_transition) {
7343 			/*
7344 			 * 1)
7345 			 * Another thread is wiring down this entry. Note
7346 			 * that if it is not for the other thread we would
7347 			 * be unwiring an unwired entry.  This is not
7348 			 * permitted.  If we wait, we will be unwiring memory
7349 			 * we did not wire.
7350 			 *
7351 			 * 2)
7352 			 * Another thread is unwiring this entry.  We did not
7353 			 * have a reference to it, because if we did, this
7354 			 * entry will not be getting unwired now.
7355 			 */
7356 			if (!user_wire) {
7357 				/*
7358 				 * XXX FBDP
7359 				 * This could happen:  there could be some
7360 				 * overlapping vslock/vsunlock operations
7361 				 * going on.
7362 				 * We should probably just wait and retry,
7363 				 * but then we have to be careful that this
7364 				 * entry could get "simplified" after
7365 				 * "in_transition" gets unset and before
7366 				 * we re-lookup the entry, so we would
7367 				 * have to re-clip the entry to avoid
7368 				 * re-unwiring what we have already unwired...
7369 				 * See vm_map_wire_nested().
7370 				 *
7371 				 * Or we could just ignore "in_transition"
7372 				 * here and proceed to decrement the wired
7373 				 * count(s) on this entry.  That should be fine
7374 				 * as long as "wired_count" doesn't drop all
7375 				 * the way to 0 (and we should panic if THAT
7376 				 * happens).
7377 				 */
7378 				panic("vm_map_unwire: in_transition entry");
7379 			}
7380 
7381 			entry = entry->vme_next;
7382 			continue;
7383 		}
7384 
7385 		if (entry->is_sub_map) {
7386 			vm_map_offset_t sub_start;
7387 			vm_map_offset_t sub_end;
7388 			vm_map_offset_t local_end;
7389 			pmap_t          pmap;
7390 			vm_map_t        sub_map = VM_MAP_NULL;
7391 
7392 			vm_map_clip_start(map, entry, start);
7393 			vm_map_clip_end(map, entry, end);
7394 
7395 			sub_start = VME_OFFSET(entry);
7396 			sub_end = entry->vme_end - entry->vme_start;
7397 			sub_end += VME_OFFSET(entry);
7398 			local_end = entry->vme_end;
7399 			if (map_pmap == NULL) {
7400 				if (entry->use_pmap) {
7401 					pmap = VME_SUBMAP(entry)->pmap;
7402 					pmap_addr = sub_start;
7403 				} else {
7404 					pmap = map->pmap;
7405 					pmap_addr = start;
7406 				}
7407 				if (entry->wired_count == 0 ||
7408 				    (user_wire && entry->user_wired_count == 0)) {
7409 					if (!user_wire) {
7410 						panic("vm_map_unwire: entry is unwired");
7411 					}
7412 					entry = entry->vme_next;
7413 					continue;
7414 				}
7415 
7416 				/*
7417 				 * Check for holes
7418 				 * Holes: Next entry should be contiguous unless
7419 				 * this is the end of the region.
7420 				 */
7421 				if (((entry->vme_end < end) &&
7422 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7423 				    (entry->vme_next->vme_start
7424 				    > entry->vme_end)))) {
7425 					if (!user_wire) {
7426 						panic("vm_map_unwire: non-contiguous region");
7427 					}
7428 /*
7429  *                                       entry = entry->vme_next;
7430  *                                       continue;
7431  */
7432 				}
7433 
7434 				subtract_wire_counts(map, entry, user_wire);
7435 
7436 				if (entry->wired_count != 0) {
7437 					entry = entry->vme_next;
7438 					continue;
7439 				}
7440 
7441 				entry->in_transition = TRUE;
7442 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7443 
7444 				/*
7445 				 * We can unlock the map now. The in_transition state
7446 				 * guarantees existence of the entry.
7447 				 */
7448 				sub_map = VME_SUBMAP(entry);
7449 				vm_map_reference(sub_map);
7450 				vm_map_unlock(map);
7451 				vm_map_unwire_nested(sub_map,
7452 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7453 				vm_map_deallocate(sub_map);
7454 				sub_map = VM_MAP_NULL;
7455 				vm_map_lock(map);
7456 
7457 				if (last_timestamp + 1 != map->timestamp) {
7458 					/*
7459 					 * Find the entry again.  It could have been
7460 					 * clipped or deleted after we unlocked the map.
7461 					 */
7462 					if (!vm_map_lookup_entry(map,
7463 					    tmp_entry.vme_start,
7464 					    &first_entry)) {
7465 						if (!user_wire) {
7466 							panic("vm_map_unwire: re-lookup failed");
7467 						}
7468 						entry = first_entry->vme_next;
7469 					} else {
7470 						entry = first_entry;
7471 					}
7472 				}
7473 				last_timestamp = map->timestamp;
7474 
7475 				/*
7476 				 * clear transition bit for all constituent entries
7477 				 * that were in the original entry (saved in
7478 				 * tmp_entry).  Also check for waiters.
7479 				 */
7480 				while ((entry != vm_map_to_entry(map)) &&
7481 				    (entry->vme_start < tmp_entry.vme_end)) {
7482 					assert(entry->in_transition);
7483 					entry->in_transition = FALSE;
7484 					if (entry->needs_wakeup) {
7485 						entry->needs_wakeup = FALSE;
7486 						need_wakeup = TRUE;
7487 					}
7488 					entry = entry->vme_next;
7489 				}
7490 				continue;
7491 			} else {
7492 				tmp_entry = *entry;
7493 				sub_map = VME_SUBMAP(entry);
7494 				vm_map_reference(sub_map);
7495 				vm_map_unlock(map);
7496 				vm_map_unwire_nested(sub_map,
7497 				    sub_start, sub_end, user_wire, map_pmap,
7498 				    pmap_addr);
7499 				vm_map_deallocate(sub_map);
7500 				sub_map = VM_MAP_NULL;
7501 				vm_map_lock(map);
7502 
7503 				if (last_timestamp + 1 != map->timestamp) {
7504 					/*
7505 					 * Find the entry again.  It could have been
7506 					 * clipped or deleted after we unlocked the map.
7507 					 */
7508 					if (!vm_map_lookup_entry(map,
7509 					    tmp_entry.vme_start,
7510 					    &first_entry)) {
7511 						if (!user_wire) {
7512 							panic("vm_map_unwire: re-lookup failed");
7513 						}
7514 						entry = first_entry->vme_next;
7515 					} else {
7516 						entry = first_entry;
7517 					}
7518 				}
7519 				last_timestamp = map->timestamp;
7520 			}
7521 		}
7522 
7523 
7524 		if ((entry->wired_count == 0) ||
7525 		    (user_wire && entry->user_wired_count == 0)) {
7526 			if (!user_wire) {
7527 				panic("vm_map_unwire: entry is unwired");
7528 			}
7529 
7530 			entry = entry->vme_next;
7531 			continue;
7532 		}
7533 
7534 		assert(entry->wired_count > 0 &&
7535 		    (!user_wire || entry->user_wired_count > 0));
7536 
7537 		vm_map_clip_start(map, entry, start);
7538 		vm_map_clip_end(map, entry, end);
7539 
7540 		/*
7541 		 * Check for holes
7542 		 * Holes: Next entry should be contiguous unless
7543 		 *	  this is the end of the region.
7544 		 */
7545 		if (((entry->vme_end < end) &&
7546 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7547 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7548 			if (!user_wire) {
7549 				panic("vm_map_unwire: non-contiguous region");
7550 			}
7551 			/*
7552 			 * entry = entry->vme_next;
7553 			 * continue;
7554 			 */
7555 		}
7556 
7557 		subtract_wire_counts(map, entry, user_wire);
7558 
7559 		if (entry->wired_count != 0) {
7560 			entry = entry->vme_next;
7561 			continue;
7562 		}
7563 
7564 		if (entry->zero_wired_pages) {
7565 			entry->zero_wired_pages = FALSE;
7566 		}
7567 
7568 		entry->in_transition = TRUE;
7569 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7570 
7571 		/*
7572 		 * We can unlock the map now. The in_transition state
7573 		 * guarantees existence of the entry.
7574 		 */
7575 		vm_map_unlock(map);
7576 		if (map_pmap) {
7577 			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7578 			    pmap_addr, tmp_entry.vme_end);
7579 		} else {
7580 			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7581 			    tmp_entry.vme_start, tmp_entry.vme_end);
7582 		}
7583 		vm_map_lock(map);
7584 
7585 		if (last_timestamp + 1 != map->timestamp) {
7586 			/*
7587 			 * Find the entry again.  It could have been clipped
7588 			 * or deleted after we unlocked the map.
7589 			 */
7590 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7591 			    &first_entry)) {
7592 				if (!user_wire) {
7593 					panic("vm_map_unwire: re-lookup failed");
7594 				}
7595 				entry = first_entry->vme_next;
7596 			} else {
7597 				entry = first_entry;
7598 			}
7599 		}
7600 		last_timestamp = map->timestamp;
7601 
7602 		/*
7603 		 * clear transition bit for all constituent entries that
7604 		 * were in the original entry (saved in tmp_entry).  Also
7605 		 * check for waiters.
7606 		 */
7607 		while ((entry != vm_map_to_entry(map)) &&
7608 		    (entry->vme_start < tmp_entry.vme_end)) {
7609 			assert(entry->in_transition);
7610 			entry->in_transition = FALSE;
7611 			if (entry->needs_wakeup) {
7612 				entry->needs_wakeup = FALSE;
7613 				need_wakeup = TRUE;
7614 			}
7615 			entry = entry->vme_next;
7616 		}
7617 	}
7618 
7619 	/*
7620 	 * We might have fragmented the address space when we wired this
7621 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7622 	 * with their neighbors now that they're no longer wired.
7623 	 * Under some circumstances, address space fragmentation can
7624 	 * prevent VM object shadow chain collapsing, which can cause
7625 	 * swap space leaks.
7626 	 */
7627 	vm_map_simplify_range(map, start, end);
7628 
7629 	vm_map_unlock(map);
7630 	/*
7631 	 * wake up anybody waiting on entries that we have unwired.
7632 	 */
7633 	if (need_wakeup) {
7634 		vm_map_entry_wakeup(map);
7635 	}
7636 	return KERN_SUCCESS;
7637 }
7638 
7639 kern_return_t
7640 vm_map_unwire(
7641 	vm_map_t                map,
7642 	vm_map_offset_ut        start_u,
7643 	vm_map_offset_ut        end_u,
7644 	boolean_t               user_wire)
7645 {
7646 	return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7647 	           VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7648 }
7649 
7650 static __attribute__((always_inline, warn_unused_result))
7651 kern_return_t
7652 vm_map_unwire_sanitize(
7653 	vm_map_t                map,
7654 	vm_map_offset_ut        start_u,
7655 	vm_map_offset_ut        end_u,
7656 	vm_sanitize_caller_t    vm_sanitize_caller,
7657 	vm_map_offset_t        *start,
7658 	vm_map_offset_t        *end,
7659 	vm_map_size_t          *size)
7660 {
7661 	return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7662 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7663 	           size);
7664 }
7665 
7666 kern_return_t
7667 vm_map_unwire_impl(
7668 	vm_map_t                map,
7669 	vm_map_offset_ut        start_u,
7670 	vm_map_offset_ut        end_u,
7671 	boolean_t               user_wire,
7672 	vm_sanitize_caller_t    vm_sanitize_caller)
7673 {
7674 	vm_map_offset_t start, end;
7675 	vm_map_size_t   size;
7676 	kern_return_t   kr;
7677 
7678 	/*
7679 	 * Sanitize any input parameters that are addr/size/prot/inherit
7680 	 */
7681 	kr = vm_map_unwire_sanitize(
7682 		map,
7683 		start_u,
7684 		end_u,
7685 		vm_sanitize_caller,
7686 		&start,
7687 		&end,
7688 		&size);
7689 	if (__improbable(kr != KERN_SUCCESS)) {
7690 		return vm_sanitize_get_kr(kr);
7691 	}
7692 
7693 	return vm_map_unwire_nested(map, start, end,
7694 	           user_wire, (pmap_t)NULL, 0);
7695 }
7696 
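/*
 * Editor's sketch (not part of the original source): a wire/unwire pairing
 * through the nested entry points defined above, as used by in-file callers
 * such as vm_map_wire_impl() and vm_map_unwire_impl().  The address range
 * and its provenance are hypothetical; the calls below follow the
 * signatures visible in this file.
 */
#if 0	/* illustrative only */
static void
example_wire_then_unwire(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	kern_return_t kr;

	kr = vm_map_wire_nested(map, start, end,
	    VM_PROT_READ | VM_PROT_WRITE,	/* caller_prot */
	    vm_tag_bt(),			/* attribute the wired memory */
	    FALSE,				/* user_wire: kernel wiring */
	    PMAP_NULL, 0,			/* wire in map->pmap at the mapped address */
	    NULL);				/* no physical page extraction */
	if (kr != KERN_SUCCESS) {
		return;
	}

	/* ... access the wired range without taking faults ... */

	/* every successful wire must be balanced by an unwire of the same range */
	(void)vm_map_unwire_nested(map, start, end,
	    FALSE,				/* user_wire */
	    PMAP_NULL, 0);
}
#endif
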
7697 
7698 /*
7699  *	vm_map_entry_zap:	[ internal use only ]
7700  *
7701  *	Remove the entry from the target map
7702  *	and put it on a zap list.
7703  */
7704 static void
7705 vm_map_entry_zap(
7706 	vm_map_t                map,
7707 	vm_map_entry_t          entry,
7708 	vm_map_zap_t            zap)
7709 {
7710 	vm_map_offset_t s, e;
7711 
7712 	s = entry->vme_start;
7713 	e = entry->vme_end;
7714 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7715 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7716 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7717 		assert(page_aligned(s));
7718 		assert(page_aligned(e));
7719 	}
7720 	if (entry->map_aligned == TRUE) {
7721 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7722 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7723 	}
7724 	assert(entry->wired_count == 0);
7725 	assert(entry->user_wired_count == 0);
7726 	assert(!entry->vme_permanent);
7727 
7728 	vm_map_store_entry_unlink(map, entry, false);
7729 	map->size -= e - s;
7730 
7731 	vm_map_zap_append(zap, entry);
7732 }
7733 
7734 static void
7735 vm_map_submap_pmap_clean(
7736 	vm_map_t        map,
7737 	vm_map_offset_t start,
7738 	vm_map_offset_t end,
7739 	vm_map_t        sub_map,
7740 	vm_map_offset_t offset)
7741 {
7742 	vm_map_offset_t submap_start;
7743 	vm_map_offset_t submap_end;
7744 	vm_map_size_t   remove_size;
7745 	vm_map_entry_t  entry;
7746 
7747 	submap_end = offset + (end - start);
7748 	submap_start = offset;
7749 
7750 	vm_map_lock_read(sub_map);
7751 	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7752 		remove_size = (entry->vme_end - entry->vme_start);
7753 		if (offset > entry->vme_start) {
7754 			remove_size -= offset - entry->vme_start;
7755 		}
7756 
7757 
7758 		if (submap_end < entry->vme_end) {
7759 			remove_size -=
7760 			    entry->vme_end - submap_end;
7761 		}
7762 		if (entry->is_sub_map) {
7763 			vm_map_submap_pmap_clean(
7764 				sub_map,
7765 				start,
7766 				start + remove_size,
7767 				VME_SUBMAP(entry),
7768 				VME_OFFSET(entry));
7769 		} else {
7770 			if (map->mapped_in_other_pmaps &&
7771 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7772 			    VME_OBJECT(entry) != NULL) {
7773 				vm_object_pmap_protect_options(
7774 					VME_OBJECT(entry),
7775 					(VME_OFFSET(entry) +
7776 					offset -
7777 					entry->vme_start),
7778 					remove_size,
7779 					PMAP_NULL,
7780 					PAGE_SIZE,
7781 					entry->vme_start,
7782 					VM_PROT_NONE,
7783 					PMAP_OPTIONS_REMOVE);
7784 			} else {
7785 				pmap_remove(map->pmap,
7786 				    (addr64_t)start,
7787 				    (addr64_t)(start + remove_size));
7788 			}
7789 		}
7790 	}
7791 
7792 	entry = entry->vme_next;
7793 
7794 	while ((entry != vm_map_to_entry(sub_map))
7795 	    && (entry->vme_start < submap_end)) {
7796 		remove_size = (entry->vme_end - entry->vme_start);
7797 		if (submap_end < entry->vme_end) {
7798 			remove_size -= entry->vme_end - submap_end;
7799 		}
7800 		if (entry->is_sub_map) {
7801 			vm_map_submap_pmap_clean(
7802 				sub_map,
7803 				(start + entry->vme_start) - offset,
7804 				((start + entry->vme_start) - offset) + remove_size,
7805 				VME_SUBMAP(entry),
7806 				VME_OFFSET(entry));
7807 		} else {
7808 			if (map->mapped_in_other_pmaps &&
7809 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7810 			    VME_OBJECT(entry) != NULL) {
7811 				vm_object_pmap_protect_options(
7812 					VME_OBJECT(entry),
7813 					VME_OFFSET(entry),
7814 					remove_size,
7815 					PMAP_NULL,
7816 					PAGE_SIZE,
7817 					entry->vme_start,
7818 					VM_PROT_NONE,
7819 					PMAP_OPTIONS_REMOVE);
7820 			} else {
7821 				pmap_remove(map->pmap,
7822 				    (addr64_t)((start + entry->vme_start)
7823 				    - offset),
7824 				    (addr64_t)(((start + entry->vme_start)
7825 				    - offset) + remove_size));
7826 			}
7827 		}
7828 		entry = entry->vme_next;
7829 	}
7830 	vm_map_unlock_read(sub_map);
7831 	return;
7832 }
7833 
7834 /*
7835  *     virt_memory_guard_ast:
7836  *
7837  *     Handle the AST callout for a virtual memory guard.
7838  *     Raise an EXC_GUARD exception and terminate the task
7839  *     if configured to do so.
7840  */
7841 void
7842 virt_memory_guard_ast(
7843 	thread_t thread,
7844 	mach_exception_data_type_t code,
7845 	mach_exception_data_type_t subcode)
7846 {
7847 	task_t task = get_threadtask(thread);
7848 	assert(task != kernel_task);
7849 	assert(task == current_task());
7850 	kern_return_t sync_exception_result;
7851 	uint32_t behavior;
7852 
7853 	behavior = task->task_exc_guard;
7854 
7855 
7856 	/* Is delivery enabled */
7857 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7858 		return;
7859 	}
7860 
7861 	/* If only once, make sure we're that once */
7862 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7863 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7864 
7865 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7866 			break;
7867 		}
7868 		behavior = task->task_exc_guard;
7869 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7870 			return;
7871 		}
7872 	}
7873 
7874 	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7875 	/* Raise exception synchronously and see if handler claimed it */
7876 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7877 
7878 	if (fatal) {
7879 		/*
7880 		 * If Synchronous EXC_GUARD delivery was successful then
7881 		 * kill the process and return, else kill the process
7882 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
7883 		 */
7884 
7885 
7886 		int flags = PX_DEBUG_NO_HONOR;
7887 		exception_info_t info = {
7888 			.os_reason = OS_REASON_GUARD,
7889 			.exception_type = EXC_GUARD,
7890 			.mx_code = code,
7891 			.mx_subcode = subcode
7892 		};
7893 
7894 		if (sync_exception_result == KERN_SUCCESS) {
7895 			flags |= PX_PSIGNAL;
7896 		}
7897 		exit_with_mach_exception(current_proc(), info, flags);
7898 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7899 		/*
7900 		 * If the synchronous EXC_GUARD delivery was not successful,
7901 		 * raise a simulated crash.
7902 		 */
7903 		if (sync_exception_result != KERN_SUCCESS) {
7904 			task_violated_guard(code, subcode, NULL, FALSE);
7905 		}
7906 	}
7907 }
7908 
7909 /*
7910  * Validate policy for VM guard exceptions and encode the correct Mach exception
7911  * code and subcode if the policy allows delivering a guard exception here.
7912  */
7913 static bool
7914 vm_map_guard_exception_internal(
7915 	vm_map_offset_t            address,
7916 	unsigned                   reason,
7917 	mach_exception_code_t      *code,
7918 	mach_exception_data_type_t *subcode)
7919 {
7920 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7921 	unsigned int target = 0; /* should we pass in pid associated with map? */
7922 
7923 	task_t task = current_task_early();
7924 
7925 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7926 	if (task == NULL || task == kernel_task) {
7927 		return false;
7928 	}
7929 
7930 
7931 	*code = 0;
7932 	EXC_GUARD_ENCODE_TYPE(*code, guard_type);
7933 	EXC_GUARD_ENCODE_FLAVOR(*code, reason);
7934 	EXC_GUARD_ENCODE_TARGET(*code, target);
7935 	*subcode = (uint64_t)address;
7936 
7937 	return true;
7938 }
7939 
7940 /*
7941  *     vm_map_guard_exception:
7942  *
7943  *     Generate a GUARD_TYPE_VIRT_MEMORY EXC_GUARD exception.
7944  *
7945  *         `reason` is kGUARD_EXC_DEALLOC_GAP when we find nothing mapped,
7946  *     or if there is a gap in the mapping when a user address space
7947  *     was requested. We report the address of the first gap found.
7948  */
7949 
7950 void
7951 vm_map_guard_exception(
7952 	vm_map_offset_t            address,
7953 	unsigned                   reason)
7954 {
7955 	mach_exception_code_t code;
7956 	mach_exception_data_type_t subcode;
7957 	if (vm_map_guard_exception_internal(address, reason, &code, &subcode)) {
7958 		task_t task = current_task();
7959 		bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7960 
7961 		thread_guard_violation(current_thread(), code, subcode, fatal);
7962 	}
7963 }
7964 
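/*
 * Editor's sketch (not part of the original source): how deallocation code
 * reports a gap, and what the encoded exception carries.  The caller and
 * the gap_start value are hypothetical; kGUARD_EXC_DEALLOC_GAP and the
 * helpers are the ones used/defined above.
 */
#if 0	/* illustrative only */
static void
example_report_dealloc_gap(vm_map_offset_t gap_start)
{
	mach_exception_code_t      code;
	mach_exception_data_type_t subcode;

	/* normal path: raise the guard violation on the current thread */
	vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);

	/* equivalent encoding: guard type + flavor packed into "code", address in "subcode" */
	if (vm_map_guard_exception_internal(gap_start, kGUARD_EXC_DEALLOC_GAP,
	    &code, &subcode)) {
		assert(subcode == (uint64_t)gap_start);
	}
}
#endif
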
7965 
7966 static kern_return_t
7967 vm_map_delete_submap_recurse(
7968 	vm_map_t submap,
7969 	vm_map_offset_t submap_start,
7970 	vm_map_offset_t submap_end)
7971 {
7972 	vm_map_entry_t submap_entry;
7973 
7974 	/*
7975 	 * Verify that the submap does not contain any "permanent" entries
7976 	 * within the specified range. We permit TPRO ranges to be overwritten
7977 	 * as we only reach this path if TPRO const protection is disabled for a
7978 	 * given map.
7979 	 *
7980 	 * We do not care about gaps.
7981 	 */
7982 
7983 	vm_map_lock(submap);
7984 
7985 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7986 		submap_entry = submap_entry->vme_next;
7987 	}
7988 
7989 	for (;
7990 	    submap_entry != vm_map_to_entry(submap) &&
7991 	    submap_entry->vme_start < submap_end;
7992 	    submap_entry = submap_entry->vme_next) {
7993 		if (submap_entry->vme_permanent
7994 #ifdef __arm64e__
7995 		    /* allow TPRO submap entries to be overwritten */
7996 		    && !submap_entry->used_for_tpro
7997 #endif
7998 		    ) {
7999 			/* "permanent" entry -> fail */
8000 			vm_map_unlock(submap);
8001 			return KERN_PROTECTION_FAILURE;
8002 		}
8003 	}
8004 	/* no "permanent" entries in the range -> success */
8005 	vm_map_unlock(submap);
8006 	return KERN_SUCCESS;
8007 }
8008 
8009 __abortlike
8010 static void
8011 __vm_map_delete_misaligned_panic(
8012 	vm_map_t                map,
8013 	vm_map_offset_t         start,
8014 	vm_map_offset_t         end)
8015 {
8016 	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
8017 	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
8018 }
8019 
8020 __abortlike
8021 static void
8022 __vm_map_delete_failed_panic(
8023 	vm_map_t                map,
8024 	vm_map_offset_t         start,
8025 	vm_map_offset_t         end,
8026 	kern_return_t           kr)
8027 {
8028 	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
8029 	    map, (uint64_t)start, (uint64_t)end, kr);
8030 }
8031 
8032 __abortlike
8033 static void
8034 __vm_map_delete_gap_panic(
8035 	vm_map_t                map,
8036 	vm_map_offset_t         where,
8037 	vm_map_offset_t         start,
8038 	vm_map_offset_t         end)
8039 {
8040 	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8041 	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8042 }
8043 
8044 __abortlike
8045 static void
8046 __vm_map_delete_permanent_panic(
8047 	vm_map_t                map,
8048 	vm_map_offset_t         start,
8049 	vm_map_offset_t         end,
8050 	vm_map_entry_t          entry)
8051 {
8052 	panic("vm_map_delete(%p,0x%llx,0x%llx): "
8053 	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8054 	    map, (uint64_t)start, (uint64_t)end, entry,
8055 	    (uint64_t)entry->vme_start,
8056 	    (uint64_t)entry->vme_end);
8057 }
8058 
8059 __options_decl(vm_map_delete_state_t, uint32_t, {
8060 	VMDS_NONE               = 0x0000,
8061 
8062 	VMDS_FOUND_GAP          = 0x0001,
8063 	VMDS_GAPS_OK            = 0x0002,
8064 
8065 	VMDS_KERNEL_PMAP        = 0x0004,
8066 	VMDS_NEEDS_LOOKUP       = 0x0008,
8067 	VMDS_NEEDS_WAKEUP       = 0x0010,
8068 	VMDS_KERNEL_KMEMPTR     = 0x0020
8069 });
8070 
8071 /*
8072  * vm_map_clamp_to_pmap(map, start, end)
8073  *
8074  * Modify *start and *end so they fall within the bounds of map->pmap.
8075  */
8076 #if MACH_ASSERT
8077 static void
8078 vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
8079 {
8080 	vm_map_address_t min;
8081 	vm_map_address_t max;
8082 
8083 #if __x86_64__
8084 	/* x86_64 struct pmap does not have min and max fields */
8085 	if (map->pmap == kernel_pmap) {
8086 		min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
8087 		max = VM_MAX_KERNEL_ADDRESS;
8088 	} else {
8089 		min = VM_MAP_MIN_ADDRESS;
8090 		max = VM_MAP_MAX_ADDRESS;
8091 	}
8092 #else
8093 	min = map->pmap->min;
8094 	max = map->pmap->max;
8095 #endif
8096 
8097 	if (*start < min) {
8098 		*start = min;
8099 	} else if (*start > max) {
8100 		*start = max;
8101 	}
8102 	if (*end < min) {
8103 		*end = min;
8104 	} else if (*end > max) {
8105 		*end = max;
8106 	}
8107 }
8108 #endif
8109 
8110 int vm_log_map_delete_permanent_prot_none = 0;
8111 /*
8112  *	vm_map_delete:	[ internal use only ]
8113  *
8114  *	Deallocates the given address range from the target map.
8115  *	Removes all user wirings. Unwires one kernel wiring if
8116  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
8117  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
8118  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8119  *
8120  *
8121  *	When the map is a kernel map, then any error in removing mappings
8122  *	will lead to a panic so that clients do not have to repeat the panic
8123  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
8124  *	is also passed, then KERN_ABORTED will not lead to a panic.
8125  *
8126  *	This routine is called with map locked and leaves map locked.
8127  */
8128 static kmem_return_t
8129 vm_map_delete(
8130 	vm_map_t                map,
8131 	vm_map_offset_t         start,
8132 	vm_map_offset_t         end,
8133 	vmr_flags_t             flags,
8134 	kmem_guard_t            guard,
8135 	vm_map_zap_t            zap_list)
8136 {
8137 	vm_map_entry_t          entry, next;
8138 	int                     interruptible;
8139 	vm_map_offset_t         gap_start = 0;
8140 	vm_map_offset_t         clear_in_transition_end = 0;
8141 	__unused vm_map_offset_t save_start = start;
8142 	__unused vm_map_offset_t save_end = end;
8143 	vm_map_delete_state_t   state = VMDS_NONE;
8144 	kmem_return_t           ret = { };
8145 	vm_map_range_id_t       range_id = 0;
8146 	struct kmem_page_meta  *meta = NULL;
8147 	uint32_t                size_idx, slot_idx;
8148 	struct mach_vm_range    slot;
8149 
8150 	if (vm_map_pmap(map) == kernel_pmap) {
8151 		state |= VMDS_KERNEL_PMAP;
8152 		range_id = kmem_addr_get_range(start, end - start);
8153 		if (kmem_is_ptr_range(range_id)) {
8154 			state |= VMDS_KERNEL_KMEMPTR;
8155 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8156 			    &size_idx, &slot);
8157 		}
8158 	}
8159 
8160 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8161 		state |= VMDS_GAPS_OK;
8162 	}
8163 
8164 	if (map->corpse_source &&
8165 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8166 	    !map->terminated) {
8167 		/*
8168 		 * The map is being used for corpses related diagnostics.
8169 		 * So skip any entry removal to avoid perturbing the map state.
8170 		 * The cleanup will happen in task_terminate_internal after the
8171 		 * call to task_port_no_senders.
8172 		 */
8173 		goto out;
8174 	}
8175 
8176 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8177 	    THREAD_ABORTSAFE : THREAD_UNINT;
8178 
8179 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8180 	    (start & VM_MAP_PAGE_MASK(map))) {
8181 		__vm_map_delete_misaligned_panic(map, start, end);
8182 	}
8183 
8184 	if ((state & VMDS_GAPS_OK) == 0) {
8185 		/*
8186 		 * If the map isn't terminated then all deletions must have
8187 		 * no gaps, and be within the [min, max) of the map.
8188 		 *
8189 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8190 		 * and hence must validate bounds manually.
8191 		 *
8192 		 * It is worth noting that because vm_deallocate() will
8193 		 * round_page() the deallocation size, it's possible for "end"
8194 		 * to be 0 here due to overflow. We hence must treat it as being
8195 		 * beyond vm_map_max(map).
8196 		 *
8197 		 * Similarly, end < start means a wrap-around happened,
8198 		 * which should cause an error or panic.
8199 		 */
8200 		if (end == 0 || end > vm_map_max(map)) {
8201 			state |= VMDS_FOUND_GAP;
8202 			gap_start = vm_map_max(map);
8203 			if (state & VMDS_KERNEL_PMAP) {
8204 				__vm_map_delete_gap_panic(map,
8205 				    gap_start, start, end);
8206 			}
8207 			goto out;
8208 		}
8209 
8210 		if (end < start) {
8211 			if (state & VMDS_KERNEL_PMAP) {
8212 				__vm_map_delete_gap_panic(map,
8213 				    vm_map_max(map), start, end);
8214 			}
8215 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8216 			goto out;
8217 		}
8218 
8219 		if (start < vm_map_min(map)) {
8220 			state |= VMDS_FOUND_GAP;
8221 			gap_start = start;
8222 			if (state & VMDS_KERNEL_PMAP) {
8223 				__vm_map_delete_gap_panic(map,
8224 				    gap_start, start, end);
8225 			}
8226 			goto out;
8227 		}
8228 	} else {
8229 		/*
8230 		 * If the map is terminated, we must accept start/end
8231 		 * being beyond the boundaries of the map as this is
8232 		 * how some of the mappings like commpage mappings
8233 		 * can be destroyed (they're outside of those bounds).
8234 		 *
8235 		 * end < start is still something we can't cope with,
8236 		 * so just bail.
8237 		 */
8238 		if (end < start) {
8239 			goto out;
8240 		}
8241 	}
8242 
8243 
8244 	/*
8245 	 *	Find the start of the region.
8246 	 *
8247 	 *	If in a superpage, extend the range
8248 	 *	to include the start of the mapping.
8249 	 */
8250 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8251 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8252 			start = SUPERPAGE_ROUND_DOWN(start);
8253 		} else {
8254 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8255 			break;
8256 		}
8257 	}
8258 
8259 	if (entry->superpage_size) {
8260 		end = SUPERPAGE_ROUND_UP(end);
8261 	}
8262 
8263 	/*
8264 	 *	Step through all entries in this region
8265 	 */
8266 	for (vm_map_offset_t s = start; s < end;) {
8267 		/*
8268 		 * At this point, we have deleted all the memory entries
8269 		 * in [start, s) and are proceeding with the [s, end) range.
8270 		 *
8271 		 * This loop might drop the map lock, and it is possible that
8272 		 * some memory was already reallocated within [start, s)
8273 		 * and we don't want to mess with those entries.
8274 		 *
8275 		 * Some of those entries could even have been re-assembled
8276 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8277 		 * we may have to vm_map_clip_start() again.
8278 		 *
8279 		 * When clear_in_transition_end is set, we had marked
8280 		 * [start, clear_in_transition_end) as "in_transition"
8281 		 * during a previous iteration and we need to clear it.
8282 		 */
8283 
8284 		/*
8285 		 * Step 1: If needed (because we dropped locks),
8286 		 *         lookup the entry again.
8287 		 *
8288 		 *         If we're coming back from unwiring (Step 5),
8289 		 *         we also need to mark the entries as no longer
8290 		 *         in transition after that.
8291 		 */
8292 
8293 		if (state & VMDS_NEEDS_LOOKUP) {
8294 			state &= ~VMDS_NEEDS_LOOKUP;
8295 
8296 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8297 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8298 			}
8299 
8300 			if (state & VMDS_KERNEL_KMEMPTR) {
8301 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8302 			}
8303 		}
8304 
8305 		if (clear_in_transition_end) {
8306 			for (vm_map_entry_t it = entry;
8307 			    it != vm_map_to_entry(map) &&
8308 			    it->vme_start < clear_in_transition_end;
8309 			    it = it->vme_next) {
8310 				assert(it->in_transition);
8311 				it->in_transition = FALSE;
8312 				if (it->needs_wakeup) {
8313 					it->needs_wakeup = FALSE;
8314 					state |= VMDS_NEEDS_WAKEUP;
8315 				}
8316 			}
8317 
8318 			clear_in_transition_end = 0;
8319 		}
8320 
8321 
8322 		/*
8323 		 * Step 2: Perform various policy checks
8324 		 *         before we do _anything_ to this entry.
8325 		 */
8326 
8327 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8328 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8329 				/*
8330 				 * Either we found a gap already,
8331 				 * or we are tearing down a map,
8332 				 * keep going.
8333 				 */
8334 			} else if (state & VMDS_KERNEL_PMAP) {
8335 				__vm_map_delete_gap_panic(map, s, start, end);
8336 			} else if (s < end) {
8337 				state |= VMDS_FOUND_GAP;
8338 				gap_start = s;
8339 			}
8340 
8341 			if (entry == vm_map_to_entry(map) ||
8342 			    end <= entry->vme_start) {
8343 				break;
8344 			}
8345 
8346 			s = entry->vme_start;
8347 		}
8348 
8349 		if (state & VMDS_KERNEL_PMAP) {
8350 			/*
8351 			 * In the kernel map and its submaps,
8352 			 * permanent entries never die, even
8353 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8354 			 */
8355 			if (entry->vme_permanent) {
8356 				__vm_map_delete_permanent_panic(map, start, end, entry);
8357 			}
8358 
8359 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8360 				end = entry->vme_end;
8361 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8362 			}
8363 
8364 			/*
8365 			 * In the kernel map and its submaps,
8366 			 * the removal of an atomic/guarded entry is strict.
8367 			 *
8368 			 * An atomic entry is processed only if it was
8369 			 * specifically targeted.
8370 			 *
8371 			 * We might have deleted non-atomic entries before
8372 			 * we reach this point, however...
8373 			 */
8374 			kmem_entry_validate_guard(map, entry,
8375 			    start, end - start, guard);
8376 		}
8377 
8378 		/*
8379 		 * Step 2.1: handle "permanent" and "submap" entries
8380 		 * *before* clipping to avoid triggering some unnecessary
8381 		 * un-nesting of the shared region.
8382 		 */
8383 		if (entry->vme_permanent && entry->is_sub_map) {
8384 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8385 			/*
8386 			 * Un-mapping a "permanent" mapping of a user-space
8387 			 * submap is not allowed unless...
8388 			 */
8389 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8390 				/*
8391 				 * a. explicitly requested by the kernel caller.
8392 				 */
8393 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8394 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8395 			    developer_mode_state()) {
8396 				/*
8397 				 * b. we're in "developer" mode (for
8398 				 *    breakpoints, dtrace probes, ...).
8399 				 */
8400 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8401 			} else if (map->terminated) {
8402 				/*
8403 				 * c. this is the final address space cleanup.
8404 				 */
8405 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8406 			} else {
8407 				vm_map_offset_t submap_start, submap_end;
8408 				kern_return_t submap_kr;
8409 
8410 				/*
8411 				 * Check if there are any "permanent" mappings
8412 				 * in this range in the submap.
8413 				 */
8414 				if (entry->in_transition) {
8415 					/* can that even happen ? */
8416 					goto in_transition;
8417 				}
8418 				/* compute the clipped range in the submap */
8419 				submap_start = s - entry->vme_start;
8420 				submap_start += VME_OFFSET(entry);
8421 				submap_end = end - entry->vme_start;
8422 				submap_end += VME_OFFSET(entry);
8423 				submap_kr = vm_map_delete_submap_recurse(
8424 					VME_SUBMAP(entry),
8425 					submap_start,
8426 					submap_end);
8427 				if (submap_kr != KERN_SUCCESS) {
8428 					/*
8429 					 * There are some "permanent" mappings
8430 					 * in the submap: we are not allowed
8431 					 * to remove this range.
8432 					 */
8433 					printf("%d[%s] removing permanent submap entry "
8434 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8435 					    proc_selfpid(),
8436 					    (get_bsdtask_info(current_task())
8437 					    ? proc_name_address(get_bsdtask_info(current_task()))
8438 					    : "?"), entry,
8439 					    (uint64_t)entry->vme_start,
8440 					    (uint64_t)entry->vme_end,
8441 					    entry->protection,
8442 					    entry->max_protection);
8443 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8444 					    vm_map_entry_t, entry,
8445 					    vm_map_offset_t, entry->vme_start,
8446 					    vm_map_offset_t, entry->vme_end,
8447 					    vm_prot_t, entry->protection,
8448 					    vm_prot_t, entry->max_protection,
8449 					    int, VME_ALIAS(entry));
8450 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8451 					goto out;
8452 				}
8453 				/* no permanent mappings: proceed */
8454 			}
8455 		}
8456 
8457 		/*
8458 		 * Step 3: Perform any clipping needed.
8459 		 *
8460 		 *         After this, "entry" starts at "s", ends before "end"
8461 		 */
8462 
8463 		if (entry->vme_start < s) {
8464 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8465 			    entry->map_aligned &&
8466 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8467 				/*
8468 				 * The entry will no longer be map-aligned
8469 				 * after clipping and the caller said it's OK.
8470 				 */
8471 				entry->map_aligned = FALSE;
8472 			}
8473 			vm_map_clip_start(map, entry, s);
8474 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8475 		}
8476 
8477 		if (end < entry->vme_end) {
8478 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8479 			    entry->map_aligned &&
8480 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8481 				/*
8482 				 * The entry will no longer be map-aligned
8483 				 * after clipping and the caller said it's OK.
8484 				 */
8485 				entry->map_aligned = FALSE;
8486 			}
8487 			vm_map_clip_end(map, entry, end);
8488 		}
8489 
8490 		if (entry->vme_permanent && entry->is_sub_map) {
8491 			/*
8492 			 * We already went through step 2.1 which did not deny
8493 			 * the removal of this "permanent" and "is_sub_map"
8494 			 * entry.
8495 			 * Now that we've clipped what we actually want to
8496 			 * delete, undo the "permanent" part to allow the
8497 			 * removal to proceed.
8498 			 */
8499 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8500 			    vm_map_entry_t, entry,
8501 			    vm_map_offset_t, entry->vme_start,
8502 			    vm_map_offset_t, entry->vme_end,
8503 			    vm_prot_t, entry->protection,
8504 			    vm_prot_t, entry->max_protection,
8505 			    int, VME_ALIAS(entry));
8506 			entry->vme_permanent = false;
8507 		}
8508 
8509 		assert(s == entry->vme_start);
8510 		assert(entry->vme_end <= end);
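		/*
		 * Post-condition of the clipping above (see the asserts):
		 * "entry" now starts exactly at "s" and ends at or before
		 * "end", so everything it covers belongs to the range
		 * being deleted.
		 */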
8511 
8512 
8513 		/*
8514 		 * Step 4: If the entry is in flux, wait for this to resolve.
8515 		 */
8516 
8517 		if (entry->in_transition) {
8518 			wait_result_t wait_result;
8519 
8520 in_transition:
8521 			/*
8522 			 * Another thread is wiring/unwiring this entry.
8523 			 * Let the other thread know we are waiting.
8524 			 */
8525 
8526 			entry->needs_wakeup = TRUE;
8527 
8528 			/*
8529 			 * wake up anybody waiting on entries that we have
8530 			 * already unwired/deleted.
8531 			 */
8532 			if (state & VMDS_NEEDS_WAKEUP) {
8533 				vm_map_entry_wakeup(map);
8534 				state &= ~VMDS_NEEDS_WAKEUP;
8535 			}
8536 
8537 			wait_result = vm_map_entry_wait(map, interruptible);
8538 
8539 			if (interruptible &&
8540 			    wait_result == THREAD_INTERRUPTED) {
8541 				/*
8542 				 * We do not clear the needs_wakeup flag,
8543 				 * since we cannot tell if we were the only one.
8544 				 */
8545 				ret.kmr_return = KERN_ABORTED;
8546 				return ret;
8547 			}
8548 
8549 			/*
8550 			 * The entry could have been clipped or it
8551 			 * may not exist anymore.  Look it up again.
8552 			 */
8553 			state |= VMDS_NEEDS_LOOKUP;
8554 			continue;
8555 		}
8556 
8557 
8558 		/*
8559 		 * Step 5: Handle wiring
8560 		 */
8561 
8562 		if (entry->wired_count) {
8563 			struct vm_map_entry tmp_entry;
8564 			boolean_t           user_wire;
8565 			unsigned int        last_timestamp;
8566 
8567 			user_wire = entry->user_wired_count > 0;
8568 
8569 			/*
8570 			 *      Remove a kernel wiring if requested
8571 			 */
8572 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8573 				entry->wired_count--;
8574 				vme_btref_consider_and_put(entry);
8575 			}
8576 
8577 			/*
8578 			 *	Remove all user wirings for proper accounting
8579 			 */
8580 			while (entry->user_wired_count) {
8581 				subtract_wire_counts(map, entry, user_wire);
8582 			}
8583 
8584 			/*
8585 			 * All our DMA I/O operations in IOKit are currently
8586 			 * done by wiring through the map entries of the task
8587 			 * requesting the I/O.
8588 			 *
8589 			 * Because of this, we must always wait for kernel wirings
8590 			 * to go away on the entries before deleting them.
8591 			 *
8592 			 * Any caller who wants to actually remove a kernel wiring
8593 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8594 			 * properly remove one wiring instead of blasting through
8595 			 * them all.
8596 			 */
8597 			if (entry->wired_count != 0) {
8598 				assert(map != kernel_map);
8599 				/*
8600 				 * Cannot continue.  Typical case is when
8601 				 * a user thread has physical I/O pending
8602 				 * on this page.  Either wait for the
8603 				 * kernel wiring to go away or return an
8604 				 * error.
8605 				 */
8606 				wait_result_t wait_result;
8607 
8608 				entry->needs_wakeup = TRUE;
8609 				wait_result = vm_map_entry_wait(map,
8610 				    interruptible);
8611 
8612 				if (interruptible &&
8613 				    wait_result == THREAD_INTERRUPTED) {
8614 					/*
8615 					 * We do not clear the
8616 					 * needs_wakeup flag, since we
8617 					 * cannot tell if we were the
8618 					 * only one.
8619 					 */
8620 					ret.kmr_return = KERN_ABORTED;
8621 					return ret;
8622 				}
8623 
8624 
8625 				/*
8626 				 * The entry could have been clipped or
8627 				 * it may not exist anymore.  Look it
8628 				 * up again.
8629 				 */
8630 				state |= VMDS_NEEDS_LOOKUP;
8631 				continue;
8632 			}
8633 
8634 			/*
8635 			 * We can unlock the map now.
8636 			 *
8637 			 * The entry might be split once we unlock the map,
8638 			 * but we need the range as defined by this entry
8639 			 * to be stable. So we must make a local copy.
8640 			 *
8641 			 * The underlying objects do not change during clips,
8642 			 * and the in_transition state guarantees existence
8643 			 * of the entry.
8644 			 */
8645 			last_timestamp = map->timestamp;
8646 			entry->in_transition = TRUE;
8647 			tmp_entry = *entry;
8648 			vm_map_unlock(map);
8649 
8650 			if (tmp_entry.is_sub_map) {
8651 				vm_map_t sub_map;
8652 				vm_map_offset_t sub_start, sub_end;
8653 				pmap_t pmap;
8654 				vm_map_offset_t pmap_addr;
8655 
8656 
8657 				sub_map = VME_SUBMAP(&tmp_entry);
8658 				sub_start = VME_OFFSET(&tmp_entry);
8659 				sub_end = sub_start + (tmp_entry.vme_end -
8660 				    tmp_entry.vme_start);
8661 				if (tmp_entry.use_pmap) {
8662 					pmap = sub_map->pmap;
8663 					pmap_addr = tmp_entry.vme_start;
8664 				} else {
8665 					pmap = map->pmap;
8666 					pmap_addr = tmp_entry.vme_start;
8667 				}
8668 				(void) vm_map_unwire_nested(sub_map,
8669 				    sub_start, sub_end,
8670 				    user_wire,
8671 				    pmap, pmap_addr);
8672 			} else {
8673 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8674 				vm_map_offset_t max_end;
8675 
8676 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8677 					max_end = end - VM_MAP_PAGE_SIZE(map);
8678 					if (entry_end > max_end) {
8679 						entry_end = max_end;
8680 					}
8681 				}
8682 
8683 				if (tmp_entry.vme_kernel_object) {
8684 					pmap_protect_options(
8685 						map->pmap,
8686 						tmp_entry.vme_start,
8687 						entry_end,
8688 						VM_PROT_NONE,
8689 						PMAP_OPTIONS_REMOVE,
8690 						NULL);
8691 				}
8692 				vm_fault_unwire(map, &tmp_entry,
8693 				    tmp_entry.vme_kernel_object, map->pmap,
8694 				    tmp_entry.vme_start, entry_end);
8695 			}
8696 
8697 			vm_map_lock(map);
8698 
8699 			/*
8700 			 * Unwiring happened, we can now go back to deleting
8701 			 * them (after we clear the in_transition bit for the range).
8702 			 */
8703 			if (last_timestamp + 1 != map->timestamp) {
8704 				state |= VMDS_NEEDS_LOOKUP;
8705 			}
8706 			clear_in_transition_end = tmp_entry.vme_end;
8707 			continue;
8708 		}
8709 
8710 		assert(entry->wired_count == 0);
8711 		assert(entry->user_wired_count == 0);
8712 
8713 
8714 		/*
8715 		 * Step 6: Entry is unwired and ready for us to delete !
8716 		 */
8717 
8718 		if (!entry->vme_permanent) {
8719 			/*
8720 			 * Typical case: the entry really shouldn't be permanent
8721 			 */
8722 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8723 		    (entry->protection & VM_PROT_EXECUTE) &&
8724 		    developer_mode_state()) {
8725 			/*
8726 			 * Allow debuggers to undo executable mappings
8727 			 * when developer mode is on.
8728 			 */
8729 #if 0
8730 			printf("FBDP %d[%s] removing permanent executable entry "
8731 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8732 			    proc_selfpid(),
8733 			    (current_task()->bsd_info
8734 			    ? proc_name_address(current_task()->bsd_info)
8735 			    : "?"), entry,
8736 			    (uint64_t)entry->vme_start,
8737 			    (uint64_t)entry->vme_end,
8738 			    entry->protection,
8739 			    entry->max_protection);
8740 #endif
8741 			entry->vme_permanent = FALSE;
8742 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8743 #if 0
8744 			printf("FBDP %d[%s] removing permanent entry "
8745 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8746 			    proc_selfpid(),
8747 			    (current_task()->bsd_info
8748 			    ? proc_name_address(current_task()->bsd_info)
8749 			    : "?"), entry,
8750 			    (uint64_t)entry->vme_start,
8751 			    (uint64_t)entry->vme_end,
8752 			    entry->protection,
8753 			    entry->max_protection);
8754 #endif
8755 			entry->vme_permanent = FALSE;
8756 #if CODE_SIGNING_MONITOR
8757 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8758 			entry->vme_permanent = FALSE;
8759 
8760 			printf("%d[%s] %s(0x%llx,0x%llx): "
8761 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8762 			    "prot 0x%x/0x%x\n",
8763 			    proc_selfpid(),
8764 			    (get_bsdtask_info(current_task())
8765 			    ? proc_name_address(get_bsdtask_info(current_task()))
8766 			    : "?"),
8767 			    __FUNCTION__,
8768 			    (uint64_t)start,
8769 			    (uint64_t)end,
8770 			    (uint64_t)entry->vme_start,
8771 			    (uint64_t)entry->vme_end,
8772 			    entry->protection,
8773 			    entry->max_protection);
8774 #endif
8775 		} else {
8776 			DTRACE_VM6(vm_map_delete_permanent,
8777 			    vm_map_entry_t, entry,
8778 			    vm_map_offset_t, entry->vme_start,
8779 			    vm_map_offset_t, entry->vme_end,
8780 			    vm_prot_t, entry->protection,
8781 			    vm_prot_t, entry->max_protection,
8782 			    int, VME_ALIAS(entry));
8783 		}
8784 
8785 		if (entry->is_sub_map) {
8786 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8787 			    "map %p (%d) entry %p submap %p (%d)\n",
8788 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8789 			    VME_SUBMAP(entry),
8790 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8791 			if (entry->use_pmap) {
8792 #ifndef NO_NESTED_PMAP
8793 				int pmap_flags;
8794 
8795 				if (map->terminated) {
8796 					/*
8797 					 * This is the final cleanup of the
8798 					 * address space being terminated.
8799 					 * No new mappings are expected and
8800 					 * we don't really need to unnest the
8801 					 * shared region (and lose the "global"
8802 					 * pmap mappings, if applicable).
8803 					 *
8804 					 * Tell the pmap layer that we're
8805 					 * "clean" wrt nesting.
8806 					 */
8807 					pmap_flags = PMAP_UNNEST_CLEAN;
8808 				} else {
8809 					/*
8810 					 * We're unmapping part of the nested
8811 					 * shared region, so we can't keep the
8812 					 * nested pmap.
8813 					 */
8814 					pmap_flags = 0;
8815 				}
8816 				pmap_unnest_options(
8817 					map->pmap,
8818 					(addr64_t)entry->vme_start,
8819 					entry->vme_end - entry->vme_start,
8820 					pmap_flags);
8821 #endif  /* NO_NESTED_PMAP */
8822 				if (map->mapped_in_other_pmaps &&
8823 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8824 					/* clean up parent map/maps */
8825 					vm_map_submap_pmap_clean(
8826 						map, entry->vme_start,
8827 						entry->vme_end,
8828 						VME_SUBMAP(entry),
8829 						VME_OFFSET(entry));
8830 				}
8831 			} else {
8832 				vm_map_submap_pmap_clean(
8833 					map, entry->vme_start, entry->vme_end,
8834 					VME_SUBMAP(entry),
8835 					VME_OFFSET(entry));
8836 			}
8837 		} else if (entry->vme_kernel_object ||
8838 		    VME_OBJECT(entry) == compressor_object) {
8839 			/*
8840 			 * nothing to do
8841 			 */
8842 		} else if (map->mapped_in_other_pmaps &&
8843 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8844 			vm_object_pmap_protect_options(
8845 				VME_OBJECT(entry), VME_OFFSET(entry),
8846 				entry->vme_end - entry->vme_start,
8847 				PMAP_NULL,
8848 				PAGE_SIZE,
8849 				entry->vme_start,
8850 				VM_PROT_NONE,
8851 				PMAP_OPTIONS_REMOVE);
8852 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8853 		    (state & VMDS_KERNEL_PMAP)) {
8854 			/* Remove translations associated
8855 			 * with this range unless the entry
8856 			 * does not have an object, or
8857 			 * it's the kernel map or a descendant
8858 			 * since the platform could potentially
8859 			 * create "backdoor" mappings invisible
8860 			 * to the VM. It is expected that
8861 			 * objectless, non-kernel ranges
8862 			 * do not have such VM invisible
8863 			 * translations.
8864 			 */
8865 			vm_map_address_t remove_start = entry->vme_start;
8866 			vm_map_address_t remove_end = entry->vme_end;
8867 #if MACH_ASSERT
8868 			/*
8869 			 * Prevent panics in pmap_remove() from some vm test code
8870 			 * which uses virtual address ranges that pmap disallows.
8871 			 */
8872 			if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8873 				vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8874 			}
8875 #endif /* MACH_ASSERT */
8876 			pmap_remove(map->pmap, remove_start, remove_end);
8877 		}
8878 
8879 #if DEBUG
8880 		/*
8881 		 * All pmap mappings for this map entry must have been
8882 		 * cleared by now.
8883 		 */
8884 		assert(pmap_is_empty(map->pmap,
8885 		    entry->vme_start,
8886 		    entry->vme_end));
8887 #endif /* DEBUG */
8888 
8889 		if (entry->iokit_acct) {
8890 			/* alternate accounting */
8891 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8892 			    vm_map_t, map,
8893 			    vm_map_offset_t, entry->vme_start,
8894 			    vm_map_offset_t, entry->vme_end,
8895 			    int, VME_ALIAS(entry));
8896 			vm_map_iokit_unmapped_region(map,
8897 			    (entry->vme_end -
8898 			    entry->vme_start));
8899 			entry->iokit_acct = FALSE;
8900 			entry->use_pmap = FALSE;
8901 		}
8902 
8903 		/* move "s" forward */
8904 		s    = entry->vme_end;
8905 		next = entry->vme_next;
8906 		if (!entry->map_aligned) {
8907 			vm_map_offset_t rounded_s;
8908 
8909 			/*
8910 			 * Skip artificial gap due to mis-aligned entry
8911 			 * on devices with a page size smaller than the
8912 			 * map's page size (i.e. 16k task on a 4k device).
8913 			 */
8914 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8915 			if (next == vm_map_to_entry(map)) {
8916 				s = rounded_s;
8917 			} else if (s < rounded_s) {
8918 				s = MIN(rounded_s, next->vme_start);
8919 			}
8920 		}
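		/*
		 * Worked example (illustrative): with a 16K map running on
		 * 4K hardware pages, a non-map-aligned entry could end at
		 * 0x5000 while the next entry starts at 0x8000.  Rounding
		 * "s" up to the 16K boundary (0x8000) skips the artificial
		 * 12K hole so it is not reported as a deallocation gap.
		 */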
8921 		ret.kmr_size += s - entry->vme_start;
8922 
8923 		if (entry->vme_permanent) {
8924 			/*
8925 			 * A permanent entry can not be removed, so leave it
8926 			 * in place but remove all access permissions.
8927 			 */
8928 			if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8929 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8930 				    __FUNCTION__, __LINE__,
8931 				    proc_selfpid(),
8932 				    (get_bsdtask_info(current_task())
8933 				    ? proc_name_address(get_bsdtask_info(current_task()))
8934 				    : "?"),
8935 				    map,
8936 				    entry,
8937 				    (uint64_t)entry->vme_start,
8938 				    (uint64_t)entry->vme_end,
8939 				    entry->is_sub_map,
8940 				    entry->protection,
8941 				    entry->max_protection);
8942 			}
8943 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8944 			    vm_map_entry_t, entry,
8945 			    vm_map_offset_t, entry->vme_start,
8946 			    vm_map_offset_t, entry->vme_end,
8947 			    vm_prot_t, entry->protection,
8948 			    vm_prot_t, entry->max_protection,
8949 			    int, VME_ALIAS(entry));
8950 			entry->protection = VM_PROT_NONE;
8951 			entry->max_protection = VM_PROT_NONE;
8952 #ifdef __arm64e__
8953 			entry->used_for_tpro = FALSE;
8954 #endif
8955 		} else {
8956 			vm_map_entry_zap(map, entry, zap_list);
8957 		}
8958 
8959 		entry = next;
8960 		next  = VM_MAP_ENTRY_NULL;
8961 
8962 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8963 			unsigned int last_timestamp = map->timestamp++;
8964 
8965 			if (lck_rw_lock_yield_exclusive(&map->lock,
8966 			    LCK_RW_YIELD_ANY_WAITER)) {
8967 				if (last_timestamp != map->timestamp + 1) {
8968 					state |= VMDS_NEEDS_LOOKUP;
8969 				}
8970 			} else {
8971 				/* we didn't yield, undo our change */
8972 				map->timestamp--;
8973 			}
8974 		}
8975 	}
8976 
8977 	if (map->wait_for_space) {
8978 		thread_wakeup((event_t) map);
8979 	}
8980 
8981 	if (state & VMDS_NEEDS_WAKEUP) {
8982 		vm_map_entry_wakeup(map);
8983 	}
8984 
8985 out:
8986 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8987 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8988 	}
8989 
8990 	if (state & VMDS_KERNEL_KMEMPTR) {
8991 		kmem_free_space(start, end, range_id, &slot);
8992 	}
8993 
8994 	if (state & VMDS_FOUND_GAP) {
8995 		DTRACE_VM3(kern_vm_deallocate_gap,
8996 		    vm_map_offset_t, gap_start,
8997 		    vm_map_offset_t, save_start,
8998 		    vm_map_offset_t, save_end);
8999 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
9000 			ret.kmr_return = KERN_INVALID_VALUE;
9001 		} else {
9002 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
9003 		}
9004 	}
9005 
9006 	return ret;
9007 }
9008 
9009 kmem_return_t
9010 vm_map_remove_and_unlock(
9011 	vm_map_t        map,
9012 	vm_map_offset_t start,
9013 	vm_map_offset_t end,
9014 	vmr_flags_t     flags,
9015 	kmem_guard_t    guard)
9016 {
9017 	kmem_return_t ret;
9018 	VM_MAP_ZAP_DECLARE(zap);
9019 
9020 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
9021 	vm_map_unlock(map);
9022 
9023 	vm_map_zap_dispose(&zap);
9024 
9025 	return ret;
9026 }
9027 
9028 /*
9029  *	vm_map_remove_guard:
9030  *
9031  *	Remove the given address range from the target map.
9032  *	This is the exported form of vm_map_delete.
9033  */
9034 kmem_return_t
9035 vm_map_remove_guard(
9036 	vm_map_t        map,
9037 	vm_map_offset_t start,
9038 	vm_map_offset_t end,
9039 	vmr_flags_t     flags,
9040 	kmem_guard_t    guard)
9041 {
9042 	vm_map_lock(map);
9043 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
9044 }
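
/*
 * Illustrative usage sketch (not part of the original source; "addr"
 * and "size" are hypothetical): a kernel caller tearing down a range
 * it owns goes through this path, optionally dropping one kernel
 * wiring with VM_MAP_REMOVE_KUNWIRE.  A real kmem caller would pass
 * the guard recorded at allocation time instead of KMEM_GUARD_NONE.
 *
 *	kmem_return_t kmr;
 *
 *	kmr = vm_map_remove_guard(kernel_map, addr, addr + size,
 *	    VM_MAP_REMOVE_KUNWIRE, KMEM_GUARD_NONE);
 *	if (kmr.kmr_return != KERN_SUCCESS) {
 *		... handle the failure ...
 *	}
 */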
9045 
9046 
9047 /*
9048  *  vm_map_setup:
9049  *
9050  *  Perform any required setup on a new task's map. Must be called before the task
9051  *  is enabled for IPC access, since after this point other threads may be able
9052  *  to look up the task port and make VM API calls.
9053  */
9054 void
9055 vm_map_setup(vm_map_t map, task_t task)
9056 {
9057 	/*
9058 	 * map does NOT take a reference on owning_task. If the map has terminated,
9059 	 * it is possible that the pointer is NULL, so reads of owning_task must
9060 	 * happen under the map lock and explicitly check for NULL.
9061 	 */
9062 	vm_map_lock(map);
9063 	assert(!map->owning_task);
9064 	map->owning_task = task;
9065 	vm_map_unlock(map);
9066 #if CONFIG_DEFERRED_RECLAIM
9067 	vm_deferred_reclamation_metadata_t vdrm = task->deferred_reclamation_metadata;
9068 	if (vdrm) {
9069 		vm_deferred_reclamation_task_fork_register(vdrm);
9070 	}
9071 #endif /* CONFIG_DEFERRED_RECLAIM */
9072 }
9073 
9074 /*
9075  *	vm_map_terminate:
9076  *
9077  *	Clean out a task's map.
9078  */
9079 kern_return_t
9080 vm_map_terminate(
9081 	vm_map_t        map)
9082 {
9083 	vm_map_lock(map);
9084 	map->terminated = TRUE;
9085 	map->owning_task = NULL;
9086 	vm_map_disable_hole_optimization(map);
9087 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9088 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9089 	return KERN_SUCCESS;
9090 }
9091 
9092 /*
9093  *	Routine:	vm_map_copy_allocate
9094  *
9095  *	Description:
9096  *		Allocates and initializes a map copy object.
9097  */
9098 static vm_map_copy_t
9099 vm_map_copy_allocate(uint16_t type)
9100 {
9101 	vm_map_copy_t new_copy;
9102 
9103 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9104 	new_copy->type = type;
9105 	if (type == VM_MAP_COPY_ENTRY_LIST) {
9106 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9107 		vm_map_store_init(&new_copy->cpy_hdr);
9108 	}
9109 	return new_copy;
9110 }
9111 
9112 /*
9113  *	Routine:	vm_map_copy_discard
9114  *
9115  *	Description:
9116  *		Dispose of a map copy object (returned by
9117  *		vm_map_copyin).
9118  */
9119 void
9120 vm_map_copy_discard(
9121 	vm_map_copy_t   copy)
9122 {
9123 	if (copy == VM_MAP_COPY_NULL) {
9124 		return;
9125 	}
9126 
9127 	/*
9128 	 * Assert that the vm_map_copy is coming from the right
9129 	 * zone and hasn't been forged
9130 	 */
9131 	vm_map_copy_require(copy);
9132 
9133 	switch (copy->type) {
9134 	case VM_MAP_COPY_ENTRY_LIST:
9135 		while (vm_map_copy_first_entry(copy) !=
9136 		    vm_map_copy_to_entry(copy)) {
9137 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
9138 
9139 			vm_map_copy_entry_unlink(copy, entry);
9140 			if (entry->is_sub_map) {
9141 				vm_map_deallocate(VME_SUBMAP(entry));
9142 			} else {
9143 				vm_object_deallocate(VME_OBJECT(entry));
9144 			}
9145 			vm_map_copy_entry_dispose(entry);
9146 		}
9147 		break;
9148 	case VM_MAP_COPY_KERNEL_BUFFER:
9149 
9150 		/*
9151 		 * The vm_map_copy_t and possibly the data buffer were
9152 		 * allocated by a single call to kalloc_data(), i.e. the
9153 		 * vm_map_copy_t was not allocated out of the zone.
9154 		 */
9155 		if (copy->size > msg_ool_size_small || copy->offset) {
9156 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9157 			    (long long)copy->size, (long long)copy->offset);
9158 		}
9159 		kfree_data(copy->cpy_kdata, copy->size);
9160 	}
9161 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9162 }
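
/*
 * Usage note (illustrative, not from the original source): a kernel
 * consumer holding a vm_map_copy_t (e.g. one returned by
 * vm_map_copyin) and deciding not to use it must discard it itself,
 * since only a successful copyout/overwrite consumes the copy object:
 *
 *	if (validation_failed) {                // hypothetical check
 *		vm_map_copy_discard(copy);
 *		return KERN_FAILURE;
 *	}
 */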
9163 
9164 #if XNU_PLATFORM_MacOSX
9165 
9166 __exported
9167 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9168 
9169 /*
9170  *	Routine:	vm_map_copy_copy
9171  *
9172  *	Description:
9173  *			Move the information in a map copy object to
9174  *			a new map copy object, leaving the old one
9175  *			empty.
9176  *
9177  *			This is used by kernel routines that need
9178  *			to look at out-of-line data (in copyin form)
9179  *			before deciding whether to return SUCCESS.
9180  *			If the routine returns FAILURE, the original
9181  *			copy object will be deallocated; therefore,
9182  *			these routines must make a copy of the copy
9183  *			object and leave the original empty so that
9184  *			deallocation will not fail.
9185  */
9186 vm_map_copy_t
9187 vm_map_copy_copy(
9188 	vm_map_copy_t   copy)
9189 {
9190 	vm_map_copy_t   new_copy;
9191 
9192 	if (copy == VM_MAP_COPY_NULL) {
9193 		return VM_MAP_COPY_NULL;
9194 	}
9195 
9196 	/*
9197 	 * Assert that the vm_map_copy is coming from the right
9198 	 * zone and hasn't been forged
9199 	 */
9200 	vm_map_copy_require(copy);
9201 
9202 	/*
9203 	 * Allocate a new copy object, and copy the information
9204 	 * from the old one into it.
9205 	 */
9206 
9207 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9208 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9209 #if __has_feature(ptrauth_calls)
9210 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9211 		new_copy->cpy_kdata = copy->cpy_kdata;
9212 	}
9213 #endif
9214 
9215 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9216 		/*
9217 		 * The links in the entry chain must be
9218 		 * changed to point to the new copy object.
9219 		 */
9220 		vm_map_copy_first_entry(copy)->vme_prev
9221 		        = vm_map_copy_to_entry(new_copy);
9222 		vm_map_copy_last_entry(copy)->vme_next
9223 		        = vm_map_copy_to_entry(new_copy);
9224 	}
9225 
9226 	/*
9227 	 * Change the old copy object into one that contains
9228 	 * nothing to be deallocated.
9229 	 */
9230 	bzero(copy, sizeof(struct vm_map_copy));
9231 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9232 
9233 	/*
9234 	 * Return the new object.
9235 	 */
9236 	return new_copy;
9237 }
9238 
9239 #endif /* XNU_PLATFORM_MacOSX */
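
/*
 * Illustrative sketch of the pattern vm_map_copy_copy() exists for
 * (the helper names are hypothetical): a routine that must inspect
 * out-of-line data before deciding on success keeps a private copy
 * object, leaving the original empty so the caller's unconditional
 * discard of it cannot fail:
 *
 *	vm_map_copy_t private_copy;
 *
 *	private_copy = vm_map_copy_copy(copy);  // "copy" is now empty
 *	if (!payload_looks_valid(private_copy)) {
 *		vm_map_copy_discard(private_copy);
 *		return KERN_FAILURE;            // caller discards "copy"
 *	}
 *	... keep using private_copy, e.g. vm_map_copyout() it later ...
 */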
9240 
9241 static boolean_t
9242 vm_map_entry_is_overwritable(
9243 	vm_map_t        dst_map __unused,
9244 	vm_map_entry_t  entry)
9245 {
9246 	if (!(entry->protection & VM_PROT_WRITE)) {
9247 		/* can't overwrite if not writable */
9248 		return FALSE;
9249 	}
9250 #if !__x86_64__
9251 	if (entry->used_for_jit &&
9252 	    vm_map_cs_enforcement(dst_map) &&
9253 	    !dst_map->cs_debugged) {
9254 		/*
9255 		 * Can't overwrite a JIT region while cs_enforced
9256 		 * and not cs_debugged.
9257 		 */
9258 		return FALSE;
9259 	}
9260 
9261 #if __arm64e__
9262 	/* Do not allow overwrite HW assisted TPRO entries */
9263 	if (entry->used_for_tpro) {
9264 		return FALSE;
9265 	}
9266 #endif /* __arm64e__ */
9267 
9268 	if (entry->vme_permanent) {
9269 		if (entry->is_sub_map) {
9270 			/*
9271 			 * We can't tell if the submap contains "permanent"
9272 			 * entries within the range targeted by the caller.
9273 			 * The caller will have to check for that with
9274 			 * vm_map_overwrite_submap_recurse() for example.
9275 			 */
9276 		} else {
9277 			/*
9278 			 * Do not allow overwriting of a "permanent"
9279 			 * entry.
9280 			 */
9281 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9282 			    vm_map_entry_t, entry,
9283 			    vm_map_offset_t, entry->vme_start,
9284 			    vm_map_offset_t, entry->vme_end,
9285 			    vm_prot_t, entry->protection,
9286 			    vm_prot_t, entry->max_protection,
9287 			    int, VME_ALIAS(entry));
9288 			return FALSE;
9289 		}
9290 	}
9291 #endif /* !__x86_64__ */
9292 
9293 	if (entry->is_sub_map) {
9294 		/* remember not to assume every entry has a VM object... */
9295 	}
9296 
9297 
9298 	return TRUE;
9299 }
9300 
9301 static kern_return_t
9302 vm_map_overwrite_submap_recurse(
9303 	vm_map_t        dst_map,
9304 	vm_map_offset_t dst_addr,
9305 	vm_map_size_t   dst_size)
9306 {
9307 	vm_map_offset_t dst_end;
9308 	vm_map_entry_t  tmp_entry;
9309 	vm_map_entry_t  entry;
9310 	kern_return_t   result;
9311 	boolean_t       encountered_sub_map = FALSE;
9312 
9313 
9314 
9315 	/*
9316 	 *	Verify that the destination is all writeable
9317 	 *	initially.  We have to trunc the destination
9318 	 *	address and round the copy size or we'll end up
9319 	 *	splitting entries in strange ways.
9320 	 */
9321 
9322 	dst_end = vm_map_round_page(dst_addr + dst_size,
9323 	    VM_MAP_PAGE_MASK(dst_map));
9324 	vm_map_lock(dst_map);
9325 
9326 start_pass_1:
9327 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9328 		vm_map_unlock(dst_map);
9329 		return KERN_INVALID_ADDRESS;
9330 	}
9331 
9332 	vm_map_clip_start(dst_map,
9333 	    tmp_entry,
9334 	    vm_map_trunc_page(dst_addr,
9335 	    VM_MAP_PAGE_MASK(dst_map)));
9336 	if (tmp_entry->is_sub_map) {
9337 		/* clipping did unnest if needed */
9338 		assert(!tmp_entry->use_pmap);
9339 	}
9340 
9341 	for (entry = tmp_entry;;) {
9342 		vm_map_entry_t  next;
9343 
9344 		next = entry->vme_next;
9345 		while (entry->is_sub_map) {
9346 			vm_map_offset_t sub_start;
9347 			vm_map_offset_t sub_end;
9348 			vm_map_offset_t local_end;
9349 			vm_map_t        sub_map;
9350 
9351 			if (entry->in_transition) {
9352 				/*
9353 				 * Say that we are waiting, and wait for entry.
9354 				 */
9355 				entry->needs_wakeup = TRUE;
9356 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9357 
9358 				goto start_pass_1;
9359 			}
9360 
9361 			encountered_sub_map = TRUE;
9362 			sub_start = VME_OFFSET(entry);
9363 
9364 			if (entry->vme_end < dst_end) {
9365 				sub_end = entry->vme_end;
9366 			} else {
9367 				sub_end = dst_end;
9368 			}
9369 			sub_end -= entry->vme_start;
9370 			sub_end += VME_OFFSET(entry);
9371 			local_end = entry->vme_end;
9372 			sub_map = VME_SUBMAP(entry);
9373 			vm_map_reference(sub_map);
9374 			vm_map_unlock(dst_map);
9375 
9376 			result = vm_map_overwrite_submap_recurse(
9377 				sub_map,
9378 				sub_start,
9379 				sub_end - sub_start);
9380 
9381 			vm_map_deallocate(sub_map);
9382 			sub_map = VM_MAP_NULL;
9383 
9384 			if (result != KERN_SUCCESS) {
9385 				return result;
9386 			}
9387 			if (dst_end <= entry->vme_end) {
9388 				return KERN_SUCCESS;
9389 			}
9390 			vm_map_lock(dst_map);
9391 			if (!vm_map_lookup_entry(dst_map, local_end,
9392 			    &tmp_entry)) {
9393 				vm_map_unlock(dst_map);
9394 				return KERN_INVALID_ADDRESS;
9395 			}
9396 			entry = tmp_entry;
9397 			next = entry->vme_next;
9398 		}
9399 		assert(!entry->is_sub_map);
9400 
9401 		if (!(entry->protection & VM_PROT_WRITE)) {
9402 			vm_map_unlock(dst_map);
9403 			return KERN_PROTECTION_FAILURE;
9404 		}
9405 
9406 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9407 			vm_map_unlock(dst_map);
9408 			return KERN_PROTECTION_FAILURE;
9409 		}
9410 
9411 		/*
9412 		 *	If the entry is in transition, we must wait
9413 		 *	for it to exit that state.  Anything could happen
9414 		 *	when we unlock the map, so start over.
9415 		 */
9416 		if (entry->in_transition) {
9417 			/*
9418 			 * Say that we are waiting, and wait for entry.
9419 			 */
9420 			entry->needs_wakeup = TRUE;
9421 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9422 
9423 			goto start_pass_1;
9424 		}
9425 
9426 /*
9427  *		our range is contained completely within this map entry
9428  */
9429 		if (dst_end <= entry->vme_end) {
9430 			vm_map_unlock(dst_map);
9431 			return KERN_SUCCESS;
9432 		}
9433 /*
9434  *		check that range specified is contiguous region
9435  */
9436 		if ((next == vm_map_to_entry(dst_map)) ||
9437 		    (next->vme_start != entry->vme_end)) {
9438 			vm_map_unlock(dst_map);
9439 			return KERN_INVALID_ADDRESS;
9440 		}
9441 
9442 		/*
9443 		 *	Check for permanent objects in the destination.
9444 		 */
9445 		assert(!entry->is_sub_map);
9446 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9447 		    ((!VME_OBJECT(entry)->internal) ||
9448 		    (VME_OBJECT(entry)->true_share))) {
9449 			if (encountered_sub_map) {
9450 				vm_map_unlock(dst_map);
9451 				return KERN_FAILURE;
9452 			}
9453 		}
9454 
9455 
9456 		entry = next;
9457 	}/* for */
9458 	vm_map_unlock(dst_map);
9459 	return KERN_SUCCESS;
9460 }
9461 
9462 /*
9463  *	Routine:	vm_map_copy_overwrite
9464  *
9465  *	Description:
9466  *		Copy the memory described by the map copy
9467  *		object (copy; returned by vm_map_copyin) onto
9468  *		the specified destination region (dst_map, dst_addr).
9469  *		The destination must be writeable.
9470  *
9471  *		Unlike vm_map_copyout, this routine actually
9472  *		writes over previously-mapped memory.  If the
9473  *		previous mapping was to a permanent (user-supplied)
9474  *		memory object, it is preserved.
9475  *
9476  *		The attributes (protection and inheritance) of the
9477  *		destination region are preserved.
9478  *
9479  *		If successful, consumes the copy object.
9480  *		Otherwise, the caller is responsible for it.
9481  *
9482  *	Implementation notes:
9483  *		To overwrite aligned temporary virtual memory, it is
9484  *		sufficient to remove the previous mapping and insert
9485  *		the new copy.  This replacement is done either on
9486  *		the whole region (if no permanent virtual memory
9487  *		objects are embedded in the destination region) or
9488  *		in individual map entries.
9489  *
9490  *		To overwrite permanent virtual memory, it is necessary
9491  *		to copy each page, as the external memory management
9492  *		interface currently does not provide any optimizations.
9493  *
9494  *		Unaligned memory also has to be copied.  It is possible
9495  *		to use 'vm_trickery' to copy the aligned data.  This is
9496  *		not done but not hard to implement.
9497  *
9498  *		Once a page of permanent memory has been overwritten,
9499  *		it is impossible to interrupt this function; otherwise,
9500  *		the call would be neither atomic nor location-independent.
9501  *		The kernel-state portion of a user thread must be
9502  *		interruptible.
9503  *
9504  *		It may be expensive to forward all requests that might
9505  *		overwrite permanent memory (vm_write, vm_copy) to
9506  *		uninterruptible kernel threads.  This routine may be
9507  *		called by interruptible threads; however, success is
9508  *		not guaranteed -- if the request cannot be performed
9509  *		atomically and interruptibly, an error indication is
9510  *		returned.
9511  *
9512  *		Callers of this function must call vm_map_copy_require on
9513  *		previously created vm_map_copy_t or pass a newly created
9514  *		one to ensure that it hasn't been forged.
9515  */
9516 static kern_return_t
9517 vm_map_copy_overwrite_nested(
9518 	vm_map_t                dst_map,
9519 	vm_map_address_t        dst_addr,
9520 	vm_map_copy_t           copy,
9521 	boolean_t               interruptible,
9522 	pmap_t                  pmap,
9523 	boolean_t               discard_on_success)
9524 {
9525 	vm_map_offset_t         dst_end;
9526 	vm_map_entry_t          tmp_entry;
9527 	vm_map_entry_t          entry;
9528 	kern_return_t           kr;
9529 	boolean_t               aligned = TRUE;
9530 	boolean_t               contains_permanent_objects = FALSE;
9531 	boolean_t               encountered_sub_map = FALSE;
9532 	vm_map_offset_t         base_addr;
9533 	vm_map_size_t           copy_size;
9534 	vm_map_size_t           total_size;
9535 	uint16_t                copy_page_shift;
9536 
9537 	/*
9538 	 *	Check for special kernel buffer allocated
9539 	 *	by new_ipc_kmsg_copyin.
9540 	 */
9541 
9542 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9543 		kr = vm_map_copyout_kernel_buffer(
9544 			dst_map, &dst_addr,
9545 			copy, copy->size, TRUE, discard_on_success);
9546 		return kr;
9547 	}
9548 
9549 	/*
9550 	 *      Only works for entry lists at the moment.  Will
9551 	 *	support page lists later.
9552 	 */
9553 
9554 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9555 
9556 	if (copy->size == 0) {
9557 		if (discard_on_success) {
9558 			vm_map_copy_discard(copy);
9559 		}
9560 		return KERN_SUCCESS;
9561 	}
9562 
9563 	copy_page_shift = copy->cpy_hdr.page_shift;
9564 
9565 	/*
9566 	 *	Verify that the destination is all writeable
9567 	 *	initially.  We have to trunc the destination
9568 	 *	address and round the copy size or we'll end up
9569 	 *	splitting entries in strange ways.
9570 	 */
9571 
9572 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9573 	    VM_MAP_PAGE_MASK(dst_map)) ||
9574 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9575 	    VM_MAP_PAGE_MASK(dst_map)) ||
9576 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9577 	    VM_MAP_PAGE_MASK(dst_map)) ||
9578 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9579 		aligned = FALSE;
9580 		dst_end = vm_map_round_page(dst_addr + copy->size,
9581 		    VM_MAP_PAGE_MASK(dst_map));
9582 	} else {
9583 		dst_end = dst_addr + copy->size;
9584 	}
9585 
9586 	vm_map_lock(dst_map);
9587 
9588 	/* LP64todo - remove this check when vm_map_commpage64()
9589 	 * no longer has to stuff in a map_entry for the commpage
9590 	 * above the map's max_offset.
9591 	 */
9592 	if (dst_addr >= dst_map->max_offset) {
9593 		vm_map_unlock(dst_map);
9594 		return KERN_INVALID_ADDRESS;
9595 	}
9596 
9597 start_pass_1:
9598 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9599 		vm_map_unlock(dst_map);
9600 		return KERN_INVALID_ADDRESS;
9601 	}
9602 	vm_map_clip_start(dst_map,
9603 	    tmp_entry,
9604 	    vm_map_trunc_page(dst_addr,
9605 	    VM_MAP_PAGE_MASK(dst_map)));
9606 	for (entry = tmp_entry;;) {
9607 		vm_map_entry_t  next = entry->vme_next;
9608 
9609 		while (entry->is_sub_map) {
9610 			vm_map_offset_t sub_start;
9611 			vm_map_offset_t sub_end;
9612 			vm_map_offset_t local_end;
9613 
9614 			if (entry->in_transition) {
9615 				/*
9616 				 * Say that we are waiting, and wait for entry.
9617 				 */
9618 				entry->needs_wakeup = TRUE;
9619 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9620 
9621 				goto start_pass_1;
9622 			}
9623 
9624 			local_end = entry->vme_end;
9625 			if (!(entry->needs_copy)) {
9626 				vm_map_t sub_map = VM_MAP_NULL;
9627 
9628 				/* if needs_copy we are a COW submap */
9629 				/* in such a case we just replace so */
9630 				/* there is no need for the follow-  */
9631 				/* ing check.                        */
9632 				encountered_sub_map = TRUE;
9633 				sub_start = VME_OFFSET(entry);
9634 
9635 				if (entry->vme_end < dst_end) {
9636 					sub_end = entry->vme_end;
9637 				} else {
9638 					sub_end = dst_end;
9639 				}
9640 				sub_end -= entry->vme_start;
9641 				sub_end += VME_OFFSET(entry);
9642 				sub_map = VME_SUBMAP(entry);
9643 				vm_map_reference(sub_map);
9644 				vm_map_unlock(dst_map);
9645 
9646 				kr = vm_map_overwrite_submap_recurse(
9647 					sub_map,
9648 					sub_start,
9649 					sub_end - sub_start);
9650 
9651 				vm_map_deallocate(sub_map);
9652 				sub_map = VM_MAP_NULL;
9653 				if (kr != KERN_SUCCESS) {
9654 					return kr;
9655 				}
9656 				vm_map_lock(dst_map);
9657 			}
9658 
9659 			if (dst_end <= entry->vme_end) {
9660 				goto start_overwrite;
9661 			}
9662 			if (!vm_map_lookup_entry(dst_map, local_end,
9663 			    &entry)) {
9664 				vm_map_unlock(dst_map);
9665 				return KERN_INVALID_ADDRESS;
9666 			}
9667 			next = entry->vme_next;
9668 		}
9669 		assert(!entry->is_sub_map);
9670 
9671 		if (!(entry->protection & VM_PROT_WRITE)) {
9672 			vm_map_unlock(dst_map);
9673 			return KERN_PROTECTION_FAILURE;
9674 		}
9675 
9676 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9677 			vm_map_unlock(dst_map);
9678 			return KERN_PROTECTION_FAILURE;
9679 		}
9680 
9681 		/*
9682 		 *	If the entry is in transition, we must wait
9683 		 *	for it to exit that state.  Anything could happen
9684 		 *	when we unlock the map, so start over.
9685 		 */
9686 		if (entry->in_transition) {
9687 			/*
9688 			 * Say that we are waiting, and wait for entry.
9689 			 */
9690 			entry->needs_wakeup = TRUE;
9691 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9692 
9693 			goto start_pass_1;
9694 		}
9695 
9696 /*
9697  *		our range is contained completely within this map entry
9698  */
9699 		if (dst_end <= entry->vme_end) {
9700 			break;
9701 		}
9702 /*
9703  *		check that range specified is contiguous region
9704  */
9705 		if ((next == vm_map_to_entry(dst_map)) ||
9706 		    (next->vme_start != entry->vme_end)) {
9707 			vm_map_unlock(dst_map);
9708 			return KERN_INVALID_ADDRESS;
9709 		}
9710 
9711 
9712 		/*
9713 		 *	Check for permanent objects in the destination.
9714 		 */
9715 		assert(!entry->is_sub_map);
9716 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9717 		    ((!VME_OBJECT(entry)->internal) ||
9718 		    (VME_OBJECT(entry)->true_share))) {
9719 			contains_permanent_objects = TRUE;
9720 		}
9721 
9722 		entry = next;
9723 	}/* for */
9724 
9725 start_overwrite:
9726 	/*
9727 	 *	If there are permanent objects in the destination, then
9728 	 *	the copy cannot be interrupted.
9729 	 */
9730 
9731 	if (interruptible && contains_permanent_objects) {
9732 		vm_map_unlock(dst_map);
9733 		return KERN_FAILURE;   /* XXX */
9734 	}
9735 
9736 	/*
9737 	 *
9738 	 *	Make a second pass, overwriting the data
9739 	 *	At the beginning of each loop iteration,
9740 	 *	the next entry to be overwritten is "tmp_entry"
9741 	 *	(initially, the value returned from the lookup above),
9742 	 *	and the starting address expected in that entry
9743 	 *	is "start".
9744 	 */
9745 
9746 	total_size = copy->size;
9747 	if (encountered_sub_map) {
9748 		copy_size = 0;
9749 		/* re-calculate tmp_entry since we've had the map */
9750 		/* unlocked */
9751 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9752 			vm_map_unlock(dst_map);
9753 			return KERN_INVALID_ADDRESS;
9754 		}
9755 	} else {
9756 		copy_size = copy->size;
9757 	}
9758 
9759 	base_addr = dst_addr;
9760 	while (TRUE) {
9761 		/* deconstruct the copy object and do in parts */
9762 		/* only in sub_map, interruptible case */
9763 		vm_map_entry_t  copy_entry;
9764 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9765 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9766 		int             nentries;
9767 		int             remaining_entries = 0;
9768 		vm_map_offset_t new_offset = 0;
9769 
9770 		for (entry = tmp_entry; copy_size == 0;) {
9771 			vm_map_entry_t  next;
9772 
9773 			next = entry->vme_next;
9774 
9775 			/* tmp_entry and base address are moved along */
9776 			/* each time we encounter a sub-map.  Otherwise */
9777 			/* entry can outpace tmp_entry, and the copy_size */
9778 			/* may reflect the distance between them */
9779 			/* if the current entry is found to be in transition */
9780 			/* we will start over at the beginning or the last */
9781 			/* encounter of a submap as dictated by base_addr */
9782 			/* we will zero copy_size accordingly. */
9783 			if (entry->in_transition) {
9784 				/*
9785 				 * Say that we are waiting, and wait for entry.
9786 				 */
9787 				entry->needs_wakeup = TRUE;
9788 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9789 
9790 				if (!vm_map_lookup_entry(dst_map, base_addr,
9791 				    &tmp_entry)) {
9792 					vm_map_unlock(dst_map);
9793 					return KERN_INVALID_ADDRESS;
9794 				}
9795 				copy_size = 0;
9796 				entry = tmp_entry;
9797 				continue;
9798 			}
9799 			if (entry->is_sub_map) {
9800 				vm_map_offset_t sub_start;
9801 				vm_map_offset_t sub_end;
9802 				vm_map_offset_t local_end;
9803 				vm_map_t        sub_map = VM_MAP_NULL;
9804 				bool            use_pmap;
9805 
9806 				if (entry->needs_copy) {
9807 					/* if this is a COW submap */
9808 					/* just back the range with an */
9809 					/* anonymous entry */
9810 					assert(!entry->vme_permanent);
9811 					if (entry->vme_end < dst_end) {
9812 						sub_end = entry->vme_end;
9813 					} else {
9814 						sub_end = dst_end;
9815 					}
9816 					if (entry->vme_start < base_addr) {
9817 						sub_start = base_addr;
9818 					} else {
9819 						sub_start = entry->vme_start;
9820 					}
9821 					vm_map_clip_end(
9822 						dst_map, entry, sub_end);
9823 					vm_map_clip_start(
9824 						dst_map, entry, sub_start);
9825 					assert(!entry->use_pmap);
9826 					assert(!entry->iokit_acct);
9827 					entry->use_pmap = TRUE;
9828 					vm_map_deallocate(VME_SUBMAP(entry));
9829 					assert(!entry->vme_permanent);
9830 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9831 					VME_OFFSET_SET(entry, 0);
9832 					entry->is_shared = FALSE;
9833 					entry->needs_copy = FALSE;
9834 					entry->protection = VM_PROT_DEFAULT;
9835 					entry->max_protection = VM_PROT_ALL;
9836 					entry->wired_count = 0;
9837 					entry->user_wired_count = 0;
9838 					if (entry->inheritance
9839 					    == VM_INHERIT_SHARE) {
9840 						entry->inheritance = VM_INHERIT_COPY;
9841 					}
9842 					continue;
9843 				}
9844 				/* first take care of any non-sub_map */
9845 				/* entries to send */
9846 				if (base_addr < entry->vme_start) {
9847 					/* stuff to send */
9848 					copy_size =
9849 					    entry->vme_start - base_addr;
9850 					break;
9851 				}
9852 				sub_start = VME_OFFSET(entry);
9853 
9854 				if (entry->vme_end < dst_end) {
9855 					sub_end = entry->vme_end;
9856 				} else {
9857 					sub_end = dst_end;
9858 				}
9859 				sub_end -= entry->vme_start;
9860 				sub_end += VME_OFFSET(entry);
9861 				local_end = entry->vme_end;
9862 				use_pmap = entry->use_pmap;
9863 				sub_map = VME_SUBMAP(entry);
9864 				vm_map_reference(sub_map);
9865 				vm_map_unlock(dst_map);
9866 				copy_size = sub_end - sub_start;
9867 
9868 				/* adjust the copy object */
9869 				if (total_size > copy_size) {
9870 					vm_map_size_t   local_size = 0;
9871 					vm_map_size_t   entry_size;
9872 
9873 					nentries = 1;
9874 					new_offset = copy->offset;
9875 					copy_entry = vm_map_copy_first_entry(copy);
9876 					while (copy_entry !=
9877 					    vm_map_copy_to_entry(copy)) {
9878 						entry_size = copy_entry->vme_end -
9879 						    copy_entry->vme_start;
9880 						if ((local_size < copy_size) &&
9881 						    ((local_size + entry_size)
9882 						    >= copy_size)) {
9883 							vm_map_copy_clip_end(copy,
9884 							    copy_entry,
9885 							    copy_entry->vme_start +
9886 							    (copy_size - local_size));
9887 							entry_size = copy_entry->vme_end -
9888 							    copy_entry->vme_start;
9889 							local_size += entry_size;
9890 							new_offset += entry_size;
9891 						}
9892 						if (local_size >= copy_size) {
9893 							next_copy = copy_entry->vme_next;
9894 							copy_entry->vme_next =
9895 							    vm_map_copy_to_entry(copy);
9896 							previous_prev =
9897 							    copy->cpy_hdr.links.prev;
9898 							copy->cpy_hdr.links.prev = copy_entry;
9899 							copy->size = copy_size;
9900 							remaining_entries =
9901 							    copy->cpy_hdr.nentries;
9902 							remaining_entries -= nentries;
9903 							copy->cpy_hdr.nentries = nentries;
9904 							break;
9905 						} else {
9906 							local_size += entry_size;
9907 							new_offset += entry_size;
9908 							nentries++;
9909 						}
9910 						copy_entry = copy_entry->vme_next;
9911 					}
9912 				}
9913 
9914 				if ((use_pmap) && (pmap == NULL)) {
9915 					kr = vm_map_copy_overwrite_nested(
9916 						sub_map,
9917 						sub_start,
9918 						copy,
9919 						interruptible,
9920 						sub_map->pmap,
9921 						TRUE);
9922 				} else if (pmap != NULL) {
9923 					kr = vm_map_copy_overwrite_nested(
9924 						sub_map,
9925 						sub_start,
9926 						copy,
9927 						interruptible, pmap,
9928 						TRUE);
9929 				} else {
9930 					kr = vm_map_copy_overwrite_nested(
9931 						sub_map,
9932 						sub_start,
9933 						copy,
9934 						interruptible,
9935 						dst_map->pmap,
9936 						TRUE);
9937 				}
9938 
9939 				vm_map_deallocate(sub_map);
9940 				sub_map = VM_MAP_NULL;
9941 
9942 				if (kr != KERN_SUCCESS) {
9943 					if (next_copy != NULL) {
9944 						copy->cpy_hdr.nentries +=
9945 						    remaining_entries;
9946 						copy->cpy_hdr.links.prev->vme_next =
9947 						    next_copy;
9948 						copy->cpy_hdr.links.prev
9949 						        = previous_prev;
9950 						copy->size = total_size;
9951 					}
9952 					return kr;
9953 				}
9954 				if (dst_end <= local_end) {
9955 					return KERN_SUCCESS;
9956 				}
9957 				/* otherwise copy no longer exists, it was */
9958 				/* destroyed after successful copy_overwrite */
9959 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9960 				copy->offset = new_offset;
9961 				copy->cpy_hdr.page_shift = copy_page_shift;
9962 
9963 				total_size -= copy_size;
9964 				copy_size = 0;
9965 				/* put back remainder of copy in container */
9966 				if (next_copy != NULL) {
9967 					copy->cpy_hdr.nentries = remaining_entries;
9968 					copy->cpy_hdr.links.next = next_copy;
9969 					copy->cpy_hdr.links.prev = previous_prev;
9970 					copy->size = total_size;
9971 					next_copy->vme_prev =
9972 					    vm_map_copy_to_entry(copy);
9973 					next_copy = NULL;
9974 				}
9975 				base_addr = local_end;
9976 				vm_map_lock(dst_map);
9977 				if (!vm_map_lookup_entry(dst_map,
9978 				    local_end, &tmp_entry)) {
9979 					vm_map_unlock(dst_map);
9980 					return KERN_INVALID_ADDRESS;
9981 				}
9982 				entry = tmp_entry;
9983 				continue;
9984 			}
9985 			assert(!entry->is_sub_map);
9986 
9987 			if (dst_end <= entry->vme_end) {
9988 				copy_size = dst_end - base_addr;
9989 				break;
9990 			}
9991 
9992 			if ((next == vm_map_to_entry(dst_map)) ||
9993 			    (next->vme_start != entry->vme_end)) {
9994 				vm_map_unlock(dst_map);
9995 				return KERN_INVALID_ADDRESS;
9996 			}
9997 
9998 			entry = next;
9999 		}/* for */
10000 
10001 		next_copy = NULL;
10002 		nentries = 1;
10003 
10004 		/* adjust the copy object */
10005 		if (total_size > copy_size) {
10006 			vm_map_size_t   local_size = 0;
10007 			vm_map_size_t   entry_size;
10008 
10009 			new_offset = copy->offset;
10010 			copy_entry = vm_map_copy_first_entry(copy);
10011 			while (copy_entry != vm_map_copy_to_entry(copy)) {
10012 				entry_size = copy_entry->vme_end -
10013 				    copy_entry->vme_start;
10014 				if ((local_size < copy_size) &&
10015 				    ((local_size + entry_size)
10016 				    >= copy_size)) {
10017 					vm_map_copy_clip_end(copy, copy_entry,
10018 					    copy_entry->vme_start +
10019 					    (copy_size - local_size));
10020 					entry_size = copy_entry->vme_end -
10021 					    copy_entry->vme_start;
10022 					local_size += entry_size;
10023 					new_offset += entry_size;
10024 				}
10025 				if (local_size >= copy_size) {
10026 					next_copy = copy_entry->vme_next;
10027 					copy_entry->vme_next =
10028 					    vm_map_copy_to_entry(copy);
10029 					previous_prev =
10030 					    copy->cpy_hdr.links.prev;
10031 					copy->cpy_hdr.links.prev = copy_entry;
10032 					copy->size = copy_size;
10033 					remaining_entries =
10034 					    copy->cpy_hdr.nentries;
10035 					remaining_entries -= nentries;
10036 					copy->cpy_hdr.nentries = nentries;
10037 					break;
10038 				} else {
10039 					local_size += entry_size;
10040 					new_offset += entry_size;
10041 					nentries++;
10042 				}
10043 				copy_entry = copy_entry->vme_next;
10044 			}
10045 		}
10046 
10047 		if (aligned) {
10048 			pmap_t  local_pmap;
10049 
10050 			if (pmap) {
10051 				local_pmap = pmap;
10052 			} else {
10053 				local_pmap = dst_map->pmap;
10054 			}
10055 
10056 			if ((kr =  vm_map_copy_overwrite_aligned(
10057 				    dst_map, tmp_entry, copy,
10058 				    base_addr, local_pmap)) != KERN_SUCCESS) {
10059 				if (next_copy != NULL) {
10060 					copy->cpy_hdr.nentries +=
10061 					    remaining_entries;
10062 					copy->cpy_hdr.links.prev->vme_next =
10063 					    next_copy;
10064 					copy->cpy_hdr.links.prev =
10065 					    previous_prev;
10066 					copy->size += copy_size;
10067 				}
10068 				return kr;
10069 			}
10070 			vm_map_unlock(dst_map);
10071 		} else {
10072 			/*
10073 			 * Performance gain:
10074 			 *
10075 			 * if the copy and dst address are misaligned but the same
10076 			 * offset within the page we can copy_not_aligned the
10077 			 * misaligned parts and copy aligned the rest.  If they are
10078 			 * aligned but len is unaligned we simply need to copy
10079 			 * the end bit unaligned.  We'll need to split the misaligned
10080 			 * bits of the region in this case !
10081 			 */
10082 			/* ALWAYS UNLOCKS THE dst_map MAP */
10083 			kr = vm_map_copy_overwrite_unaligned(
10084 				dst_map,
10085 				tmp_entry,
10086 				copy,
10087 				base_addr,
10088 				discard_on_success);
10089 			if (kr != KERN_SUCCESS) {
10090 				if (next_copy != NULL) {
10091 					copy->cpy_hdr.nentries +=
10092 					    remaining_entries;
10093 					copy->cpy_hdr.links.prev->vme_next =
10094 					    next_copy;
10095 					copy->cpy_hdr.links.prev =
10096 					    previous_prev;
10097 					copy->size += copy_size;
10098 				}
10099 				return kr;
10100 			}
10101 		}
10102 		total_size -= copy_size;
10103 		if (total_size == 0) {
10104 			break;
10105 		}
10106 		base_addr += copy_size;
10107 		copy_size = 0;
10108 		copy->offset = new_offset;
10109 		if (next_copy != NULL) {
10110 			copy->cpy_hdr.nentries = remaining_entries;
10111 			copy->cpy_hdr.links.next = next_copy;
10112 			copy->cpy_hdr.links.prev = previous_prev;
10113 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
10114 			copy->size = total_size;
10115 		}
10116 		vm_map_lock(dst_map);
10117 		while (TRUE) {
10118 			if (!vm_map_lookup_entry(dst_map,
10119 			    base_addr, &tmp_entry)) {
10120 				vm_map_unlock(dst_map);
10121 				return KERN_INVALID_ADDRESS;
10122 			}
10123 			if (tmp_entry->in_transition) {
10124 				entry->needs_wakeup = TRUE;
10125 				vm_map_entry_wait(dst_map, THREAD_UNINT);
10126 			} else {
10127 				break;
10128 			}
10129 		}
10130 		vm_map_clip_start(dst_map,
10131 		    tmp_entry,
10132 		    vm_map_trunc_page(base_addr,
10133 		    VM_MAP_PAGE_MASK(dst_map)));
10134 
10135 		entry = tmp_entry;
10136 	} /* while */
10137 
10138 	/*
10139 	 *	Throw away the vm_map_copy object
10140 	 */
10141 	if (discard_on_success) {
10142 		vm_map_copy_discard(copy);
10143 	}
10144 
10145 	return KERN_SUCCESS;
10146 }/* vm_map_copy_overwrite */
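/*
 * Editorial sketch (not part of the original source, not compiled): the
 * chunking pattern used by the loop above, modelled in plain C.  The
 * "copy" map is consumed one chunk at a time; after each chunk the base
 * address advances and the remaining total shrinks until nothing is left.
 * All names below are invented for illustration.
 */
#if 0
#include <stdint.h>

static void
sketch_chunked_overwrite(uint64_t base_addr, uint64_t total_size,
    uint64_t max_chunk,
    void (*write_chunk)(uint64_t addr, uint64_t size))
{
	while (total_size > 0) {
		uint64_t this_size =
		    (total_size < max_chunk) ? total_size : max_chunk;

		write_chunk(base_addr, this_size);      /* one pass of the loop */
		total_size -= this_size;
		base_addr  += this_size;
	}
}
#endif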
10147 
10148 static __attribute__((always_inline, warn_unused_result))
10149 kern_return_t
10150 vm_map_copy_addr_size_sanitize(
10151 	vm_map_t                map,
10152 	vm_map_offset_ut        addr_u,
10153 	vm_map_size_ut          size_u,
10154 	vm_sanitize_caller_t    vm_sanitize_caller,
10155 	vm_map_offset_t        *addr,
10156 	vm_map_offset_t        *end,
10157 	vm_map_size_t          *size)
10158 {
10159 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
10160 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
10161 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
10162 
10163 	return vm_sanitize_addr_size(addr_u, size_u,
10164 	           vm_sanitize_caller, map,
10165 	           flags,
10166 	           addr, end, size);
10167 }
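/*
 * Editorial sketch (not part of the original source, not compiled): a
 * userspace model of what an addr/size sanitizer of this shape has to
 * verify -- zero-size fallthrough, arithmetic overflow of addr + size,
 * and containment in the map's valid range.  The function name and the
 * map_min/map_max parameters are assumptions for the example only.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
sketch_sanitize_addr_size(uint64_t addr, uint64_t size,
    uint64_t map_min, uint64_t map_max, uint64_t *end)
{
	if (size == 0) {
		*end = addr;                    /* zero size falls through */
		return true;
	}
	if (addr + size < addr) {               /* overflow */
		return false;
	}
	*end = addr + size;
	return addr >= map_min && *end <= map_max;
}
#endif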
10168 
10169 kern_return_t
10170 vm_map_copy_overwrite(
10171 	vm_map_t                dst_map,
10172 	vm_map_offset_ut        dst_addr_u,
10173 	vm_map_copy_t           copy,
10174 	vm_map_size_ut          copy_size_u,
10175 	boolean_t               interruptible)
10176 {
10177 	vm_map_offset_t dst_addr, dst_end;
10178 	vm_map_size_t   copy_size;
10179 	vm_map_size_t   head_size, tail_size;
10180 	vm_map_copy_t   head_copy, tail_copy;
10181 	vm_map_offset_t head_addr, tail_addr;
10182 	vm_map_entry_t  entry;
10183 	kern_return_t   kr;
10184 	vm_map_offset_t effective_page_mask, effective_page_size;
10185 	uint16_t        copy_page_shift;
10186 
10187 	head_size = 0;
10188 	tail_size = 0;
10189 	head_copy = NULL;
10190 	tail_copy = NULL;
10191 	head_addr = 0;
10192 	tail_addr = 0;
10193 
10194 	/*
10195 	 *	Check for null copy object.
10196 	 */
10197 	if (copy == VM_MAP_COPY_NULL) {
10198 		return KERN_SUCCESS;
10199 	}
10200 
10201 	/*
10202 	 * Sanitize any input parameters that are addr/size/prot/inherit
10203 	 */
10204 	kr = vm_map_copy_addr_size_sanitize(
10205 		dst_map,
10206 		dst_addr_u,
10207 		copy_size_u,
10208 		VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
10209 		&dst_addr,
10210 		&dst_end,
10211 		&copy_size);
10212 	if (__improbable(kr != KERN_SUCCESS)) {
10213 		return vm_sanitize_get_kr(kr);
10214 	}
10215 
10216 	/*
10217 	 * Assert that the vm_map_copy is coming from the right
10218 	 * zone and hasn't been forged
10219 	 */
10220 	vm_map_copy_require(copy);
10221 
10222 	if (interruptible ||
10223 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
10224 		/*
10225 		 * We can't split the "copy" map if we're interruptible
10226 		 * or if we don't have a "copy" map...
10227 		 */
10228 blunt_copy:
10229 		kr = vm_map_copy_overwrite_nested(dst_map,
10230 		    dst_addr,
10231 		    copy,
10232 		    interruptible,
10233 		    (pmap_t) NULL,
10234 		    TRUE);
10235 		if (kr) {
10236 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10237 		}
10238 		return kr;
10239 	}
10240 
10241 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10242 	if (copy_page_shift < PAGE_SHIFT ||
10243 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10244 		goto blunt_copy;
10245 	}
10246 
10247 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10248 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10249 	} else {
10250 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10251 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10252 		    effective_page_mask);
10253 	}
10254 	effective_page_size = effective_page_mask + 1;
10255 
10256 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10257 		/*
10258 		 * Too small to bother with optimizing...
10259 		 */
10260 		goto blunt_copy;
10261 	}
10262 
10263 	if ((dst_addr & effective_page_mask) !=
10264 	    (copy->offset & effective_page_mask)) {
10265 		/*
10266 		 * Incompatible mis-alignment of source and destination...
10267 		 */
10268 		goto blunt_copy;
10269 	}
10270 
10271 	/*
10272 	 * Proper alignment or identical mis-alignment at the beginning.
10273 	 * Let's try and do a small unaligned copy first (if needed)
10274 	 * and then an aligned copy for the rest.
10275 	 */
10276 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10277 		head_addr = dst_addr;
10278 		head_size = (effective_page_size -
10279 		    (copy->offset & effective_page_mask));
10280 		head_size = MIN(head_size, copy_size);
10281 	}
10282 	if (!vm_map_page_aligned(copy->offset + copy_size,
10283 	    effective_page_mask)) {
10284 		/*
10285 		 * Mis-alignment at the end.
10286 		 * Do an aligned copy up to the last page and
10287 		 * then an unaligned copy for the remaining bytes.
10288 		 */
10289 		tail_size = ((copy->offset + copy_size) &
10290 		    effective_page_mask);
10291 		tail_size = MIN(tail_size, copy_size);
10292 		tail_addr = dst_addr + copy_size - tail_size;
10293 		assert(tail_addr >= head_addr + head_size);
10294 	}
10295 	assert(head_size + tail_size <= copy_size);
10296 
10297 	if (head_size + tail_size == copy_size) {
10298 		/*
10299 		 * It's all unaligned, no optimization possible...
10300 		 */
10301 		goto blunt_copy;
10302 	}
10303 
10304 	/*
10305 	 * Can't optimize if there are any submaps in the
10306 	 * destination due to the way we free the "copy" map
10307 	 * progressively in vm_map_copy_overwrite_nested()
10308 	 * in that case.
10309 	 */
10310 	vm_map_lock_read(dst_map);
10311 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10312 		vm_map_unlock_read(dst_map);
10313 		goto blunt_copy;
10314 	}
10315 	for (;
10316 	    (entry != vm_map_to_entry(dst_map) &&
10317 	    entry->vme_start < dst_addr + copy_size);
10318 	    entry = entry->vme_next) {
10319 		if (entry->is_sub_map) {
10320 			vm_map_unlock_read(dst_map);
10321 			goto blunt_copy;
10322 		}
10323 	}
10324 	vm_map_unlock_read(dst_map);
10325 
10326 	if (head_size) {
10327 		/*
10328 		 * Unaligned copy of the first "head_size" bytes, to reach
10329 		 * a page boundary.
10330 		 */
10331 
10332 		/*
10333 		 * Extract "head_copy" out of "copy".
10334 		 */
10335 		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10336 		head_copy->cpy_hdr.entries_pageable =
10337 		    copy->cpy_hdr.entries_pageable;
10338 		head_copy->cpy_hdr.page_shift = copy_page_shift;
10339 
10340 		entry = vm_map_copy_first_entry(copy);
10341 		if (entry->vme_end < copy->offset + head_size) {
10342 			head_size = entry->vme_end - copy->offset;
10343 		}
10344 
10345 		head_copy->offset = copy->offset;
10346 		head_copy->size = head_size;
10347 		copy->offset += head_size;
10348 		copy->size -= head_size;
10349 		copy_size -= head_size;
10350 		assert(copy_size > 0);
10351 
10352 		vm_map_copy_clip_end(copy, entry, copy->offset);
10353 		vm_map_copy_entry_unlink(copy, entry);
10354 		vm_map_copy_entry_link(head_copy,
10355 		    vm_map_copy_to_entry(head_copy),
10356 		    entry);
10357 
10358 		/*
10359 		 * Do the unaligned copy.
10360 		 */
10361 		kr = vm_map_copy_overwrite_nested(dst_map,
10362 		    head_addr,
10363 		    head_copy,
10364 		    interruptible,
10365 		    (pmap_t) NULL,
10366 		    FALSE);
10367 		if (kr != KERN_SUCCESS) {
10368 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10369 			goto done;
10370 		}
10371 	}
10372 
10373 	if (tail_size) {
10374 		/*
10375 		 * Extract "tail_copy" out of "copy".
10376 		 */
10377 		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10378 		tail_copy->cpy_hdr.entries_pageable =
10379 		    copy->cpy_hdr.entries_pageable;
10380 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
10381 
10382 		tail_copy->offset = copy->offset + copy_size - tail_size;
10383 		tail_copy->size = tail_size;
10384 
10385 		copy->size -= tail_size;
10386 		copy_size -= tail_size;
10387 		assert(copy_size > 0);
10388 
10389 		entry = vm_map_copy_last_entry(copy);
10390 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10391 		entry = vm_map_copy_last_entry(copy);
10392 		vm_map_copy_entry_unlink(copy, entry);
10393 		vm_map_copy_entry_link(tail_copy,
10394 		    vm_map_copy_last_entry(tail_copy),
10395 		    entry);
10396 	}
10397 
10398 	/*
10399 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10400 	 * we want to avoid TOCTOU issues w.r.t copy->size but
10401 	 * we don't need to change vm_map_copy_overwrite_nested()
10402 	 * and all other vm_map_copy_overwrite variants.
10403 	 *
10404 	 * So we assign the original copy_size that was passed into
10405 	 * this routine back to copy.
10406 	 *
10407 	 * This use of local 'copy_size' passed into this routine is
10408 	 * to try and protect against TOCTOU attacks where the kernel
10409 	 * has been exploited. We don't expect this to be an issue
10410 	 * during normal system operation.
10411 	 */
10412 	assertf(copy->size == copy_size,
10413 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10414 	copy->size = copy_size;
10415 
10416 	/*
10417 	 * Copy most (or possibly all) of the data.
10418 	 */
10419 	kr = vm_map_copy_overwrite_nested(dst_map,
10420 	    dst_addr + head_size,
10421 	    copy,
10422 	    interruptible,
10423 	    (pmap_t) NULL,
10424 	    FALSE);
10425 	if (kr != KERN_SUCCESS) {
10426 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10427 		goto done;
10428 	}
10429 
10430 	if (tail_size) {
10431 		kr = vm_map_copy_overwrite_nested(dst_map,
10432 		    tail_addr,
10433 		    tail_copy,
10434 		    interruptible,
10435 		    (pmap_t) NULL,
10436 		    FALSE);
10437 		if (kr) {
10438 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10439 		}
10440 	}
10441 
10442 done:
10443 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10444 	if (kr == KERN_SUCCESS) {
10445 		/*
10446 		 * Discard all the copy maps.
10447 		 */
10448 		if (head_copy) {
10449 			vm_map_copy_discard(head_copy);
10450 			head_copy = NULL;
10451 		}
10452 		vm_map_copy_discard(copy);
10453 		if (tail_copy) {
10454 			vm_map_copy_discard(tail_copy);
10455 			tail_copy = NULL;
10456 		}
10457 	} else {
10458 		/*
10459 		 * Re-assemble the original copy map.
10460 		 */
10461 		if (head_copy) {
10462 			entry = vm_map_copy_first_entry(head_copy);
10463 			vm_map_copy_entry_unlink(head_copy, entry);
10464 			vm_map_copy_entry_link(copy,
10465 			    vm_map_copy_to_entry(copy),
10466 			    entry);
10467 			copy->offset -= head_size;
10468 			copy->size += head_size;
10469 			vm_map_copy_discard(head_copy);
10470 			head_copy = NULL;
10471 		}
10472 		if (tail_copy) {
10473 			entry = vm_map_copy_last_entry(tail_copy);
10474 			vm_map_copy_entry_unlink(tail_copy, entry);
10475 			vm_map_copy_entry_link(copy,
10476 			    vm_map_copy_last_entry(copy),
10477 			    entry);
10478 			copy->size += tail_size;
10479 			vm_map_copy_discard(tail_copy);
10480 			tail_copy = NULL;
10481 		}
10482 	}
10483 	return kr;
10484 }
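/*
 * Editorial sketch (not part of the original source, not compiled): the
 * head/tail split computed by vm_map_copy_overwrite() above, modelled
 * with plain integers.  When the destination address and the copy offset
 * share the same mis-alignment within an "effective" page, the range is
 * divided into an unaligned head (up to the first page boundary), an
 * aligned middle, and an unaligned tail.  Structure and names are
 * invented for illustration.
 */
#if 0
#include <stdint.h>

struct sketch_split {
	uint64_t head_size;     /* unaligned bytes before the first boundary */
	uint64_t tail_size;     /* unaligned bytes after the last boundary */
	uint64_t tail_addr;     /* destination address of the tail portion */
};

static struct sketch_split
sketch_split_copy(uint64_t dst_addr, uint64_t copy_offset,
    uint64_t copy_size, uint64_t page_mask)
{
	struct sketch_split s = { 0, 0, 0 };
	uint64_t page_size = page_mask + 1;

	if (dst_addr & page_mask) {
		s.head_size = page_size - (copy_offset & page_mask);
		if (s.head_size > copy_size) {
			s.head_size = copy_size;
		}
	}
	if ((copy_offset + copy_size) & page_mask) {
		s.tail_size = (copy_offset + copy_size) & page_mask;
		if (s.tail_size > copy_size) {
			s.tail_size = copy_size;
		}
		s.tail_addr = dst_addr + copy_size - s.tail_size;
	}
	/* head_size + tail_size == copy_size means nothing is aligned and
	 * the optimization is not worth taking (the "blunt" path is used). */
	return s;
}
#endif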
10485 
10486 
10487 /*
10488  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10489  *
10490  *	Description:
10491  *	Physically copy unaligned data
10492  *
10493  *	Implementation:
10494  *	Unaligned parts of pages have to be physically copied.  We use
10495  *	a modified form of vm_fault_copy (which understands none-aligned
10496  *	a modified form of vm_fault_copy (which understands non-aligned
10497  *	page offsets and sizes) to do the copy.  We attempt to copy as
10498  *	much memory in one go as possible; however, vm_fault_copy copies
10499  *	"source object data size" and "target object data size".  With
10500  *	unaligned data we don't need to split regions, therefore the source
10501  *	(copy) object should be one map entry, the target range may be split
10502  *	over multiple map entries however.  In any event we are pessimistic
10503  *	about these assumptions.
10504  *
10505  *	Callers of this function must call vm_map_copy_require on
10506  *	previously created vm_map_copy_t or pass a newly created
10507  *	one to ensure that it hasn't been forged.
10508  *
10509  *	Assumptions:
10510  *	dst_map is locked on entry and is return locked on success,
10511  *	unlocked on error.
10512  */
10513 
10514 static kern_return_t
10515 vm_map_copy_overwrite_unaligned(
10516 	vm_map_t        dst_map,
10517 	vm_map_entry_t  entry,
10518 	vm_map_copy_t   copy,
10519 	vm_map_offset_t start,
10520 	boolean_t       discard_on_success)
10521 {
10522 	vm_map_entry_t          copy_entry;
10523 	vm_map_entry_t          copy_entry_next;
10524 	vm_map_version_t        version;
10525 	vm_object_t             dst_object;
10526 	vm_object_offset_t      dst_offset;
10527 	vm_object_offset_t      src_offset;
10528 	vm_object_offset_t      entry_offset;
10529 	vm_map_offset_t         entry_end;
10530 	vm_map_size_t           src_size,
10531 	    dst_size,
10532 	    copy_size,
10533 	    amount_left;
10534 	kern_return_t           kr = KERN_SUCCESS;
10535 
10536 
10537 	copy_entry = vm_map_copy_first_entry(copy);
10538 
10539 	vm_map_lock_write_to_read(dst_map);
10540 
10541 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10542 	amount_left = copy->size;
10543 /*
10544  *	unaligned, so we never clipped this entry; we need the offset into
10545  *	the vm_object, not just the data.
10546  */
10547 	while (amount_left > 0) {
10548 		if (entry == vm_map_to_entry(dst_map)) {
10549 			vm_map_unlock_read(dst_map);
10550 			return KERN_INVALID_ADDRESS;
10551 		}
10552 
10553 		/* "start" must be within the current map entry */
10554 		assert((start >= entry->vme_start) && (start < entry->vme_end));
10555 
10556 		/*
10557 		 *	Check protection again
10558 		 */
10559 		if (!(entry->protection & VM_PROT_WRITE)) {
10560 			vm_map_unlock_read(dst_map);
10561 			return KERN_PROTECTION_FAILURE;
10562 		}
10563 		if (entry->is_sub_map) {
10564 			/* not implemented... */
10565 			vm_map_unlock_read(dst_map);
10566 			return KERN_INVALID_ARGUMENT;
10567 		}
10568 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10569 			vm_map_unlock_read(dst_map);
10570 			return KERN_PROTECTION_FAILURE;
10571 		}
10572 		/*
10573 		 *	If the entry is in transition, we must wait
10574 		 *	for it to exit that state.  Anything could happen
10575 		 *	when we unlock the map, so start over.
10576 		 */
10577 		if (entry->in_transition) {
10578 			/*
10579 			 * Say that we are waiting, and wait for entry.
10580 			 */
10581 			entry->needs_wakeup = TRUE;
10582 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10583 
10584 			goto RetryLookup;
10585 		}
10586 
10587 		dst_offset = start - entry->vme_start;
10588 
10589 		dst_size = entry->vme_end - start;
10590 
10591 		src_size = copy_entry->vme_end -
10592 		    (copy_entry->vme_start + src_offset);
10593 
10594 		if (dst_size < src_size) {
10595 /*
10596  *			we can only copy dst_size bytes before
10597  *			we have to get the next destination entry
10598  */
10599 			copy_size = dst_size;
10600 		} else {
10601 /*
10602  *			we can only copy src_size bytes before
10603  *			we have to get the next source copy entry
10604  */
10605 			copy_size = src_size;
10606 		}
10607 
10608 		if (copy_size > amount_left) {
10609 			copy_size = amount_left;
10610 		}
10611 /*
10612  *		Entry needs copy: create a shadow object for the
10613  *		copy-on-write region.
10614  */
10615 		assert(!entry->is_sub_map);
10616 		if (entry->needs_copy) {
10617 			if (vm_map_lock_read_to_write(dst_map)) {
10618 				vm_map_lock_read(dst_map);
10619 				goto RetryLookup;
10620 			}
10621 			VME_OBJECT_SHADOW(entry,
10622 			    (vm_map_size_t)(entry->vme_end
10623 			    - entry->vme_start),
10624 			    vm_map_always_shadow(dst_map));
10625 			entry->needs_copy = FALSE;
10626 			vm_map_lock_write_to_read(dst_map);
10627 		}
10628 		dst_object = VME_OBJECT(entry);
10629 /*
10630  *		unlike with the virtual (aligned) copy we're going
10631  *		to fault on it therefore we need a target object.
10632  */
10633 		if (dst_object == VM_OBJECT_NULL) {
10634 			if (vm_map_lock_read_to_write(dst_map)) {
10635 				vm_map_lock_read(dst_map);
10636 				goto RetryLookup;
10637 			}
10638 			dst_object = vm_object_allocate((vm_map_size_t)
10639 			    entry->vme_end - entry->vme_start,
10640 			    dst_map->serial_id);
10641 			VME_OBJECT_SET(entry, dst_object, false, 0);
10642 			VME_OFFSET_SET(entry, 0);
10643 			assert(entry->use_pmap);
10644 			vm_map_lock_write_to_read(dst_map);
10645 		}
10646 /*
10647  *		Take an object reference and unlock map. The "entry" may
10648  *		disappear or change when the map is unlocked.
10649  */
10650 		vm_object_reference(dst_object);
10651 		version.main_timestamp = dst_map->timestamp;
10652 		entry_offset = VME_OFFSET(entry);
10653 		entry_end = entry->vme_end;
10654 		vm_map_unlock_read(dst_map);
10655 /*
10656  *		Copy as much as possible in one pass
10657  */
10658 		kr = vm_fault_copy(
10659 			VME_OBJECT(copy_entry),
10660 			VME_OFFSET(copy_entry) + src_offset,
10661 			&copy_size,
10662 			dst_object,
10663 			entry_offset + dst_offset,
10664 			dst_map,
10665 			&version,
10666 			THREAD_UNINT );
10667 
10668 		start += copy_size;
10669 		src_offset += copy_size;
10670 		amount_left -= copy_size;
10671 /*
10672  *		Release the object reference
10673  */
10674 		vm_object_deallocate(dst_object);
10675 /*
10676  *		If a hard error occurred, return it now
10677  */
10678 		if (kr != KERN_SUCCESS) {
10679 			return kr;
10680 		}
10681 
10682 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10683 		    || amount_left == 0) {
10684 /*
10685  *			all done with this copy entry, dispose.
10686  */
10687 			copy_entry_next = copy_entry->vme_next;
10688 
10689 			if (discard_on_success) {
10690 				vm_map_copy_entry_unlink(copy, copy_entry);
10691 				assert(!copy_entry->is_sub_map);
10692 				vm_object_deallocate(VME_OBJECT(copy_entry));
10693 				vm_map_copy_entry_dispose(copy_entry);
10694 			}
10695 
10696 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10697 			    amount_left) {
10698 /*
10699  *				not finished copying but ran out of source
10700  */
10701 				return KERN_INVALID_ADDRESS;
10702 			}
10703 
10704 			copy_entry = copy_entry_next;
10705 
10706 			src_offset = 0;
10707 		}
10708 
10709 		if (amount_left == 0) {
10710 			return KERN_SUCCESS;
10711 		}
10712 
10713 		vm_map_lock_read(dst_map);
10714 		if (version.main_timestamp == dst_map->timestamp) {
10715 			if (start == entry_end) {
10716 /*
10717  *				destination region is split.  Use the version
10718  *				information to avoid a lookup in the normal
10719  *				case.
10720  */
10721 				entry = entry->vme_next;
10722 /*
10723  *				should be contiguous. Fail if we encounter
10724  *				a hole in the destination.
10725  */
10726 				if (start != entry->vme_start) {
10727 					vm_map_unlock_read(dst_map);
10728 					return KERN_INVALID_ADDRESS;
10729 				}
10730 			}
10731 		} else {
10732 /*
10733  *			Map version check failed.
10734  *			we must lookup the entry because somebody
10735  *			might have changed the map behind our backs.
10736  */
10737 RetryLookup:
10738 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10739 				vm_map_unlock_read(dst_map);
10740 				return KERN_INVALID_ADDRESS;
10741 			}
10742 		}
10743 	}/* while */
10744 
10745 	return KERN_SUCCESS;
10746 }/* vm_map_copy_overwrite_unaligned */
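/*
 * Editorial sketch (not part of the original source, not compiled): the
 * per-pass size selection used by the unaligned copy above -- each call
 * to vm_fault_copy is bounded by what remains in the destination entry,
 * what remains in the source copy entry, and the overall amount left.
 * The helper name is invented.
 */
#if 0
#include <stdint.h>

static uint64_t
sketch_pass_size(uint64_t dst_remaining, uint64_t src_remaining,
    uint64_t amount_left)
{
	uint64_t n = (dst_remaining < src_remaining) ?
	    dst_remaining : src_remaining;

	return (n < amount_left) ? n : amount_left;
}
#endif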
10747 
10748 /*
10749  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10750  *
10751  *	Description:
10752  *	Does all the vm_trickery possible for whole pages.
10753  *
10754  *	Implementation:
10755  *
10756  *	If there are no permanent objects in the destination,
10757  *	and the source and destination map entry zones match,
10758  *	and the destination map entry is not shared,
10759  *	then the map entries can be deleted and replaced
10760  *	with those from the copy.  The following code is the
10761  *	basic idea of what to do, but there are lots of annoying
10762  *	little details about getting protection and inheritance
10763  *	right.  Should add protection, inheritance, and sharing checks
10764  *	to the above pass and make sure that no wiring is involved.
10765  *
10766  *	Callers of this function must call vm_map_copy_require on
10767  *	previously created vm_map_copy_t or pass a newly created
10768  *	one to ensure that it hasn't been forged.
10769  */
10770 
10771 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10772 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10773 int vm_map_copy_overwrite_aligned_src_large = 0;
10774 
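/*
 * Editorial sketch (not part of the original source, not compiled): a
 * boolean model of the fast-path test performed below -- a destination
 * entry can simply be replaced by the source entry when it maps
 * temporary, unshared, non-permanent anonymous memory with no special
 * protection semantics, or when it still needs a copy.  The struct and
 * field names are invented; the pmap protection-policy and TPRO checks
 * are omitted for brevity.
 */
#if 0
#include <stdbool.h>

struct sketch_dst_entry {
	bool is_shared;
	bool permanent;
	bool used_for_jit;
	bool executable;
	bool needs_copy;
	bool has_object;
	bool object_internal;
	bool object_true_share;
	bool object_copy_none;
};

static bool
sketch_can_replace_entry(const struct sketch_dst_entry *e)
{
	if (e->needs_copy) {
		return true;    /* mirrors the trailing "|| entry->needs_copy" */
	}
	if (e->is_shared || e->permanent || e->used_for_jit || e->executable) {
		return false;
	}
	if (!e->has_object) {
		return true;
	}
	return e->object_internal &&
	       !e->object_true_share &&
	       !e->object_copy_none;
}
#endif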
10775 static kern_return_t
10776 vm_map_copy_overwrite_aligned(
10777 	vm_map_t        dst_map,
10778 	vm_map_entry_t  tmp_entry,
10779 	vm_map_copy_t   copy,
10780 	vm_map_offset_t start,
10781 	__unused pmap_t pmap)
10782 {
10783 	vm_object_t     object;
10784 	vm_map_entry_t  copy_entry;
10785 	vm_map_size_t   copy_size;
10786 	vm_map_size_t   size;
10787 	vm_map_entry_t  entry;
10788 
10789 	while ((copy_entry = vm_map_copy_first_entry(copy))
10790 	    != vm_map_copy_to_entry(copy)) {
10791 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10792 
10793 		entry = tmp_entry;
10794 
10795 		if (entry->is_sub_map) {
10796 			/* unnested when clipped earlier */
10797 			assert(!entry->use_pmap);
10798 		}
10799 		if (entry == vm_map_to_entry(dst_map)) {
10800 			vm_map_unlock(dst_map);
10801 			return KERN_INVALID_ADDRESS;
10802 		}
10803 		size = (entry->vme_end - entry->vme_start);
10804 		/*
10805 		 *	Make sure that no holes popped up in the
10806 		 *	address map, and that the protection is
10807 		 *	still valid, in case the map was unlocked
10808 		 *	earlier.
10809 		 */
10810 
10811 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10812 		    && !entry->needs_copy)) {
10813 			vm_map_unlock(dst_map);
10814 			return KERN_INVALID_ADDRESS;
10815 		}
10816 		assert(entry != vm_map_to_entry(dst_map));
10817 
10818 		/*
10819 		 *	Check protection again
10820 		 */
10821 
10822 		if (!(entry->protection & VM_PROT_WRITE)) {
10823 			vm_map_unlock(dst_map);
10824 			return KERN_PROTECTION_FAILURE;
10825 		}
10826 
10827 		if (entry->is_sub_map) {
10828 			/* not properly implemented */
10829 			vm_map_unlock(dst_map);
10830 			return KERN_PROTECTION_FAILURE;
10831 		}
10832 
10833 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10834 			vm_map_unlock(dst_map);
10835 			return KERN_PROTECTION_FAILURE;
10836 		}
10837 
10838 		/*
10839 		 *	If the entry is in transition, we must wait
10840 		 *	for it to exit that state.  Anything could happen
10841 		 *	when we unlock the map, so start over.
10842 		 */
10843 		if (entry->in_transition) {
10844 			/*
10845 			 * Say that we are waiting, and wait for entry.
10846 			 */
10847 			entry->needs_wakeup = TRUE;
10848 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10849 
10850 			goto RetryLookup;
10851 		}
10852 
10853 		/*
10854 		 *	Adjust to source size first
10855 		 */
10856 
10857 		if (copy_size < size) {
10858 			if (entry->map_aligned &&
10859 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10860 			    VM_MAP_PAGE_MASK(dst_map))) {
10861 				/* no longer map-aligned */
10862 				entry->map_aligned = FALSE;
10863 			}
10864 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10865 			size = copy_size;
10866 		}
10867 
10868 		/*
10869 		 *	Adjust to destination size
10870 		 */
10871 
10872 		if (size < copy_size) {
10873 			vm_map_copy_clip_end(copy, copy_entry,
10874 			    copy_entry->vme_start + size);
10875 			copy_size = size;
10876 		}
10877 
10878 		assert((entry->vme_end - entry->vme_start) == size);
10879 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10880 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10881 
10882 		/*
10883 		 *	If the destination contains temporary unshared memory,
10884 		 *	we can perform the copy by throwing it away and
10885 		 *	installing the source data.
10886 		 *
10887 		 *	Exceptions for mappings with special semantics:
10888 		 *	+ "permanent" entries,
10889 		 *	+ JIT regions,
10890 		 *	+ TPRO regions,
10891 		 *      + pmap-specific protection policies,
10892 		 *	+ VM objects with COPY_NONE copy strategy.
10893 		 */
10894 
10895 		object = VME_OBJECT(entry);
10896 		if ((!entry->is_shared &&
10897 		    !entry->vme_permanent &&
10898 		    !entry->used_for_jit &&
10899 #if __arm64e__
10900 		    !entry->used_for_tpro &&
10901 #endif /* __arm64e__ */
10902 		    !(entry->protection & VM_PROT_EXECUTE) &&
10903 		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10904 		    ((object == VM_OBJECT_NULL) ||
10905 		    (object->internal &&
10906 		    !object->true_share &&
10907 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10908 		    entry->needs_copy) {
10909 			vm_object_t     old_object = VME_OBJECT(entry);
10910 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10911 			vm_object_offset_t      offset;
10912 
10913 			assert(!entry->is_sub_map);
10914 			/*
10915 			 * Ensure that the source and destination aren't
10916 			 * identical
10917 			 */
10918 			if (old_object == VME_OBJECT(copy_entry) &&
10919 			    old_offset == VME_OFFSET(copy_entry)) {
10920 				vm_map_copy_entry_unlink(copy, copy_entry);
10921 				vm_map_copy_entry_dispose(copy_entry);
10922 
10923 				if (old_object != VM_OBJECT_NULL) {
10924 					vm_object_deallocate(old_object);
10925 				}
10926 
10927 				start = tmp_entry->vme_end;
10928 				tmp_entry = tmp_entry->vme_next;
10929 				continue;
10930 			}
10931 
10932 #if XNU_TARGET_OS_OSX
10933 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10934 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10935 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10936 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10937 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10938 				/*
10939 				 * Virtual vs. Physical copy tradeoff #1.
10940 				 *
10941 				 * Copying only a few pages out of a large
10942 				 * object:  do a physical copy instead of
10943 				 * a virtual copy, to avoid possibly keeping
10944 				 * the entire large object alive because of
10945 				 * those few copy-on-write pages.
10946 				 */
10947 				vm_map_copy_overwrite_aligned_src_large++;
10948 				goto slow_copy;
10949 			}
10950 #endif /* XNU_TARGET_OS_OSX */
10951 
10952 			if ((dst_map->pmap != kernel_pmap) &&
10953 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10954 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10955 				vm_object_t new_object, new_shadow;
10956 
10957 				/*
10958 				 * We're about to map something over a mapping
10959 				 * established by malloc()...
10960 				 */
10961 				new_object = VME_OBJECT(copy_entry);
10962 				if (new_object != VM_OBJECT_NULL) {
10963 					vm_object_lock_shared(new_object);
10964 				}
10965 				while (new_object != VM_OBJECT_NULL &&
10966 #if XNU_TARGET_OS_OSX
10967 				    !new_object->true_share &&
10968 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10969 #endif /* XNU_TARGET_OS_OSX */
10970 				    new_object->internal) {
10971 					new_shadow = new_object->shadow;
10972 					if (new_shadow == VM_OBJECT_NULL) {
10973 						break;
10974 					}
10975 					vm_object_lock_shared(new_shadow);
10976 					vm_object_unlock(new_object);
10977 					new_object = new_shadow;
10978 				}
10979 				if (new_object != VM_OBJECT_NULL) {
10980 					if (!new_object->internal) {
10981 						/*
10982 						 * The new mapping is backed
10983 						 * by an external object.  We
10984 						 * don't want malloc'ed memory
10985 						 * to be replaced with such a
10986 						 * non-anonymous mapping, so
10987 						 * let's go off the optimized
10988 						 * path...
10989 						 */
10990 						vm_map_copy_overwrite_aligned_src_not_internal++;
10991 						vm_object_unlock(new_object);
10992 						goto slow_copy;
10993 					}
10994 #if XNU_TARGET_OS_OSX
10995 					if (new_object->true_share ||
10996 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10997 						/*
10998 						 * Same if there's a "true_share"
10999 						 * object in the shadow chain, or
11000 						 * an object with a non-default
11001 						 * (SYMMETRIC) copy strategy.
11002 						 */
11003 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
11004 						vm_object_unlock(new_object);
11005 						goto slow_copy;
11006 					}
11007 #endif /* XNU_TARGET_OS_OSX */
11008 					vm_object_unlock(new_object);
11009 				}
11010 				/*
11011 				 * The new mapping is still backed by
11012 				 * anonymous (internal) memory, so it's
11013 				 * OK to substitute it for the original
11014 				 * malloc() mapping.
11015 				 */
11016 			}
11017 
11018 			if (old_object != VM_OBJECT_NULL) {
11019 				assert(!entry->vme_permanent);
11020 				if (entry->is_sub_map) {
11021 					if (entry->use_pmap) {
11022 #ifndef NO_NESTED_PMAP
11023 						pmap_unnest(dst_map->pmap,
11024 						    (addr64_t)entry->vme_start,
11025 						    entry->vme_end - entry->vme_start);
11026 #endif  /* NO_NESTED_PMAP */
11027 						if (dst_map->mapped_in_other_pmaps) {
11028 							/* clean up parent */
11029 							/* map/maps */
11030 							vm_map_submap_pmap_clean(
11031 								dst_map, entry->vme_start,
11032 								entry->vme_end,
11033 								VME_SUBMAP(entry),
11034 								VME_OFFSET(entry));
11035 						}
11036 					} else {
11037 						vm_map_submap_pmap_clean(
11038 							dst_map, entry->vme_start,
11039 							entry->vme_end,
11040 							VME_SUBMAP(entry),
11041 							VME_OFFSET(entry));
11042 					}
11043 					vm_map_deallocate(VME_SUBMAP(entry));
11044 				} else {
11045 					if (dst_map->mapped_in_other_pmaps) {
11046 						vm_object_pmap_protect_options(
11047 							VME_OBJECT(entry),
11048 							VME_OFFSET(entry),
11049 							entry->vme_end
11050 							- entry->vme_start,
11051 							PMAP_NULL,
11052 							PAGE_SIZE,
11053 							entry->vme_start,
11054 							VM_PROT_NONE,
11055 							PMAP_OPTIONS_REMOVE);
11056 					} else {
11057 						pmap_remove_options(
11058 							dst_map->pmap,
11059 							(addr64_t)(entry->vme_start),
11060 							(addr64_t)(entry->vme_end),
11061 							PMAP_OPTIONS_REMOVE);
11062 					}
11063 					vm_object_deallocate(old_object);
11064 				}
11065 			}
11066 
11067 			if (entry->iokit_acct) {
11068 				/* keep using iokit accounting */
11069 				entry->use_pmap = FALSE;
11070 			} else {
11071 				/* use pmap accounting */
11072 				entry->use_pmap = TRUE;
11073 			}
11074 			assert(!entry->vme_permanent);
11075 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
11076 			object = VME_OBJECT(entry);
11077 			entry->needs_copy = copy_entry->needs_copy;
11078 			entry->wired_count = 0;
11079 			entry->user_wired_count = 0;
11080 			offset = VME_OFFSET(copy_entry);
11081 			VME_OFFSET_SET(entry, offset);
11082 
11083 			vm_map_copy_entry_unlink(copy, copy_entry);
11084 			vm_map_copy_entry_dispose(copy_entry);
11085 
11086 			/*
11087 			 * we could try to push pages into the pmap at this point, BUT
11088 			 * this optimization only saved on average 2 us per page if ALL
11089 			 * the pages in the source were currently mapped
11090 			 * and ALL the pages in the dest were touched; if fewer
11091 			 * than 2/3 of the pages were touched, this optimization actually cost more cycles.
11092 			 * It also puts a lot of pressure on the pmap layer w/r/t mapping structures.
11093 			 */
11094 
11095 			/*
11096 			 *	Set up for the next iteration.  The map
11097 			 *	has not been unlocked, so the next
11098 			 *	address should be at the end of this
11099 			 *	entry, and the next map entry should be
11100 			 *	the one following it.
11101 			 */
11102 
11103 			start = tmp_entry->vme_end;
11104 			tmp_entry = tmp_entry->vme_next;
11105 		} else {
11106 			vm_map_version_t        version;
11107 			vm_object_t             dst_object;
11108 			vm_object_offset_t      dst_offset;
11109 			kern_return_t           r;
11110 
11111 slow_copy:
11112 			if (entry->needs_copy) {
11113 				VME_OBJECT_SHADOW(entry,
11114 				    (entry->vme_end -
11115 				    entry->vme_start),
11116 				    vm_map_always_shadow(dst_map));
11117 				entry->needs_copy = FALSE;
11118 			}
11119 
11120 			dst_object = VME_OBJECT(entry);
11121 			dst_offset = VME_OFFSET(entry);
11122 
11123 			/*
11124 			 *	Take an object reference, and record
11125 			 *	the map version information so that the
11126 			 *	map can be safely unlocked.
11127 			 */
11128 
11129 			if (dst_object == VM_OBJECT_NULL) {
11130 				/*
11131 				 * We would usually have just taken the
11132 				 * optimized path above if the destination
11133 				 * object has not been allocated yet.  But we
11134 				 * now disable that optimization if the copy
11135 				 * entry's object is not backed by anonymous
11136 				 * memory to avoid replacing malloc'ed
11137 				 * (i.e. re-usable) anonymous memory with a
11138 				 * not-so-anonymous mapping.
11139 				 * So we have to handle this case here and
11140 				 * allocate a new VM object for this map entry.
11141 				 */
11142 				dst_object = vm_object_allocate(
11143 					entry->vme_end - entry->vme_start,
11144 					dst_map->serial_id
11145 					);
11146 				dst_offset = 0;
11147 				VME_OBJECT_SET(entry, dst_object, false, 0);
11148 				VME_OFFSET_SET(entry, dst_offset);
11149 				assert(entry->use_pmap);
11150 			}
11151 
11152 			vm_object_reference(dst_object);
11153 
11154 			/* account for unlock bumping up timestamp */
11155 			version.main_timestamp = dst_map->timestamp + 1;
11156 
11157 			vm_map_unlock(dst_map);
11158 
11159 			/*
11160 			 *	Copy as much as possible in one pass
11161 			 */
11162 
11163 			copy_size = size;
11164 			r = vm_fault_copy(
11165 				VME_OBJECT(copy_entry),
11166 				VME_OFFSET(copy_entry),
11167 				&copy_size,
11168 				dst_object,
11169 				dst_offset,
11170 				dst_map,
11171 				&version,
11172 				THREAD_UNINT );
11173 
11174 			/*
11175 			 *	Release the object reference
11176 			 */
11177 
11178 			vm_object_deallocate(dst_object);
11179 
11180 			/*
11181 			 *	If a hard error occurred, return it now
11182 			 */
11183 
11184 			if (r != KERN_SUCCESS) {
11185 				return r;
11186 			}
11187 
11188 			if (copy_size != 0) {
11189 				/*
11190 				 *	Dispose of the copied region
11191 				 */
11192 
11193 				vm_map_copy_clip_end(copy, copy_entry,
11194 				    copy_entry->vme_start + copy_size);
11195 				vm_map_copy_entry_unlink(copy, copy_entry);
11196 				vm_object_deallocate(VME_OBJECT(copy_entry));
11197 				vm_map_copy_entry_dispose(copy_entry);
11198 			}
11199 
11200 			/*
11201 			 *	Pick up in the destination map where we left off.
11202 			 *
11203 			 *	Use the version information to avoid a lookup
11204 			 *	in the normal case.
11205 			 */
11206 
11207 			start += copy_size;
11208 			vm_map_lock(dst_map);
11209 			if (version.main_timestamp == dst_map->timestamp &&
11210 			    copy_size != 0) {
11211 				/* We can safely use saved tmp_entry value */
11212 
11213 				if (tmp_entry->map_aligned &&
11214 				    !VM_MAP_PAGE_ALIGNED(
11215 					    start,
11216 					    VM_MAP_PAGE_MASK(dst_map))) {
11217 					/* no longer map-aligned */
11218 					tmp_entry->map_aligned = FALSE;
11219 				}
11220 				vm_map_clip_end(dst_map, tmp_entry, start);
11221 				tmp_entry = tmp_entry->vme_next;
11222 			} else {
11223 				/* Must do lookup of tmp_entry */
11224 
11225 RetryLookup:
11226 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11227 					vm_map_unlock(dst_map);
11228 					return KERN_INVALID_ADDRESS;
11229 				}
11230 				if (tmp_entry->map_aligned &&
11231 				    !VM_MAP_PAGE_ALIGNED(
11232 					    start,
11233 					    VM_MAP_PAGE_MASK(dst_map))) {
11234 					/* no longer map-aligned */
11235 					tmp_entry->map_aligned = FALSE;
11236 				}
11237 				vm_map_clip_start(dst_map, tmp_entry, start);
11238 			}
11239 		}
11240 	}/* while */
11241 
11242 	return KERN_SUCCESS;
11243 }/* vm_map_copy_overwrite_aligned */
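/*
 * Editorial sketch (not part of the original source, not compiled): the
 * virtual vs. physical copy tradeoff applied in the aligned path above.
 * Copying a small slice out of a very large object goes through the slow
 * (physical) path so that a few copy-on-write pages do not keep the whole
 * object alive.  The macro names are invented; the thresholds mirror the
 * __TRADEOFF1_* values used above.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_TRADEOFF1_OBJ_SIZE  (64ULL * 1024 * 1024)        /* 64 MB */
#define SKETCH_TRADEOFF1_COPY_SIZE (128ULL * 1024)              /* 128 KB */

static bool
sketch_prefer_physical_copy(uint64_t src_object_size, uint64_t copy_size)
{
	return src_object_size >= SKETCH_TRADEOFF1_OBJ_SIZE &&
	       copy_size <= SKETCH_TRADEOFF1_COPY_SIZE;
}
#endif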
11244 
11245 /*
11246  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
11247  *
11248  *	Description:
11249  *		Copy in data to a kernel buffer from space in the
11250  *		source map. The original space may be optionally
11251  *		deallocated.
11252  *
11253  *		If successful, returns a new copy object.
11254  */
11255 static kern_return_t
11256 vm_map_copyin_kernel_buffer(
11257 	vm_map_t        src_map,
11258 	vm_map_offset_t src_addr,
11259 	vm_map_size_t   len,
11260 	boolean_t       src_destroy,
11261 	vm_map_copy_t   *copy_result)
11262 {
11263 	kern_return_t kr;
11264 	vm_map_copy_t copy;
11265 	void *kdata;
11266 
11267 	if (len > msg_ool_size_small) {
11268 		return KERN_INVALID_ARGUMENT;
11269 	}
11270 
11271 	kdata = kalloc_data(len, Z_WAITOK);
11272 	if (kdata == NULL) {
11273 		return KERN_RESOURCE_SHORTAGE;
11274 	}
11275 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11276 	if (kr != KERN_SUCCESS) {
11277 		kfree_data(kdata, len);
11278 		return kr;
11279 	}
11280 
11281 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11282 	copy->cpy_kdata = kdata;
11283 	copy->size = len;
11284 	copy->offset = 0;
11285 
11286 	if (src_destroy) {
11287 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11288 
11289 		if (src_map == kernel_map) {
11290 			flags |= VM_MAP_REMOVE_KUNWIRE;
11291 		}
11292 
11293 		(void)vm_map_remove_guard(src_map,
11294 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11295 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11296 		    flags, KMEM_GUARD_NONE);
11297 	}
11298 
11299 	*copy_result = copy;
11300 	return KERN_SUCCESS;
11301 }
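/*
 * Editorial sketch (not part of the original source, not compiled): a
 * userspace model of the small-copy path above.  Data at or below a
 * threshold is staged through a heap buffer instead of an entry list;
 * memcpy stands in for copyinmap(), and the threshold parameter stands
 * in for msg_ool_size_small.  All names are invented for illustration.
 */
#if 0
#include <stdlib.h>
#include <string.h>

struct sketch_copy {
	void   *kdata;
	size_t  size;
};

static struct sketch_copy *
sketch_copyin_buffer(const void *src, size_t len, size_t small_threshold)
{
	struct sketch_copy *copy;

	if (len > small_threshold) {
		return NULL;    /* too big: caller must use the entry-list path */
	}
	copy = malloc(sizeof(*copy));
	if (copy == NULL) {
		return NULL;
	}
	copy->kdata = malloc(len);
	if (copy->kdata == NULL) {
		free(copy);
		return NULL;
	}
	memcpy(copy->kdata, src, len);  /* stands in for copyinmap() */
	copy->size = len;
	return copy;
}
#endif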
11302 
11303 /*
11304  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
11305  *
11306  *	Description:
11307  *		Copy out data from a kernel buffer into space in the
11308  *		destination map. The space may be optionally dynamically
11309  *		allocated.
11310  *
11311  *		If successful, consumes the copy object.
11312  *		Otherwise, the caller is responsible for it.
11313  *
11314  *		Callers of this function must call vm_map_copy_require on
11315  *		previously created vm_map_copy_t or pass a newly created
11316  *		one to ensure that it hasn't been forged.
11317  */
11318 static int vm_map_copyout_kernel_buffer_failures = 0;
11319 static kern_return_t
11320 vm_map_copyout_kernel_buffer(
11321 	vm_map_t                map,
11322 	vm_map_address_t        *addr,  /* IN/OUT */
11323 	vm_map_copy_t           copy,
11324 	vm_map_size_t           copy_size,
11325 	boolean_t               overwrite,
11326 	boolean_t               consume_on_success)
11327 {
11328 	kern_return_t kr = KERN_SUCCESS;
11329 	thread_t thread = current_thread();
11330 
11331 	assert(copy->size == copy_size);
11332 
11333 	/*
11334 	 * check for corrupted vm_map_copy structure
11335 	 */
11336 	if (copy_size > msg_ool_size_small || copy->offset) {
11337 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11338 		    (long long)copy->size, (long long)copy->offset);
11339 	}
11340 
11341 	if (!overwrite) {
11342 		/*
11343 		 * Allocate space in the target map for the data
11344 		 */
11345 		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11346 
11347 		if (map == kernel_map) {
11348 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11349 		}
11350 
11351 		*addr = 0;
11352 		kr = vm_map_enter(map,
11353 		    addr,
11354 		    vm_map_round_page(copy_size,
11355 		    VM_MAP_PAGE_MASK(map)),
11356 		    (vm_map_offset_t) 0,
11357 		    vmk_flags,
11358 		    VM_OBJECT_NULL,
11359 		    (vm_object_offset_t) 0,
11360 		    FALSE,
11361 		    VM_PROT_DEFAULT,
11362 		    VM_PROT_ALL,
11363 		    VM_INHERIT_DEFAULT);
11364 		if (kr != KERN_SUCCESS) {
11365 			return kr;
11366 		}
11367 #if KASAN
11368 		if (map->pmap == kernel_pmap) {
11369 			kasan_notify_address(*addr, copy->size);
11370 		}
11371 #endif
11372 	}
11373 
11374 	/*
11375 	 * Copyout the data from the kernel buffer to the target map.
11376 	 */
11377 	if (thread->map == map) {
11378 		/*
11379 		 * If the target map is the current map, just do
11380 		 * the copy.
11381 		 */
11382 		assert((vm_size_t)copy_size == copy_size);
11383 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11384 			kr = KERN_INVALID_ADDRESS;
11385 		}
11386 	} else {
11387 		vm_map_switch_context_t switch_ctx;
11388 
11389 		/*
11390 		 * If the target map is another map, assume the
11391 		 * target's address space identity for the duration
11392 		 * of the copy.
11393 		 */
11394 		vm_map_reference(map);
11395 		switch_ctx = vm_map_switch_to(map);
11396 
11397 		assert((vm_size_t)copy_size == copy_size);
11398 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11399 			vm_map_copyout_kernel_buffer_failures++;
11400 			kr = KERN_INVALID_ADDRESS;
11401 		}
11402 
11403 		vm_map_switch_back(switch_ctx);
11404 		vm_map_deallocate(map);
11405 	}
11406 
11407 	if (kr != KERN_SUCCESS) {
11408 		/* the copy failed, clean up */
11409 		if (!overwrite) {
11410 			/*
11411 			 * Deallocate the space we allocated in the target map.
11412 			 */
11413 			(void) vm_map_remove(map,
11414 			    vm_map_trunc_page(*addr,
11415 			    VM_MAP_PAGE_MASK(map)),
11416 			    vm_map_round_page((*addr +
11417 			    vm_map_round_page(copy_size,
11418 			    VM_MAP_PAGE_MASK(map))),
11419 			    VM_MAP_PAGE_MASK(map)));
11420 			*addr = 0;
11421 		}
11422 	} else {
11423 		/* copy was successful, discard the copy structure */
11424 		if (consume_on_success) {
11425 			kfree_data(copy->cpy_kdata, copy_size);
11426 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11427 		}
11428 	}
11429 
11430 	return kr;
11431 }
11432 
11433 /*
11434  *	Routine:	vm_map_copy_insert      [internal use only]
11435  *
11436  *	Description:
11437  *		Link a copy chain ("copy") into a map at the
11438  *		specified location (after "where").
11439  *
11440  *		Callers of this function must call vm_map_copy_require on
11441  *		previously created vm_map_copy_t or pass a newly created
11442  *		one to ensure that it hasn't been forged.
11443  *	Side effects:
11444  *		The copy chain is destroyed.
11445  */
11446 static void
11447 vm_map_copy_insert(
11448 	vm_map_t        map,
11449 	vm_map_entry_t  after_where,
11450 	vm_map_copy_t   copy)
11451 {
11452 	vm_map_entry_t  entry;
11453 
11454 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11455 		entry = vm_map_copy_first_entry(copy);
11456 		vm_map_copy_entry_unlink(copy, entry);
11457 		vm_map_store_entry_link(map, after_where, entry,
11458 		    VM_MAP_KERNEL_FLAGS_NONE);
11459 		after_where = entry;
11460 	}
11461 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11462 }
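/*
 * Editorial sketch (not part of the original source, not compiled): a
 * doubly-linked-list model of the splice performed above.  Entries are
 * moved one at a time from a circular copy chain (headed by a sentinel)
 * into the destination list, each linked after the previously inserted
 * one so the original order is preserved, and the now-empty copy header
 * is freed.  Names are invented for illustration.
 */
#if 0
#include <stdlib.h>

struct sketch_node {
	struct sketch_node *prev, *next;
};

static void
sketch_unlink(struct sketch_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static void
sketch_link_after(struct sketch_node *where, struct sketch_node *n)
{
	n->prev = where;
	n->next = where->next;
	where->next->prev = n;
	where->next = n;
}

static void
sketch_copy_insert(struct sketch_node *copy_head, struct sketch_node *where)
{
	while (copy_head->next != copy_head) {
		struct sketch_node *entry = copy_head->next;

		sketch_unlink(entry);
		sketch_link_after(where, entry);
		where = entry;          /* keep entries in original order */
	}
	free(copy_head);                /* models freeing the copy header */
}
#endif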
11463 
11464 /*
11465  * Callers of this function must call vm_map_copy_require on
11466  * previously created vm_map_copy_t or pass a newly created
11467  * one to ensure that it hasn't been forged.
11468  */
11469 void
11470 vm_map_copy_remap(
11471 	vm_map_t        map,
11472 	vm_map_entry_t  where,
11473 	vm_map_copy_t   copy,
11474 	vm_map_offset_t adjustment,
11475 	vm_prot_t       cur_prot,
11476 	vm_prot_t       max_prot,
11477 	vm_inherit_t    inheritance)
11478 {
11479 	vm_map_entry_t  copy_entry, new_entry;
11480 
11481 	for (copy_entry = vm_map_copy_first_entry(copy);
11482 	    copy_entry != vm_map_copy_to_entry(copy);
11483 	    copy_entry = copy_entry->vme_next) {
11484 		/* get a new VM map entry for the map */
11485 		new_entry = vm_map_entry_create(map);
11486 		/* copy the "copy entry" to the new entry */
11487 		vm_map_entry_copy(map, new_entry, copy_entry);
11488 		/* adjust "start" and "end" */
11489 		new_entry->vme_start += adjustment;
11490 		new_entry->vme_end += adjustment;
11491 		/* clear some attributes */
11492 		new_entry->inheritance = inheritance;
11493 		new_entry->protection = cur_prot;
11494 		new_entry->max_protection = max_prot;
11495 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11496 		/* take an extra reference on the entry's "object" */
11497 		if (new_entry->is_sub_map) {
11498 			assert(!new_entry->use_pmap); /* not nested */
11499 			vm_map_reference(VME_SUBMAP(new_entry));
11500 		} else {
11501 			vm_object_reference(VME_OBJECT(new_entry));
11502 		}
11503 		/* insert the new entry in the map */
11504 		vm_map_store_entry_link(map, where, new_entry,
11505 		    VM_MAP_KERNEL_FLAGS_NONE);
11506 		/* continue inserting the "copy entries" after the new entry */
11507 		where = new_entry;
11508 	}
11509 }
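/*
 * Editorial sketch (not part of the original source, not compiled): the
 * per-entry adjustment performed by vm_map_copy_remap() above -- each
 * copied entry keeps its backing object but is shifted by a constant
 * address adjustment and given the caller's protections and inheritance.
 * Field names are invented for illustration.
 */
#if 0
#include <stdint.h>

struct sketch_remap_entry {
	uint64_t start, end;
	int      cur_prot, max_prot, inheritance;
};

static void
sketch_remap_one(struct sketch_remap_entry *e, uint64_t adjustment,
    int cur_prot, int max_prot, int inheritance)
{
	e->start       += adjustment;
	e->end         += adjustment;
	e->cur_prot     = cur_prot;
	e->max_prot     = max_prot;
	e->inheritance  = inheritance;
}
#endif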
11510 
11511 
11512 /*
11513  * Returns true if *size matches (or is in the range of) copy->size.
11514  * Upon returning true, the *size field is updated with the actual size of the
11515  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11516  */
11517 boolean_t
11518 vm_map_copy_validate_size(
11519 	vm_map_t                dst_map,
11520 	vm_map_copy_t           copy,
11521 	vm_map_size_t           *size)
11522 {
11523 	if (copy == VM_MAP_COPY_NULL) {
11524 		return FALSE;
11525 	}
11526 
11527 	/*
11528 	 * Assert that the vm_map_copy is coming from the right
11529 	 * zone and hasn't been forged
11530 	 */
11531 	vm_map_copy_require(copy);
11532 
11533 	vm_map_size_t copy_sz = copy->size;
11534 	vm_map_size_t sz = *size;
11535 	switch (copy->type) {
11536 	case VM_MAP_COPY_KERNEL_BUFFER:
11537 		if (sz == copy_sz) {
11538 			return TRUE;
11539 		}
11540 		break;
11541 	case VM_MAP_COPY_ENTRY_LIST:
11542 		/*
11543 		 * potential page-size rounding prevents us from exactly
11544 		 * validating this flavor of vm_map_copy, but we can at least
11545 		 * assert that it's within a range.
11546 		 */
11547 		if (copy_sz >= sz &&
11548 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11549 			*size = copy_sz;
11550 			return TRUE;
11551 		}
11552 		break;
11553 	default:
11554 		break;
11555 	}
11556 	return FALSE;
11557 }
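/*
 * Editorial sketch (not part of the original source, not compiled): the
 * size-validation rule above expressed in plain C.  Kernel-buffer copies
 * must match the expected size exactly; entry-list copies may have been
 * rounded up to the destination map's page size, so a range is accepted
 * and the actual size is reported back.  Names are invented; the rounding
 * expression stands in for vm_map_round_page().
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
sketch_validate_size(uint64_t copy_size, uint64_t *size, bool entry_list,
    uint64_t page_mask)
{
	if (!entry_list) {
		return *size == copy_size;      /* kernel buffer: exact match */
	}
	uint64_t rounded = (*size + page_mask) & ~page_mask;
	if (copy_size >= *size && copy_size <= rounded) {
		*size = copy_size;              /* report the actual size back */
		return true;
	}
	return false;
}
#endif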
11558 
11559 static kern_return_t
11560 vm_map_copyout_internal(
11561 	vm_map_t                dst_map,
11562 	vm_map_address_t       *dst_addr,      /* OUT */
11563 	vm_map_copy_t           copy,
11564 	vm_map_size_ut          copy_size_u,
11565 	boolean_t               consume_on_success,
11566 	vm_prot_t               cur_protection,
11567 	vm_prot_t               max_protection,
11568 	vm_inherit_t            inheritance)
11569 {
11570 	vm_map_size_t           size, copy_size;
11571 	vm_map_size_t           adjustment;
11572 	vm_map_offset_t         start;
11573 	vm_object_offset_t      vm_copy_start;
11574 	vm_map_entry_t          last;
11575 	vm_map_entry_t          entry;
11576 	vm_map_copy_t           original_copy;
11577 	kern_return_t           kr;
11578 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11579 
11580 	/*
11581 	 *	Check for null copy object.
11582 	 */
11583 
11584 	if (copy == VM_MAP_COPY_NULL) {
11585 		*dst_addr = 0;
11586 		return KERN_SUCCESS;
11587 	}
11588 
11589 	/*
11590 	 * Assert that the vm_map_copy is coming from the right
11591 	 * zone and hasn't been forged
11592 	 */
11593 	vm_map_copy_require(copy);
11594 
11595 	if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11596 		*dst_addr = 0;
11597 		ktriage_record(thread_tid(current_thread()),
11598 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11599 		    KDBG_TRIAGE_RESERVED,
11600 		    KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11601 		    KERN_FAILURE /* arg */);
11602 		return KERN_FAILURE;
11603 	}
11604 	copy_size = copy->size;
11605 
11606 	/*
11607 	 *	Check for special kernel buffer allocated
11608 	 *	by new_ipc_kmsg_copyin.
11609 	 */
11610 
11611 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11612 		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11613 		    copy, copy_size, FALSE,
11614 		    consume_on_success);
11615 		if (kr) {
11616 			ktriage_record(thread_tid(current_thread()),
11617 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11618 			    KDBG_TRIAGE_RESERVED,
11619 			    KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11620 		}
11621 		return kr;
11622 	}
11623 
11624 
11625 	original_copy = copy;
11626 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11627 		vm_map_copy_t target_copy;
11628 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11629 
11630 		target_copy = VM_MAP_COPY_NULL;
11631 		DEBUG4K_ADJUST("adjusting...\n");
11632 		kr = vm_map_copy_adjust_to_target(
11633 			copy,
11634 			0, /* offset */
11635 			copy->size, /* size */
11636 			dst_map,
11637 			TRUE, /* copy */
11638 			&target_copy,
11639 			&overmap_start,
11640 			&overmap_end,
11641 			&trimmed_start);
11642 		if (kr != KERN_SUCCESS) {
11643 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11644 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11645 			return kr;
11646 		}
11647 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11648 		if (target_copy != copy) {
11649 			copy = target_copy;
11650 		}
11651 		copy_size = copy->size;
11652 	}
11653 
11654 	/*
11655 	 *	Find space for the data
11656 	 */
11657 
11658 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11659 	    VM_MAP_COPY_PAGE_MASK(copy));
11660 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11661 	    VM_MAP_COPY_PAGE_MASK(copy))
11662 	    - vm_copy_start;
11663 
11664 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11665 
11666 	vm_map_lock(dst_map);
11667 	kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11668 	    &start, &last);
11669 	if (kr != KERN_SUCCESS) {
11670 		vm_map_unlock(dst_map);
11671 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11672 		return kr;
11673 	}
11674 
11675 	adjustment = start - vm_copy_start;
11676 	if (!consume_on_success) {
11677 		/*
11678 		 * We're not allowed to consume "copy", so we'll have to
11679 		 * copy its map entries into the destination map below.
11680 		 * No need to re-allocate map entries from the correct
11681 		 * (pageable or not) zone, since we'll get new map entries
11682 		 * during the transfer.
11683 		 * We'll also adjust the map entries's "start" and "end"
11684 		 * during the transfer, to keep "copy"'s entries consistent
11685 		 * with its "offset".
11686 		 */
11687 		goto after_adjustments;
11688 	}
11689 
11690 	/*
11691 	 *	Since we're going to just drop the map
11692 	 *	entries from the copy into the destination
11693 	 *	map, they must come from the same pool.
11694 	 */
11695 
11696 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11697 		/*
11698 		 * Mismatches occur when dealing with the default
11699 		 * pager.
11700 		 */
11701 		vm_map_entry_t  next, new;
11702 
11703 		/*
11704 		 * Find the zone that the copies were allocated from
11705 		 */
11706 
11707 		entry = vm_map_copy_first_entry(copy);
11708 
11709 		/*
11710 		 * Reinitialize the copy so that vm_map_copy_entry_link
11711 		 * will work.
11712 		 */
11713 		vm_map_store_copy_reset(copy, entry);
11714 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11715 
11716 		/*
11717 		 * Copy each entry.
11718 		 */
11719 		while (entry != vm_map_copy_to_entry(copy)) {
11720 			new = vm_map_copy_entry_create(copy);
11721 			vm_map_entry_copy_full(new, entry);
11722 			new->vme_no_copy_on_read = FALSE;
11723 			assert(!new->iokit_acct);
11724 			if (new->is_sub_map) {
11725 				/* clr address space specifics */
11726 				new->use_pmap = FALSE;
11727 			}
11728 			vm_map_copy_entry_link(copy,
11729 			    vm_map_copy_last_entry(copy),
11730 			    new);
11731 			next = entry->vme_next;
11732 			vm_map_entry_dispose(entry);
11733 			entry = next;
11734 		}
11735 	}
11736 
11737 	/*
11738 	 *	Adjust the addresses in the copy chain, and
11739 	 *	reset the region attributes.
11740 	 */
11741 
11742 	for (entry = vm_map_copy_first_entry(copy);
11743 	    entry != vm_map_copy_to_entry(copy);
11744 	    entry = entry->vme_next) {
11745 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11746 			/*
11747 			 * We're injecting this copy entry into a map that
11748 			 * has the standard page alignment, so clear
11749 			 * "map_aligned" (which might have been inherited
11750 			 * from the original map entry).
11751 			 */
11752 			entry->map_aligned = FALSE;
11753 		}
11754 
11755 		entry->vme_start += adjustment;
11756 		entry->vme_end += adjustment;
11757 
11758 		if (entry->map_aligned) {
11759 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11760 			    VM_MAP_PAGE_MASK(dst_map)));
11761 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11762 			    VM_MAP_PAGE_MASK(dst_map)));
11763 		}
11764 
11765 		entry->inheritance = VM_INHERIT_DEFAULT;
11766 		entry->protection = VM_PROT_DEFAULT;
11767 		entry->max_protection = VM_PROT_ALL;
11768 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11769 
11770 		/*
11771 		 * If the entry is now wired,
11772 		 * map the pages into the destination map.
11773 		 */
11774 		if (entry->wired_count != 0) {
11775 			vm_map_offset_t va;
11776 			vm_object_offset_t       offset;
11777 			vm_object_t object;
11778 			vm_prot_t prot;
11779 			int     type_of_fault;
11780 			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11781 
11782 			/* TODO4K would need to use actual page size */
11783 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11784 
11785 			object = VME_OBJECT(entry);
11786 			offset = VME_OFFSET(entry);
11787 			va = entry->vme_start;
11788 
11789 			pmap_pageable(dst_map->pmap,
11790 			    entry->vme_start,
11791 			    entry->vme_end,
11792 			    TRUE);
11793 
11794 			while (va < entry->vme_end) {
11795 				vm_page_t       m;
11796 				struct vm_object_fault_info fault_info = {
11797 					.interruptible = THREAD_UNINT,
11798 				};
11799 
11800 				/*
11801 				 * Look up the page in the object.
11802 				 * Assert that the page will be found in the
11803 				 * top object:
11804 				 * either
11805 				 *	the object was newly created by
11806 				 *	vm_object_copy_slowly, and has
11807 				 *	copies of all of the pages from
11808 				 *	the source object
11809 				 * or
11810 				 *	the object was moved from the old
11811 				 *	map entry; because the old map
11812 				 *	entry was wired, all of the pages
11813 				 *	were in the top-level object.
11814 				 *	(XXX not true if we wire pages for
11815 				 *	 reading)
11816 				 */
11817 				vm_object_lock(object);
11818 
11819 				m = vm_page_lookup(object, offset);
11820 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11821 				    m->vmp_absent) {
11822 					panic("vm_map_copyout: wiring %p", m);
11823 				}
11824 
11825 				prot = entry->protection;
11826 
11827 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11828 				    prot) {
11829 					prot |= VM_PROT_EXECUTE;
11830 				}
11831 
11832 				type_of_fault = DBG_CACHE_HIT_FAULT;
11833 
11834 				fault_info.user_tag = VME_ALIAS(entry);
11835 				fault_info.pmap_options = 0;
11836 				if (entry->iokit_acct ||
11837 				    (!entry->is_sub_map && !entry->use_pmap)) {
11838 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11839 				}
11840 				if (entry->vme_xnu_user_debug &&
11841 				    !VM_PAGE_OBJECT(m)->code_signed) {
11842 					/*
11843 					 * Modified code-signed executable
11844 					 * region: this page does not belong
11845 					 * to a code-signed VM object, so it
11846 					 * must have been copied and should
11847 					 * therefore be typed XNU_USER_DEBUG
11848 					 * rather than XNU_USER_EXEC.
11849 					 */
11850 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11851 				}
11852 
11853 				vm_fault_enter(m,
11854 				    dst_map->pmap,
11855 				    va,
11856 				    PAGE_SIZE, 0,
11857 				    prot,
11858 				    prot,
11859 				    VM_PAGE_WIRED(m),
11860 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11861 				    &fault_info,
11862 				    NULL,             /* need_retry */
11863 				    &type_of_fault,
11864 				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11865 			    &object_lock_type); /* Exclusive mode lock. Will remain unchanged. */
11866 				vm_object_unlock(object);
11867 
11868 				offset += PAGE_SIZE_64;
11869 				va += PAGE_SIZE;
11870 			}
11871 		}
11872 	}
11873 
11874 after_adjustments:
11875 
11876 	/*
11877 	 *	Correct the page alignment for the result
11878 	 */
11879 
11880 	*dst_addr = start + (copy->offset - vm_copy_start);
11881 
11882 #if KASAN
11883 	kasan_notify_address(*dst_addr, size);
11884 #endif
11885 
11886 	/*
11887 	 *	Update the hints and the map size
11888 	 */
11889 
11890 	if (consume_on_success) {
11891 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11892 	} else {
11893 		SAVE_HINT_MAP_WRITE(dst_map, last);
11894 	}
11895 
11896 	dst_map->size += size;
11897 
11898 	/*
11899 	 *	Link in the copy
11900 	 */
11901 
11902 	if (consume_on_success) {
11903 		vm_map_copy_insert(dst_map, last, copy);
11904 		if (copy != original_copy) {
11905 			vm_map_copy_discard(original_copy);
11906 			original_copy = VM_MAP_COPY_NULL;
11907 		}
11908 	} else {
11909 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11910 		    cur_protection, max_protection,
11911 		    inheritance);
11912 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11913 			vm_map_copy_discard(copy);
11914 			copy = original_copy;
11915 		}
11916 	}
11917 
11918 
11919 	vm_map_unlock(dst_map);
11920 
11921 	/*
11922 	 * XXX	If wiring_required, call vm_map_pageable
11923 	 */
11924 
11925 	return KERN_SUCCESS;
11926 }
11927 
11928 /*
11929  *	Routine:	vm_map_copyout_size
11930  *
11931  *	Description:
11932  *		Copy out a copy chain ("copy") into newly-allocated
11933  *		space in the destination map. Uses a prevalidated
11934  *		size for the copy object (vm_map_copy_validate_size).
11935  *
11936  *		If successful, consumes the copy object.
11937  *		Otherwise, the caller is responsible for it.
11938  */
11939 kern_return_t
11940 vm_map_copyout_size(
11941 	vm_map_t                dst_map,
11942 	vm_map_address_t       *dst_addr,      /* OUT */
11943 	vm_map_copy_t           copy,
11944 	vm_map_size_ut          copy_size)
11945 {
11946 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11947 	           TRUE,                     /* consume_on_success */
11948 	           VM_PROT_DEFAULT,
11949 	           VM_PROT_ALL,
11950 	           VM_INHERIT_DEFAULT);
11951 }
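
/*
 * Illustrative sketch only (not compiled): one way a caller that already
 * knows the expected size of a copy object (e.g. the IPC copyout path,
 * which prevalidates it) might use vm_map_copyout_size().  The function
 * name and parameters below are hypothetical; error handling is minimal.
 */
#if 0
static kern_return_t
vm_map_copyout_size_example(
	vm_map_t                dst_map,
	vm_map_copy_t           copy,
	vm_map_size_ut          expected_size,
	vm_map_address_t       *dst_addr)      /* OUT */
{
	kern_return_t kr;

	kr = vm_map_copyout_size(dst_map, dst_addr, copy, expected_size);
	if (kr != KERN_SUCCESS) {
		/* on failure the copy object was not consumed: discard it */
		vm_map_copy_discard(copy);
		return kr;
	}
	/* on success the copy object has been consumed */
	return KERN_SUCCESS;
}
#endif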
11952 
11953 /*
11954  *	Routine:	vm_map_copyout
11955  *
11956  *	Description:
11957  *		Copy out a copy chain ("copy") into newly-allocated
11958  *		space in the destination map.
11959  *
11960  *		If successful, consumes the copy object.
11961  *		Otherwise, the caller is responsible for it.
11962  */
11963 kern_return_t
11964 vm_map_copyout(
11965 	vm_map_t                dst_map,
11966 	vm_map_address_t       *dst_addr,      /* OUT */
11967 	vm_map_copy_t           copy)
11968 {
11969 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11970 	           TRUE,                     /* consume_on_success */
11971 	           VM_PROT_DEFAULT,
11972 	           VM_PROT_ALL,
11973 	           VM_INHERIT_DEFAULT);
11974 }
11975 
11976 /*
11977  *	Routine:	vm_map_copyin
11978  *
11979  *	Description:
11980  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11981  *
11982  */
11983 kern_return_t
11984 vm_map_copyin(
11985 	vm_map_t                src_map,
11986 	vm_map_address_ut       src_addr,
11987 	vm_map_size_ut          len,
11988 	boolean_t               src_destroy,
11989 	vm_map_copy_t          *copy_result)   /* OUT */
11990 {
11991 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11992 	           FALSE, copy_result, FALSE);
11993 }
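
/*
 * Illustrative sketch only (not compiled): a minimal copyin / copyout
 * round trip, moving "len" bytes at "src_addr" in "src_map" into
 * newly-allocated space in "dst_map".  The function name and parameters
 * below are hypothetical placeholders; error handling is minimal.
 */
#if 0
static kern_return_t
vm_map_copy_roundtrip_example(
	vm_map_t                src_map,
	vm_map_t                dst_map,
	vm_map_address_ut       src_addr,
	vm_map_size_ut          len,
	vm_map_address_t       *dst_addr)      /* OUT */
{
	vm_map_copy_t   copy;
	kern_return_t   kr;

	/* FALSE: leave the source region in place after copying it */
	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	kr = vm_map_copyout(dst_map, dst_addr, copy);
	if (kr != KERN_SUCCESS) {
		/* copyout consumes the copy object only on success */
		vm_map_copy_discard(copy);
	}
	return kr;
}
#endif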
11994 
11995 /*
11996  *	Routine:	vm_map_copyin_common
11997  *
11998  *	Description:
11999  *		Copy the specified region (src_addr, len) from the
12000  *		source address space (src_map), possibly removing
12001  *		the region from the source address space (src_destroy).
12002  *
12003  *	Returns:
12004  *		A vm_map_copy_t object (copy_result), suitable for
12005  *		insertion into another address space (using vm_map_copyout),
12006  *		copying over another address space region (using
12007  *		vm_map_copy_overwrite).  If the copy is unused, it
12008  *		should be destroyed (using vm_map_copy_discard).
12009  *
12010  *	In/out conditions:
12011  *		The source map should not be locked on entry.
12012  */
12013 
12014 typedef struct submap_map {
12015 	vm_map_t        parent_map;
12016 	vm_map_offset_t base_start;
12017 	vm_map_offset_t base_end;
12018 	vm_map_size_t   base_len;
12019 	struct submap_map *next;
12020 } submap_map_t;
12021 
12022 kern_return_t
12023 vm_map_copyin_common(
12024 	vm_map_t                src_map,
12025 	vm_map_address_ut       src_addr,
12026 	vm_map_size_ut          len,
12027 	boolean_t               src_destroy,
12028 	__unused boolean_t      src_volatile,
12029 	vm_map_copy_t          *copy_result,   /* OUT */
12030 	boolean_t               use_maxprot)
12031 {
12032 	int flags;
12033 
12034 	flags = 0;
12035 	if (src_destroy) {
12036 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
12037 	}
12038 	if (use_maxprot) {
12039 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
12040 	}
12041 	return vm_map_copyin_internal(src_map,
12042 	           src_addr,
12043 	           len,
12044 	           flags,
12045 	           copy_result);
12046 }
12047 
12048 static __attribute__((always_inline, warn_unused_result))
12049 kern_return_t
12050 vm_map_copyin_sanitize(
12051 	vm_map_t                src_map,
12052 	vm_map_address_ut       src_addr_u,
12053 	vm_map_size_ut          len_u,
12054 	vm_map_offset_t        *src_start,
12055 	vm_map_offset_t        *src_end,
12056 	vm_map_size_t          *len,
12057 	vm_map_offset_t        *src_addr_unaligned)
12058 {
12059 	kern_return_t   kr;
12060 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
12061 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
12062 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
12063 
12064 #if KASAN_TBI
12065 	if (vm_kernel_map_is_kernel(src_map)) {
12066 		flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
12067 	}
12068 #endif /* KASAN_TBI */
12069 
12070 	kr = vm_sanitize_addr_size(src_addr_u, len_u,
12071 	    VM_SANITIZE_CALLER_VM_MAP_COPYIN,
12072 	    src_map,
12073 	    flags,
12074 	    src_start, src_end, len);
12075 	if (__improbable(kr != KERN_SUCCESS)) {
12076 		return kr;
12077 	}
12078 
12079 	/*
12080 	 *	Compute (page aligned) start and end of region
12081 	 */
12082 	*src_addr_unaligned  = *src_start; /* remember unaligned value */
12083 	*src_start = vm_map_trunc_page(*src_addr_unaligned,
12084 	    VM_MAP_PAGE_MASK(src_map));
12085 	*src_end   = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
12086 	return KERN_SUCCESS;
12087 }
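
/*
 * Worked example (for illustration only; the numbers are assumed): on a
 * map with 16K pages (VM_MAP_PAGE_MASK == 0x3fff), a request for 0x20
 * bytes at address 0x1004010 comes out of the sanitizer as:
 *
 *	src_addr_unaligned = 0x1004010    (original value, remembered)
 *	src_start          = 0x1004000    (truncated to the 16K boundary)
 *	src_end            = 0x1008000    (0x1004030 rounded up to 16K)
 *
 * so the page-aligned region spans one full 16K page even though the
 * caller asked for only 32 bytes.
 */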
12088 
12089 kern_return_t
12090 vm_map_copyin_internal(
12091 	vm_map_t                src_map,
12092 	vm_map_address_ut       src_addr_u,
12093 	vm_map_size_ut          len_u,
12094 	int                     flags,
12095 	vm_map_copy_t          *copy_result)   /* OUT */
12096 {
12097 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
12098 	                                 * in multi-level lookup, this
12099 	                                 * entry contains the actual
12100 	                                 * vm_object/offset.
12101 	                                 */
12102 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
12103 
12104 	vm_map_offset_t src_start;      /* Start of current entry --
12105 	                                 * where copy is taking place now
12106 	                                 */
12107 	vm_map_offset_t src_end;        /* End of entire region to be
12108 	                                 * copied */
12109 	vm_map_offset_t src_addr_unaligned;
12110 	vm_map_offset_t src_base;
12111 	vm_map_size_t   len;
12112 	vm_map_t        base_map = src_map;
12113 	boolean_t       map_share = FALSE;
12114 	submap_map_t    *parent_maps = NULL;
12115 
12116 	vm_map_copy_t   copy;           /* Resulting copy */
12117 	vm_map_address_t copy_addr;
12118 	vm_map_size_t   copy_size;
12119 	boolean_t       src_destroy;
12120 	boolean_t       use_maxprot;
12121 	boolean_t       preserve_purgeable;
12122 	boolean_t       entry_was_shared;
12123 	vm_map_entry_t  saved_src_entry;
12124 	kern_return_t   kr;
12125 
12126 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
12127 		return KERN_INVALID_ARGUMENT;
12128 	}
12129 
12130 	/*
12131 	 *	Check for copies of zero bytes.
12132 	 */
12133 	if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
12134 		*copy_result = VM_MAP_COPY_NULL;
12135 		return KERN_SUCCESS;
12136 	}
12137 
12138 	/*
12139 	 * Sanitize any input parameters that are addr/size/prot/inherit
12140 	 */
12141 	kr = vm_map_copyin_sanitize(
12142 		src_map,
12143 		src_addr_u,
12144 		len_u,
12145 		&src_start,
12146 		&src_end,
12147 		&len,
12148 		&src_addr_unaligned);
12149 	if (__improbable(kr != KERN_SUCCESS)) {
12150 		return vm_sanitize_get_kr(kr);
12151 	}
12152 
12153 
12154 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
12155 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
12156 	preserve_purgeable =
12157 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12158 
12159 	/*
12160 	 * If the copy is sufficiently small, use a kernel buffer instead
12161 	 * of making a virtual copy.  The theory being that the cost of
12162 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
12163 	 * for small regions.
12164 	 */
12165 	if ((len <= msg_ool_size_small) &&
12166 	    !use_maxprot &&
12167 	    !preserve_purgeable &&
12168 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12169 	    /*
12170 	     * Since the "msg_ool_size_small" threshold was increased and
12171 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12172 	     * address space limits, we revert to doing a virtual copy if the
12173 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
12174 	     * of the commpage would now fail when it used to work.
12175 	     */
12176 	    (src_start >= vm_map_min(src_map) &&
12177 	    src_start < vm_map_max(src_map) &&
12178 	    src_end >= vm_map_min(src_map) &&
12179 	    src_end < vm_map_max(src_map))) {
12180 		return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12181 		           src_destroy, copy_result);
12182 	}
12183 
12184 	/*
12185 	 *	Allocate a header element for the list.
12186 	 *
12187 	 *	Use the start and end in the header to
12188 	 *	remember the endpoints prior to rounding.
12189 	 */
12190 
12191 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12192 	copy->cpy_hdr.entries_pageable = TRUE;
12193 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12194 	copy->offset = src_addr_unaligned;
12195 	copy->size = len;
12196 
12197 	new_entry = vm_map_copy_entry_create(copy);
12198 
12199 #define RETURN(x)                                               \
12200 	MACRO_BEGIN                                             \
12201 	vm_map_unlock(src_map);                                 \
12202 	if(src_map != base_map)                                 \
12203 	        vm_map_deallocate(src_map);                     \
12204 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
12205 	        vm_map_copy_entry_dispose(new_entry);           \
12206 	vm_map_copy_discard(copy);                              \
12207 	{                                                       \
12208 	        submap_map_t	*_ptr;                          \
12209                                                                 \
12210 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12211 	                parent_maps=parent_maps->next;          \
12212 	                if (_ptr->parent_map != base_map)       \
12213 	                        vm_map_deallocate(_ptr->parent_map);    \
12214 	                kfree_type(submap_map_t, _ptr);         \
12215 	        }                                               \
12216 	}                                                       \
12217 	MACRO_RETURN(x);                                        \
12218 	MACRO_END
12219 
12220 	/*
12221 	 *	Find the beginning of the region.
12222 	 */
12223 
12224 	vm_map_lock(src_map);
12225 
12226 	/*
12227 	 * Lookup the original "src_addr_unaligned" rather than the truncated
12228 	 * "src_start", in case "src_start" falls in a non-map-aligned
12229 	 * map entry *before* the map entry that contains "src_addr_unaligned"...
12230 	 */
12231 	if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12232 		RETURN(KERN_INVALID_ADDRESS);
12233 	}
12234 	if (!tmp_entry->is_sub_map) {
12235 		/*
12236 		 * ... but clip to the map-rounded "src_start" rather than
12237 		 * "src_addr_unaligned" to preserve map-alignment.  We'll adjust the
12238 		 * first copy entry at the end, if needed.
12239 		 */
12240 		vm_map_clip_start(src_map, tmp_entry, src_start);
12241 	}
12242 	if (src_start < tmp_entry->vme_start) {
12243 		/*
12244 		 * Move "src_start" up to the start of the
12245 		 * first map entry to copy.
12246 		 */
12247 		src_start = tmp_entry->vme_start;
12248 	}
12249 	/* set for later submap fix-up */
12250 	copy_addr = src_start;
12251 
12252 	/*
12253 	 *	Go through entries until we get to the end.
12254 	 */
12255 
12256 	while (TRUE) {
12257 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
12258 		vm_map_size_t   src_size;               /* Size of source
12259 		                                         * map entry (in both
12260 		                                         * maps)
12261 		                                         */
12262 
12263 		vm_object_t             src_object;     /* Object to copy */
12264 		vm_object_offset_t      src_offset;
12265 
12266 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
12267 
12268 		boolean_t       src_needs_copy;         /* Should source map
12269 		                                         * be made read-only
12270 		                                         * for copy-on-write?
12271 		                                         */
12272 
12273 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
12274 
12275 		boolean_t       was_wired;              /* Was source wired? */
12276 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
12277 		vm_map_version_t version;               /* Version before locks
12278 		                                         * dropped to make copy
12279 		                                         */
12280 		kern_return_t   result;                 /* Return value from
12281 		                                         * copy_strategically.
12282 		                                         */
12283 		while (tmp_entry->is_sub_map) {
12284 			vm_map_size_t submap_len;
12285 			submap_map_t *ptr;
12286 
12287 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
12288 			ptr->next = parent_maps;
12289 			parent_maps = ptr;
12290 			ptr->parent_map = src_map;
12291 			ptr->base_start = src_start;
12292 			ptr->base_end = src_end;
12293 			submap_len = tmp_entry->vme_end - src_start;
12294 			if (submap_len > (src_end - src_start)) {
12295 				submap_len = src_end - src_start;
12296 			}
12297 			ptr->base_len = submap_len;
12298 
12299 			src_start -= tmp_entry->vme_start;
12300 			src_start += VME_OFFSET(tmp_entry);
12301 			src_end = src_start + submap_len;
12302 			src_map = VME_SUBMAP(tmp_entry);
12303 			vm_map_lock(src_map);
12304 			/* keep an outstanding reference for all maps in */
12305 			/* the parents tree except the base map */
12306 			vm_map_reference(src_map);
12307 			vm_map_unlock(ptr->parent_map);
12308 			if (!vm_map_lookup_entry(
12309 				    src_map, src_start, &tmp_entry)) {
12310 				RETURN(KERN_INVALID_ADDRESS);
12311 			}
12312 			map_share = TRUE;
12313 			if (!tmp_entry->is_sub_map) {
12314 				vm_map_clip_start(src_map, tmp_entry, src_start);
12315 			}
12316 			src_entry = tmp_entry;
12317 		}
12318 		/* we are now in the lowest level submap... */
12319 
12320 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12321 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12322 		/* This is not supported for now. In future */
12323 			/* we will need to detect the phys_contig   */
12324 			/* condition and then upgrade copy_slowly   */
12325 			/* to do physical copy from the device mem  */
12326 			/* based object. We can piggy-back off of   */
12327 			/* the was wired boolean to set-up the      */
12328 			/* proper handling */
12329 			RETURN(KERN_PROTECTION_FAILURE);
12330 		}
12331 		/*
12332 		 *	Create a new address map entry to hold the result.
12333 		 *	Fill in the fields from the appropriate source entries.
12334 		 *	We must unlock the source map to do this if we need
12335 		 *	to allocate a map entry.
12336 		 */
12337 		if (new_entry == VM_MAP_ENTRY_NULL) {
12338 			version.main_timestamp = src_map->timestamp;
12339 			vm_map_unlock(src_map);
12340 
12341 			new_entry = vm_map_copy_entry_create(copy);
12342 
12343 			vm_map_lock(src_map);
12344 			if ((version.main_timestamp + 1) != src_map->timestamp) {
12345 				if (!vm_map_lookup_entry(src_map, src_start,
12346 				    &tmp_entry)) {
12347 					RETURN(KERN_INVALID_ADDRESS);
12348 				}
12349 				if (!tmp_entry->is_sub_map) {
12350 					vm_map_clip_start(src_map, tmp_entry, src_start);
12351 				}
12352 				continue; /* restart w/ new tmp_entry */
12353 			}
12354 		}
12355 
12356 		/*
12357 		 *	Verify that the region can be read.
12358 		 */
12359 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12360 		    !use_maxprot) ||
12361 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
12362 			RETURN(KERN_PROTECTION_FAILURE);
12363 		}
12364 
12365 		src_object = VME_OBJECT(src_entry);
12366 
12367 
12368 		/*
12369 		 *	Clip against the endpoints of the entire region.
12370 		 */
12371 
12372 		vm_map_clip_end(src_map, src_entry, src_end);
12373 
12374 		src_size = src_entry->vme_end - src_start;
12375 		src_offset = VME_OFFSET(src_entry);
12376 		was_wired = (src_entry->wired_count != 0);
12377 
12378 		vm_map_entry_copy(src_map, new_entry, src_entry);
12379 		if (new_entry->is_sub_map) {
12380 			/* clr address space specifics */
12381 			new_entry->use_pmap = FALSE;
12382 		} else {
12383 			/*
12384 			 * We're dealing with a copy-on-write operation,
12385 			 * so the resulting mapping should not inherit the
12386 			 * original mapping's accounting settings.
12387 			 * "iokit_acct" should have been cleared in
12388 			 * vm_map_entry_copy().
12389 			 * "use_pmap" should be reset to its default (TRUE)
12390 			 * so that the new mapping gets accounted for in
12391 			 * the task's memory footprint.
12392 			 */
12393 			assert(!new_entry->iokit_acct);
12394 			new_entry->use_pmap = TRUE;
12395 		}
12396 
12397 		/*
12398 		 *	Attempt non-blocking copy-on-write optimizations.
12399 		 */
12400 
12401 		/*
12402 		 * If we are destroying the source, and the object
12403 		 * is internal, we could move the object reference
12404 		 * from the source to the copy.  The copy is
12405 		 * copy-on-write only if the source is.
12406 		 * We make another reference to the object, because
12407 		 * destroying the source entry will deallocate it.
12408 		 *
12409 		 * This memory transfer has to be atomic (to prevent
12410 		 * the VM object from being shared or copied while
12411 		 * it's being moved here), so we could only do this
12412 		 * if we won't have to unlock the VM map until the
12413 		 * original mapping has been fully removed.
12414 		 */
12415 
12416 RestartCopy:
12417 		if ((src_object == VM_OBJECT_NULL ||
12418 		    (!was_wired && !map_share && !tmp_entry->is_shared
12419 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12420 		    vm_object_copy_quickly(
12421 			    VME_OBJECT(new_entry),
12422 			    src_offset,
12423 			    src_size,
12424 			    &src_needs_copy,
12425 			    &new_entry_needs_copy)) {
12426 			new_entry->needs_copy = new_entry_needs_copy;
12427 
12428 			/*
12429 			 *	Handle copy-on-write obligations
12430 			 */
12431 
12432 			if (src_needs_copy && !tmp_entry->needs_copy) {
12433 				vm_prot_t prot;
12434 
12435 				prot = src_entry->protection & ~VM_PROT_WRITE;
12436 
12437 				if (override_nx(src_map, VME_ALIAS(src_entry))
12438 				    && prot) {
12439 					prot |= VM_PROT_EXECUTE;
12440 				}
12441 
12442 				vm_object_pmap_protect(
12443 					src_object,
12444 					src_offset,
12445 					src_size,
12446 					(src_entry->is_shared ?
12447 					PMAP_NULL
12448 					: src_map->pmap),
12449 					VM_MAP_PAGE_SIZE(src_map),
12450 					src_entry->vme_start,
12451 					prot);
12452 
12453 				assert(tmp_entry->wired_count == 0);
12454 				tmp_entry->needs_copy = TRUE;
12455 			}
12456 
12457 			/*
12458 			 *	The map has never been unlocked, so it's safe
12459 			 *	to move to the next entry rather than doing
12460 			 *	another lookup.
12461 			 */
12462 
12463 			goto CopySuccessful;
12464 		}
12465 
12466 		entry_was_shared = tmp_entry->is_shared;
12467 
12468 		/*
12469 		 *	Take an object reference, so that we may
12470 		 *	release the map lock(s).
12471 		 */
12472 
12473 		assert(src_object != VM_OBJECT_NULL);
12474 		vm_object_reference(src_object);
12475 
12476 		/*
12477 		 *	Record the timestamp for later verification.
12478 		 *	Unlock the map.
12479 		 */
12480 
12481 		version.main_timestamp = src_map->timestamp;
12482 		vm_map_unlock(src_map); /* Increments timestamp once! */
12483 		saved_src_entry = src_entry;
12484 		tmp_entry = VM_MAP_ENTRY_NULL;
12485 		src_entry = VM_MAP_ENTRY_NULL;
12486 
12487 		/*
12488 		 *	Perform the copy
12489 		 */
12490 
12491 		if (was_wired ||
12492 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12493 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12494 		    (debug4k_no_cow_copyin &&
12495 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12496 CopySlowly:
12497 			vm_object_lock(src_object);
12498 			result = vm_object_copy_slowly(
12499 				src_object,
12500 				src_offset,
12501 				src_size,
12502 				THREAD_UNINT,
12503 				&new_copy_object);
12504 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12505 			saved_used_for_jit = new_entry->used_for_jit;
12506 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12507 			new_entry->used_for_jit = saved_used_for_jit;
12508 			VME_OFFSET_SET(new_entry,
12509 			    src_offset - vm_object_trunc_page(src_offset));
12510 			new_entry->needs_copy = FALSE;
12511 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12512 		    (entry_was_shared || map_share)) {
12513 			vm_object_t new_object;
12514 
12515 			vm_object_lock_shared(src_object);
12516 			new_object = vm_object_copy_delayed(
12517 				src_object,
12518 				src_offset,
12519 				src_size,
12520 				TRUE);
12521 			if (new_object == VM_OBJECT_NULL) {
12522 				goto CopySlowly;
12523 			}
12524 
12525 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12526 			assert(new_entry->wired_count == 0);
12527 			new_entry->needs_copy = TRUE;
12528 			assert(!new_entry->iokit_acct);
12529 			assert(new_object->purgable == VM_PURGABLE_DENY);
12530 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12531 			result = KERN_SUCCESS;
12532 		} else {
12533 			vm_object_offset_t new_offset;
12534 			new_offset = VME_OFFSET(new_entry);
12535 			result = vm_object_copy_strategically(src_object,
12536 			    src_offset,
12537 			    src_size,
12538 			    (flags & VM_MAP_COPYIN_FORK),
12539 			    &new_copy_object,
12540 			    &new_offset,
12541 			    &new_entry_needs_copy);
12542 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12543 			saved_used_for_jit = new_entry->used_for_jit;
12544 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12545 			new_entry->used_for_jit = saved_used_for_jit;
12546 			if (new_offset != VME_OFFSET(new_entry)) {
12547 				VME_OFFSET_SET(new_entry, new_offset);
12548 			}
12549 
12550 			new_entry->needs_copy = new_entry_needs_copy;
12551 		}
12552 
12553 		if (result == KERN_SUCCESS &&
12554 		    ((preserve_purgeable &&
12555 		    src_object->purgable != VM_PURGABLE_DENY) ||
12556 		    new_entry->used_for_jit)) {
12557 			/*
12558 			 * Purgeable objects should be COPY_NONE, true share;
12559 			 * this should be propagated to the copy.
12560 			 *
12561 			 * Also force mappings the pmap specially protects to
12562 			 * be COPY_NONE; trying to COW these mappings would
12563 			 * change the effective protections, which could have
12564 			 * side effects if the pmap layer relies on the
12565 			 * specified protections.
12566 			 */
12567 
12568 			vm_object_t     new_object;
12569 
12570 			new_object = VME_OBJECT(new_entry);
12571 			assert(new_object != src_object);
12572 			vm_object_lock(new_object);
12573 			assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12574 			assert(new_object->shadow == VM_OBJECT_NULL);
12575 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12576 			assert(new_object->vo_owner == NULL);
12577 
12578 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12579 
12580 			if (preserve_purgeable &&
12581 			    src_object->purgable != VM_PURGABLE_DENY) {
12582 				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12583 
12584 				/* start as non-volatile with no owner... */
12585 				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12586 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12587 				/* ... and move to src_object's purgeable state */
12588 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12589 					int state;
12590 					state = src_object->purgable;
12591 					vm_object_purgable_control(
12592 						new_object,
12593 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12594 						&state);
12595 				}
12596 				/* no pmap accounting for purgeable objects */
12597 				new_entry->use_pmap = FALSE;
12598 			}
12599 
12600 			vm_object_unlock(new_object);
12601 			new_object = VM_OBJECT_NULL;
12602 		}
12603 
12604 		/*
12605 		 *	Throw away the extra reference
12606 		 */
12607 
12608 		vm_object_deallocate(src_object);
12609 
12610 		if (result != KERN_SUCCESS &&
12611 		    result != KERN_MEMORY_RESTART_COPY) {
12612 			vm_map_lock(src_map);
12613 			RETURN(result);
12614 		}
12615 
12616 		/*
12617 		 *	Verify that the map has not substantially
12618 		 *	changed while the copy was being made.
12619 		 */
12620 
12621 		vm_map_lock(src_map);
12622 
12623 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12624 			/* src_map hasn't changed: src_entry is still valid */
12625 			src_entry = saved_src_entry;
12626 			goto VerificationSuccessful;
12627 		}
12628 
12629 		/*
12630 		 *	Simple version comparison failed.
12631 		 *
12632 		 *	Retry the lookup and verify that the
12633 		 *	same object/offset are still present.
12634 		 *
12635 		 *	[Note: a memory manager that colludes with
12636 		 *	the calling task can detect that we have
12637 		 *	cheated.  While the map was unlocked, the
12638 		 *	mapping could have been changed and restored.]
12639 		 */
12640 
12641 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12642 			if (result != KERN_MEMORY_RESTART_COPY) {
12643 				vm_object_deallocate(VME_OBJECT(new_entry));
12644 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12645 				/* reset accounting state */
12646 				new_entry->iokit_acct = FALSE;
12647 				new_entry->use_pmap = TRUE;
12648 			}
12649 			RETURN(KERN_INVALID_ADDRESS);
12650 		}
12651 
12652 		src_entry = tmp_entry;
12653 		vm_map_clip_start(src_map, src_entry, src_start);
12654 
12655 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12656 		    !use_maxprot) ||
12657 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12658 			goto VerificationFailed;
12659 		}
12660 
12661 		if (src_entry->vme_end < new_entry->vme_end) {
12662 			/*
12663 			 * This entry might have been shortened
12664 			 * (vm_map_clip_end) or been replaced with
12665 			 * an entry that ends closer to "src_start"
12666 			 * than before.
12667 			 * Adjust "new_entry" accordingly; copying
12668 			 * less memory would be correct but we also
12669 			 * redo the copy (see below) if the new entry
12670 			 * no longer points at the same object/offset.
12671 			 */
12672 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12673 			    VM_MAP_COPY_PAGE_MASK(copy)));
12674 			new_entry->vme_end = src_entry->vme_end;
12675 			src_size = new_entry->vme_end - src_start;
12676 		} else if (src_entry->vme_end > new_entry->vme_end) {
12677 			/*
12678 			 * This entry might have been extended
12679 			 * (vm_map_entry_simplify() or coalesce)
12680 			 * or been replaced with an entry that ends farther
12681 			 * from "src_start" than before.
12682 			 *
12683 			 * We've called vm_object_copy_*() only on
12684 			 * the previous <start:end> range, so we can't
12685 			 * just extend new_entry.  We have to re-do
12686 			 * the copy based on the new entry as if it was
12687 			 * pointing at a different object/offset (see
12688 			 * "Verification failed" below).
12689 			 */
12690 		}
12691 
12692 		if ((VME_OBJECT(src_entry) != src_object) ||
12693 		    (VME_OFFSET(src_entry) != src_offset) ||
12694 		    (src_entry->vme_end > new_entry->vme_end)) {
12695 			/*
12696 			 *	Verification failed.
12697 			 *
12698 			 *	Start over with this top-level entry.
12699 			 */
12700 
12701 VerificationFailed:     ;
12702 
12703 			vm_object_deallocate(VME_OBJECT(new_entry));
12704 			tmp_entry = src_entry;
12705 			continue;
12706 		}
12707 
12708 		/*
12709 		 *	Verification succeeded.
12710 		 */
12711 
12712 VerificationSuccessful:;
12713 
12714 		if (result == KERN_MEMORY_RESTART_COPY) {
12715 			goto RestartCopy;
12716 		}
12717 
12718 		/*
12719 		 *	Copy succeeded.
12720 		 */
12721 
12722 CopySuccessful: ;
12723 
12724 		/*
12725 		 *	Link in the new copy entry.
12726 		 */
12727 
12728 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12729 		    new_entry);
12730 
12731 		/*
12732 		 *	Determine whether the entire region
12733 		 *	has been copied.
12734 		 */
12735 		src_base = src_start;
12736 		src_start = new_entry->vme_end;
12737 		new_entry = VM_MAP_ENTRY_NULL;
12738 		while ((src_start >= src_end) && (src_end != 0)) {
12739 			submap_map_t    *ptr;
12740 
12741 			if (src_map == base_map) {
12742 				/* back to the top */
12743 				break;
12744 			}
12745 
12746 			ptr = parent_maps;
12747 			assert(ptr != NULL);
12748 			parent_maps = parent_maps->next;
12749 
12750 			/* fix up the damage we did in that submap */
12751 			vm_map_simplify_range(src_map,
12752 			    src_base,
12753 			    src_end);
12754 
12755 			vm_map_unlock(src_map);
12756 			vm_map_deallocate(src_map);
12757 			vm_map_lock(ptr->parent_map);
12758 			src_map = ptr->parent_map;
12759 			src_base = ptr->base_start;
12760 			src_start = ptr->base_start + ptr->base_len;
12761 			src_end = ptr->base_end;
12762 			if (!vm_map_lookup_entry(src_map,
12763 			    src_start,
12764 			    &tmp_entry) &&
12765 			    (src_end > src_start)) {
12766 				RETURN(KERN_INVALID_ADDRESS);
12767 			}
12768 			kfree_type(submap_map_t, ptr);
12769 			if (parent_maps == NULL) {
12770 				map_share = FALSE;
12771 			}
12772 			src_entry = tmp_entry->vme_prev;
12773 		}
12774 
12775 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12776 		    (src_start >= src_addr_unaligned + len) &&
12777 		    (src_addr_unaligned + len != 0)) {
12778 			/*
12779 			 * Stop copying now, even though we haven't reached
12780 			 * "src_end".  We'll adjust the end of the last copy
12781 			 * entry at the end, if needed.
12782 			 *
12783 			 * If src_map's alignment is different from the
12784 			 * system's page-alignment, there could be
12785 			 * extra non-map-aligned map entries between
12786 			 * the original (non-rounded) "src_addr_unaligned + len"
12787 			 * and the rounded "src_end".
12788 			 * We do not want to copy those map entries since
12789 			 * they're not part of the copied range.
12790 			 */
12791 			break;
12792 		}
12793 
12794 		if ((src_start >= src_end) && (src_end != 0)) {
12795 			break;
12796 		}
12797 
12798 		/*
12799 		 *	Verify that there are no gaps in the region
12800 		 */
12801 
12802 		tmp_entry = src_entry->vme_next;
12803 		if ((tmp_entry->vme_start != src_start) ||
12804 		    (tmp_entry == vm_map_to_entry(src_map))) {
12805 			RETURN(KERN_INVALID_ADDRESS);
12806 		}
12807 	}
12808 
12809 	/*
12810 	 * If the source should be destroyed, do it now, since the
12811 	 * copy was successful.
12812 	 */
12813 	if (src_destroy) {
12814 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12815 
12816 		if (src_map == kernel_map) {
12817 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12818 		}
12819 		(void)vm_map_remove_and_unlock(src_map,
12820 		    vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12821 		    src_end,
12822 		    remove_flags,
12823 		    KMEM_GUARD_NONE);
12824 	} else {
12825 		/* fix up the damage we did in the base map */
12826 		vm_map_simplify_range(
12827 			src_map,
12828 			vm_map_trunc_page(src_addr_unaligned,
12829 			VM_MAP_PAGE_MASK(src_map)),
12830 			vm_map_round_page(src_end,
12831 			VM_MAP_PAGE_MASK(src_map)));
12832 		vm_map_unlock(src_map);
12833 	}
12834 
12835 	tmp_entry = VM_MAP_ENTRY_NULL;
12836 
12837 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12838 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12839 		vm_map_offset_t original_start, original_offset, original_end;
12840 
12841 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12842 
12843 		/* adjust alignment of first copy_entry's "vme_start" */
12844 		tmp_entry = vm_map_copy_first_entry(copy);
12845 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12846 			vm_map_offset_t adjustment;
12847 
12848 			original_start = tmp_entry->vme_start;
12849 			original_offset = VME_OFFSET(tmp_entry);
12850 
12851 			/* map-align the start of the first copy entry... */
12852 			adjustment = (tmp_entry->vme_start -
12853 			    vm_map_trunc_page(
12854 				    tmp_entry->vme_start,
12855 				    VM_MAP_PAGE_MASK(src_map)));
12856 			tmp_entry->vme_start -= adjustment;
12857 			VME_OFFSET_SET(tmp_entry,
12858 			    VME_OFFSET(tmp_entry) - adjustment);
12859 			copy_addr -= adjustment;
12860 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12861 			/* ... adjust for mis-aligned start of copy range */
12862 			adjustment =
12863 			    (vm_map_trunc_page(copy->offset,
12864 			    PAGE_MASK) -
12865 			    vm_map_trunc_page(copy->offset,
12866 			    VM_MAP_PAGE_MASK(src_map)));
12867 			if (adjustment) {
12868 				assert(page_aligned(adjustment));
12869 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12870 				tmp_entry->vme_start += adjustment;
12871 				VME_OFFSET_SET(tmp_entry,
12872 				    (VME_OFFSET(tmp_entry) +
12873 				    adjustment));
12874 				copy_addr += adjustment;
12875 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12876 			}
12877 
12878 			/*
12879 			 * Assert that the adjustments haven't exposed
12880 			 * more than was originally copied...
12881 			 */
12882 			assert(tmp_entry->vme_start >= original_start);
12883 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12884 			/*
12885 			 * ... and that it did not adjust outside of
12886 			 * a single 16K page.
12887 			 */
12888 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12889 			    VM_MAP_PAGE_MASK(src_map)) ==
12890 			    vm_map_trunc_page(original_start,
12891 			    VM_MAP_PAGE_MASK(src_map)));
12892 		}
12893 
12894 		/* adjust alignment of last copy_entry's "vme_end" */
12895 		tmp_entry = vm_map_copy_last_entry(copy);
12896 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12897 			vm_map_offset_t adjustment;
12898 
12899 			original_end = tmp_entry->vme_end;
12900 
12901 			/* map-align the end of the last copy entry... */
12902 			tmp_entry->vme_end =
12903 			    vm_map_round_page(tmp_entry->vme_end,
12904 			    VM_MAP_PAGE_MASK(src_map));
12905 			/* ... adjust for mis-aligned end of copy range */
12906 			adjustment =
12907 			    (vm_map_round_page((copy->offset +
12908 			    copy->size),
12909 			    VM_MAP_PAGE_MASK(src_map)) -
12910 			    vm_map_round_page((copy->offset +
12911 			    copy->size),
12912 			    PAGE_MASK));
12913 			if (adjustment) {
12914 				assert(page_aligned(adjustment));
12915 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12916 				tmp_entry->vme_end -= adjustment;
12917 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12918 			}
12919 
12920 			/*
12921 			 * Assert that the adjustments haven't exposed
12922 			 * more than was originally copied...
12923 			 */
12924 			assert(tmp_entry->vme_end <= original_end);
12925 			/*
12926 			 * ... and that it did not adjust outside of
12927 			 * a single 16K page.
12928 			 */
12929 			assert(vm_map_round_page(tmp_entry->vme_end,
12930 			    VM_MAP_PAGE_MASK(src_map)) ==
12931 			    vm_map_round_page(original_end,
12932 			    VM_MAP_PAGE_MASK(src_map)));
12933 		}
12934 	}
12935 
12936 	/* Fix-up start and end points in copy.  This is necessary */
12937 	/* when the various entries in the copy object were picked */
12938 	/* up from different sub-maps */
12939 
12940 	tmp_entry = vm_map_copy_first_entry(copy);
12941 	copy_size = 0; /* compute actual size */
12942 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12943 		assert(VM_MAP_PAGE_ALIGNED(
12944 			    copy_addr + (tmp_entry->vme_end -
12945 			    tmp_entry->vme_start),
12946 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12947 		assert(VM_MAP_PAGE_ALIGNED(
12948 			    copy_addr,
12949 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12950 
12951 		/*
12952 		 * The copy_entries will be injected directly into the
12953 		 * destination map and might not be "map aligned" there...
12954 		 */
12955 		tmp_entry->map_aligned = FALSE;
12956 
12957 		tmp_entry->vme_end = copy_addr +
12958 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12959 		tmp_entry->vme_start = copy_addr;
12960 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12961 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12962 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12963 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12964 	}
12965 
12966 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12967 	    copy_size < copy->size) {
12968 		/*
12969 		 * The actual size of the VM map copy is smaller than what
12970 		 * was requested by the caller.  This must be because some
12971 		 * PAGE_SIZE-sized pages are missing at the end of the last
12972 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12973 		 * The caller might not have been aware of those missing
12974 		 * pages and might not want to be aware of it, which is
12975 		 * fine as long as they don't try to access (and crash on)
12976 		 * those missing pages.
12977 		 * Let's adjust the size of the "copy", to avoid failing
12978 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12979 		 */
12980 		assert(vm_map_round_page(copy_size,
12981 		    VM_MAP_PAGE_MASK(src_map)) ==
12982 		    vm_map_round_page(copy->size,
12983 		    VM_MAP_PAGE_MASK(src_map)));
12984 		copy->size = copy_size;
12985 	}
12986 
12987 	*copy_result = copy;
12988 	return KERN_SUCCESS;
12989 
12990 #undef  RETURN
12991 }
12992 
12993 kern_return_t
12994 vm_map_copy_extract(
12995 	vm_map_t                src_map,
12996 	vm_map_address_t        src_addr,
12997 	vm_map_size_t           len,
12998 	boolean_t               do_copy,
12999 	vm_map_copy_t           *copy_result,   /* OUT */
13000 	vm_prot_t               *cur_prot,      /* IN/OUT */
13001 	vm_prot_t               *max_prot,      /* IN/OUT */
13002 	vm_inherit_t            inheritance,
13003 	vm_map_kernel_flags_t   vmk_flags)
13004 {
13005 	vm_map_copy_t   copy;
13006 	kern_return_t   kr;
13007 	vm_prot_t required_cur_prot, required_max_prot;
13008 
13009 	/*
13010 	 *	Check for copies of zero bytes.
13011 	 */
13012 
13013 	if (len == 0) {
13014 		*copy_result = VM_MAP_COPY_NULL;
13015 		return KERN_SUCCESS;
13016 	}
13017 
13018 	/*
13019 	 *	Check that the end address doesn't overflow
13020 	 */
13021 	if (src_addr + len < src_addr) {
13022 		return KERN_INVALID_ADDRESS;
13023 	}
13024 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
13025 		return KERN_INVALID_ADDRESS;
13026 	}
13027 
13028 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
13029 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
13030 	}
13031 
13032 	required_cur_prot = *cur_prot;
13033 	required_max_prot = *max_prot;
13034 
13035 	/*
13036 	 *	Allocate a header element for the list.
13037 	 *
13038 	 *	Use the start and end in the header to
13039 	 *	remember the endpoints prior to rounding.
13040 	 */
13041 
13042 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
13043 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
13044 	copy->offset = 0;
13045 	copy->size = len;
13046 
13047 	kr = vm_map_remap_extract(src_map,
13048 	    src_addr,
13049 	    len,
13050 	    do_copy,             /* copy */
13051 	    copy,
13052 	    cur_prot,            /* IN/OUT */
13053 	    max_prot,            /* IN/OUT */
13054 	    inheritance,
13055 	    vmk_flags);
13056 	if (kr != KERN_SUCCESS) {
13057 		vm_map_copy_discard(copy);
13058 		if ((kr == KERN_INVALID_ADDRESS ||
13059 		    kr == KERN_INVALID_ARGUMENT) &&
13060 		    src_map->terminated) {
13061 			/* tell the caller that this address space is gone */
13062 			kr = KERN_TERMINATED;
13063 		}
13064 		return kr;
13065 	}
13066 	if (required_cur_prot != VM_PROT_NONE) {
13067 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
13068 		assert((*max_prot & required_max_prot) == required_max_prot);
13069 	}
13070 
13071 	*copy_result = copy;
13072 	return KERN_SUCCESS;
13073 }
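
/*
 * Illustrative sketch only (not compiled): a hypothetical caller of
 * vm_map_copy_extract() that requires at least read access.  The IN/OUT
 * protection parameters carry the required protections in and report the
 * effective protections back out.  Error handling is minimal.
 */
#if 0
static kern_return_t
vm_map_copy_extract_example(
	vm_map_t                src_map,
	vm_map_address_t        src_addr,
	vm_map_size_t           len,
	vm_map_copy_t          *copy_result)   /* OUT */
{
	vm_prot_t       cur_prot = VM_PROT_READ;        /* required minimum */
	vm_prot_t       max_prot = VM_PROT_READ;        /* required minimum */

	return vm_map_copy_extract(src_map, src_addr, len,
	           FALSE,                       /* do_copy: share, don't copy */
	           copy_result,
	           &cur_prot,                   /* IN/OUT */
	           &max_prot,                   /* IN/OUT */
	           VM_INHERIT_DEFAULT,
	           VM_MAP_KERNEL_FLAGS_NONE);
}
#endif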
13074 
13075 static void
13076 vm_map_fork_share(
13077 	vm_map_t        old_map,
13078 	vm_map_entry_t  old_entry,
13079 	vm_map_t        new_map)
13080 {
13081 	vm_object_t     object;
13082 	vm_map_entry_t  new_entry;
13083 
13084 	/*
13085 	 *	New sharing code.  New map entry
13086 	 *	references original object.  Internal
13087 	 *	objects use asynchronous copy algorithm for
13088 	 *	future copies.  First make sure we have
13089 	 *	the right object.  If we need a shadow,
13090 	 *	or someone else already has one, then
13091 	 *	make a new shadow and share it.
13092 	 */
13093 
13094 	if (!old_entry->is_sub_map) {
13095 		object = VME_OBJECT(old_entry);
13096 	}
13097 
13098 	if (old_entry->is_sub_map) {
13099 		assert(old_entry->wired_count == 0);
13100 #ifndef NO_NESTED_PMAP
13101 #if !PMAP_FORK_NEST
13102 		if (old_entry->use_pmap) {
13103 			kern_return_t   result;
13104 
13105 			result = pmap_nest(new_map->pmap,
13106 			    (VME_SUBMAP(old_entry))->pmap,
13107 			    (addr64_t)old_entry->vme_start,
13108 			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
13109 			if (result) {
13110 				panic("vm_map_fork_share: pmap_nest failed!");
13111 			}
13112 		}
13113 #endif /* !PMAP_FORK_NEST */
13114 #endif  /* NO_NESTED_PMAP */
13115 	} else if (object == VM_OBJECT_NULL) {
13116 		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
13117 		    old_entry->vme_start), old_map->serial_id);
13118 		VME_OFFSET_SET(old_entry, 0);
13119 		VME_OBJECT_SET(old_entry, object, false, 0);
13120 		old_entry->use_pmap = TRUE;
13121 //		assert(!old_entry->needs_copy);
13122 	} else if (object->copy_strategy !=
13123 	    MEMORY_OBJECT_COPY_SYMMETRIC) {
13124 		/*
13125 		 *	We are already using an asymmetric
13126 		 *	copy, and therefore we already have
13127 		 *	the right object.
13128 		 */
13129 
13130 		assert(!old_entry->needs_copy);
13131 	} else if (old_entry->needs_copy ||       /* case 1 */
13132 	    object->shadowed ||                 /* case 2 */
13133 	    (!object->true_share &&             /* case 3 */
13134 	    !old_entry->is_shared &&
13135 	    (object->vo_size >
13136 	    (vm_map_size_t)(old_entry->vme_end -
13137 	    old_entry->vme_start)))) {
13138 		bool is_writable;
13139 
13140 		/*
13141 		 *	We need to create a shadow.
13142 		 *	There are three cases here.
13143 		 *	In the first case, we need to
13144 		 *	complete a deferred symmetrical
13145 		 *	copy that we participated in.
13146 		 *	In the second and third cases,
13147 		 *	we need to create the shadow so
13148 		 *	that changes that we make to the
13149 		 *	object do not interfere with
13150 		 *	any symmetrical copies which
13151 		 *	have occurred (case 2) or which
13152 		 *	might occur (case 3).
13153 		 *
13154 		 *	The first case is when we had
13155 		 *	deferred shadow object creation
13156 		 *	via the entry->needs_copy mechanism.
13157 		 *	This mechanism only works when
13158 		 *	only one entry points to the source
13159 		 *	object, and we are about to create
13160 		 *	a second entry pointing to the
13161 		 *	same object. The problem is that
13162 		 *	there is no way of mapping from
13163 		 *	an object to the entries pointing
13164 		 *	to it. (Deferred shadow creation
13165 		 *	works with one entry because it occurs
13166 		 *	at fault time, and we walk from the
13167 		 *	entry to the object when handling
13168 		 *	the fault.)
13169 		 *
13170 		 *	The second case is when the object
13171 		 *	to be shared has already been copied
13172 		 *	with a symmetric copy, but we point
13173 		 *	directly to the object without
13174 		 *	needs_copy set in our entry. (This
13175 		 *	can happen because different ranges
13176 		 *	of an object can be pointed to by
13177 		 *	different entries. In particular,
13178 		 *	a single entry pointing to an object
13179 		 *	can be split by a call to vm_inherit,
13180 		 *	which, combined with task_create, can
13181 		 *	result in the different entries
13182 		 *	having different needs_copy values.)
13183 		 *	The shadowed flag in the object allows
13184 		 *	us to detect this case. The problem
13185 		 *	with this case is that if this object
13186 		 *	has or will have shadows, then we
13187 		 *	must not perform an asymmetric copy
13188 		 *	of this object, since such a copy
13189 		 *	allows the object to be changed, which
13190 		 *	will break the previous symmetrical
13191 		 *	copies (which rely upon the object
13192 		 *	not changing). In a sense, the shadowed
13193 		 *	flag says "don't change this object".
13194 		 *	We fix this by creating a shadow
13195 		 *	object for this object, and sharing
13196 		 *	that. This works because we are free
13197 		 *	to change the shadow object (and thus
13198 		 *	to use an asymmetric copy strategy);
13199 		 *	this is also semantically correct,
13200 		 *	since this object is temporary, and
13201 		 *	therefore a copy of the object is
13202 		 *	as good as the object itself. (This
13203 		 *	is not true for permanent objects,
13204 		 *	since the pager needs to see changes,
13205 		 *	which won't happen if the changes
13206 		 *	are made to a copy.)
13207 		 *
13208 		 *	The third case is when the object
13209 		 *	to be shared has parts sticking
13210 		 *	outside of the entry we're working
13211 		 *	with, and thus may in the future
13212 		 *	be subject to a symmetrical copy.
13213 		 *	(This is a preemptive version of
13214 		 *	case 2.)
13215 		 */
13216 		VME_OBJECT_SHADOW(old_entry,
13217 		    (vm_map_size_t) (old_entry->vme_end -
13218 		    old_entry->vme_start),
13219 		    vm_map_always_shadow(old_map));
13220 
13221 		/*
13222 		 *	If we're making a shadow for other than
13223 		 *	copy on write reasons, then we have
13224 		 *	to remove write permission.
13225 		 */
13226 
13227 		is_writable = false;
13228 		if (old_entry->protection & VM_PROT_WRITE) {
13229 			is_writable = true;
13230 #if __arm64e__
13231 		} else if (old_entry->used_for_tpro) {
13232 			is_writable = true;
13233 #endif /* __arm64e__ */
13234 		}
13235 		if (!old_entry->needs_copy && is_writable) {
13236 			vm_prot_t prot;
13237 
13238 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13239 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13240 				    __FUNCTION__, old_map, old_map->pmap,
13241 				    old_entry,
13242 				    (uint64_t)old_entry->vme_start,
13243 				    (uint64_t)old_entry->vme_end,
13244 				    old_entry->protection);
13245 			}
13246 
13247 			prot = old_entry->protection & ~VM_PROT_WRITE;
13248 
13249 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13250 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13251 				    __FUNCTION__, old_map, old_map->pmap,
13252 				    old_entry,
13253 				    (uint64_t)old_entry->vme_start,
13254 				    (uint64_t)old_entry->vme_end,
13255 				    prot);
13256 			}
13257 
13258 			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13259 				prot |= VM_PROT_EXECUTE;
13260 			}
13261 
13262 
13263 			if (old_map->mapped_in_other_pmaps) {
13264 				vm_object_pmap_protect(
13265 					VME_OBJECT(old_entry),
13266 					VME_OFFSET(old_entry),
13267 					(old_entry->vme_end -
13268 					old_entry->vme_start),
13269 					PMAP_NULL,
13270 					PAGE_SIZE,
13271 					old_entry->vme_start,
13272 					prot);
13273 			} else {
13274 				pmap_protect(old_map->pmap,
13275 				    old_entry->vme_start,
13276 				    old_entry->vme_end,
13277 				    prot);
13278 			}
13279 		}
13280 
13281 		old_entry->needs_copy = FALSE;
13282 		object = VME_OBJECT(old_entry);
13283 	}
13284 
13285 
13286 	/*
13287 	 *	If object was using a symmetric copy strategy,
13288 	 *	change its copy strategy to the default
13289 	 *	asymmetric copy strategy, which is copy_delay
13290 	 *	in the non-norma case and copy_call in the
13291 	 *	norma case. Bump the reference count for the
13292 	 *	new entry.
13293 	 */
13294 
13295 	if (old_entry->is_sub_map) {
13296 		vm_map_reference(VME_SUBMAP(old_entry));
13297 	} else {
13298 		vm_object_lock(object);
13299 		vm_object_reference_locked(object);
13300 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13301 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13302 		}
13303 		vm_object_unlock(object);
13304 	}
13305 
13306 	/*
13307 	 *	Clone the entry, using object ref from above.
13308 	 *	Mark both entries as shared.
13309 	 */
13310 
13311 	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13312 	vm_map_entry_copy(old_map, new_entry, old_entry);
13313 	old_entry->is_shared = TRUE;
13314 	new_entry->is_shared = TRUE;
13315 
13316 	/*
13317 	 * We're dealing with a shared mapping, so the resulting mapping
13318 	 * should inherit some of the original mapping's accounting settings.
13319 	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13320 	 * "use_pmap" should stay the same as before (if it hasn't been reset
13321 	 * to TRUE when we cleared "iokit_acct").
13322 	 */
13323 	assert(!new_entry->iokit_acct);
13324 
13325 	/*
13326 	 *	If the old entry's inheritance is VM_INHERIT_NONE,
13327 	 *	the new entry is for a corpse fork; remove the
13328 	 *	write permission from the new entry.
13329 	 */
13330 	if (old_entry->inheritance == VM_INHERIT_NONE) {
13331 		new_entry->protection &= ~VM_PROT_WRITE;
13332 		new_entry->max_protection &= ~VM_PROT_WRITE;
13333 	}
13334 
13335 	/*
13336 	 *	Insert the entry into the new map -- we
13337 	 *	know we're inserting at the end of the new
13338 	 *	map.
13339 	 */
13340 
13341 	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13342 	    VM_MAP_KERNEL_FLAGS_NONE);
13343 
13344 	/*
13345 	 *	Update the physical map
13346 	 */
13347 
13348 	if (old_entry->is_sub_map) {
13349 		/* Bill Angell pmap support goes here */
13350 	} else {
13351 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13352 		    old_entry->vme_end - old_entry->vme_start,
13353 		    old_entry->vme_start);
13354 	}
13355 }
13356 
13357 static boolean_t
13358 vm_map_fork_copy(
13359 	vm_map_t        old_map,
13360 	vm_map_entry_t  *old_entry_p,
13361 	vm_map_t        new_map,
13362 	int             vm_map_copyin_flags)
13363 {
13364 	vm_map_entry_t old_entry = *old_entry_p;
13365 	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13366 	vm_map_offset_t start = old_entry->vme_start;
13367 	vm_map_copy_t copy;
13368 	vm_map_entry_t last = vm_map_last_entry(new_map);
13369 
13370 	vm_map_unlock(old_map);
13371 	/*
13372 	 *	Use maxprot version of copyin because we
13373 	 *	care about whether this memory can ever
13374 	 *	be accessed, not just whether it's accessible
13375 	 *	right now.
13376 	 */
13377 	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13378 	if (vm_map_copyin_internal(old_map, start, entry_size,
13379 	    vm_map_copyin_flags, &copy)
13380 	    != KERN_SUCCESS) {
13381 		/*
13382 		 *	The map might have changed while it
13383 		 *	was unlocked, check it again.  Skip
13384 		 *	any blank space or permanently
13385 		 *	unreadable region.
13386 		 */
13387 		vm_map_lock(old_map);
13388 		if (!vm_map_lookup_entry(old_map, start, &last) ||
13389 		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13390 			last = last->vme_next;
13391 		}
13392 		*old_entry_p = last;
13393 
13394 		/*
13395 		 * XXX	For some error returns, want to
13396 		 * XXX	skip to the next element.  Note
13397 		 *	that INVALID_ADDRESS and
13398 		 *	PROTECTION_FAILURE are handled above.
13399 		 */
13400 
13401 		return FALSE;
13402 	}
13403 
13404 	/*
13405 	 * Assert that the vm_map_copy is coming from the right
13406 	 * zone and hasn't been forged
13407 	 */
13408 	vm_map_copy_require(copy);
13409 
13410 	/*
13411 	 *	Insert the copy into the new map
13412 	 */
13413 	vm_map_copy_insert(new_map, last, copy);
13414 
13415 	/*
13416 	 *	Pick up the traversal at the end of
13417 	 *	the copied region.
13418 	 */
13419 
13420 	vm_map_lock(old_map);
13421 	start += entry_size;
13422 	if (!vm_map_lookup_entry(old_map, start, &last)) {
13423 		last = last->vme_next;
13424 	} else {
13425 		if (last->vme_start == start) {
13426 			/*
13427 			 * No need to clip here and we don't
13428 			 * want to cause any unnecessary
13429 			 * unnesting...
13430 			 */
13431 		} else {
13432 			vm_map_clip_start(old_map, last, start);
13433 		}
13434 	}
13435 	*old_entry_p = last;
13436 
13437 	return TRUE;
13438 }
13439 
13440 #if PMAP_FORK_NEST
13441 #define PMAP_FORK_NEST_DEBUG 0
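/*
 *	vm_map_fork_unnest:
 *
 *	Undo pre-nesting in the child's pmap for whatever part of
 *	[start, end) overlaps the pre-nested range
 *	[pre_nested_start, pre_nested_end).  The overlapping range is
 *	rounded outward to the pmap's shared-region nesting granularity
 *	(pmap_shared_region_size_min()) before calling pmap_unnest(), so
 *	the unnest always covers whole nesting-size chunks.
 */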
13442 static inline void
13443 vm_map_fork_unnest(
13444 	pmap_t new_pmap,
13445 	vm_map_offset_t pre_nested_start,
13446 	vm_map_offset_t pre_nested_end,
13447 	vm_map_offset_t start,
13448 	vm_map_offset_t end)
13449 {
13450 	kern_return_t kr;
13451 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13452 
13453 	assertf(pre_nested_start <= pre_nested_end,
13454 	    "pre_nested start 0x%llx end 0x%llx",
13455 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13456 	assertf(start <= end,
13457 	    "start 0x%llx end 0x%llx",
13458 	    (uint64_t) start, (uint64_t)end);
13459 
13460 	if (pre_nested_start == pre_nested_end) {
13461 		/* nothing was pre-nested: done */
13462 		return;
13463 	}
13464 	if (end <= pre_nested_start) {
13465 		/* fully before pre-nested range: done */
13466 		return;
13467 	}
13468 	if (start >= pre_nested_end) {
13469 		/* fully after pre-nested range: done */
13470 		return;
13471 	}
13472 	/* ignore parts of range outside of pre_nested range */
13473 	if (start < pre_nested_start) {
13474 		start = pre_nested_start;
13475 	}
13476 	if (end > pre_nested_end) {
13477 		end = pre_nested_end;
13478 	}
13479 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13480 	start_unnest = start & ~nesting_mask;
13481 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13482 	kr = pmap_unnest(new_pmap,
13483 	    (addr64_t)start_unnest,
13484 	    (uint64_t)(end_unnest - start_unnest));
13485 #if PMAP_FORK_NEST_DEBUG
13486 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13487 #endif /* PMAP_FORK_NEST_DEBUG */
13488 	assertf(kr == KERN_SUCCESS,
13489 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13490 	    (uint64_t)start, (uint64_t)end, new_pmap,
13491 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13492 	    kr);
13493 }
13494 #endif /* PMAP_FORK_NEST */
13495 
13496 void
13497 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13498 {
13499 	new_map->size_limit = old_map->size_limit;
13500 	new_map->data_limit = old_map->data_limit;
13501 	new_map->user_wire_limit = old_map->user_wire_limit;
13502 	new_map->reserved_regions = old_map->reserved_regions;
13503 }
13504 
13505 /*
13506  *	vm_map_fork:
13507  *
13508  *	Create and return a new map based on the old
13509  *	map, according to the inheritance values on the
13510  *	regions in that map and the options.
13511  *
13512  *	The source map must not be locked.
13513  */
13514 vm_map_t
13515 vm_map_fork(
13516 	ledger_t        ledger,
13517 	vm_map_t        old_map,
13518 	int             options)
13519 {
13520 	pmap_t          new_pmap;
13521 	vm_map_t        new_map;
13522 	vm_map_entry_t  old_entry;
13523 	vm_map_size_t   new_size = 0, entry_size;
13524 	vm_map_entry_t  new_entry;
13525 	boolean_t       src_needs_copy;
13526 	boolean_t       new_entry_needs_copy;
13527 	boolean_t       pmap_is64bit;
13528 	int             vm_map_copyin_flags;
13529 	vm_inherit_t    old_entry_inheritance;
13530 	int             map_create_options;
13531 	kern_return_t   footprint_collect_kr;
13532 
13533 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13534 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13535 	    VM_MAP_FORK_CORPSE_FOOTPRINT |
13536 	    VM_MAP_FORK_SHARE_IF_OWNED)) {
13537 		/* unsupported option */
13538 		return VM_MAP_NULL;
13539 	}
13540 
13541 	pmap_is64bit =
13542 #if defined(__i386__) || defined(__x86_64__)
13543 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13544 #elif defined(__arm64__)
13545 	    old_map->pmap->is_64bit;
13546 #else
13547 #error Unknown architecture.
13548 #endif
13549 
13550 	unsigned int pmap_flags = 0;
13551 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13552 #if defined(HAS_APPLE_PAC)
13553 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13554 #endif
13555 #if CONFIG_ROSETTA
13556 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13557 #endif
13558 #if PMAP_CREATE_FORCE_4K_PAGES
13559 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13560 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13561 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13562 	}
13563 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13564 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13565 	if (new_pmap == NULL) {
13566 		return VM_MAP_NULL;
13567 	}
13568 
13569 	vm_map_reference(old_map);
13570 	vm_map_lock(old_map);
13571 
13572 	/* Note that we're creating a map out of fork() */
13573 	map_create_options = VM_MAP_CREATE_VIA_FORK;
13574 	if (old_map->hdr.entries_pageable) {
13575 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13576 	}
13577 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13578 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13579 		footprint_collect_kr = KERN_SUCCESS;
13580 	}
13581 	new_map = vm_map_create_options(new_pmap,
13582 	    old_map->min_offset,
13583 	    old_map->max_offset,
13584 	    map_create_options);
13585 
13586 	/* Inherit our parent's ID. */
13587 	vm_map_assign_serial(new_map, old_map->serial_id);
13588 
13589 	/* inherit cs_enforcement */
13590 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13591 
13592 	vm_map_lock(new_map);
13593 	vm_commit_pagezero_status(new_map);
13594 	/* inherit the parent map's page size */
13595 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13596 
13597 	/* inherit the parent rlimits */
13598 	vm_map_inherit_limits(new_map, old_map);
13599 
13600 #if CONFIG_MAP_RANGES
13601 	/* inherit the parent map's VM ranges */
13602 	vm_map_range_fork(new_map, old_map);
13603 #endif
13604 
13605 #if CODE_SIGNING_MONITOR
13606 	/* Prepare the monitor for the fork */
13607 	csm_fork_prepare(old_map->pmap, new_pmap);
13608 #endif
13609 
13610 #if PMAP_FORK_NEST
13611 	/*
13612 	 * Pre-nest the shared region's pmap.
13613 	 */
13614 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13615 	pmap_fork_nest(old_map->pmap, new_pmap,
13616 	    &pre_nested_start, &pre_nested_end);
13617 #if PMAP_FORK_NEST_DEBUG
13618 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13619 #endif /* PMAP_FORK_NEST_DEBUG */
13620 #endif /* PMAP_FORK_NEST */
13621 
13622 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13623 		/*
13624 		 * Abort any corpse collection if the system is shutting down.
13625 		 */
13626 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13627 		    get_system_inshutdown()) {
13628 #if PMAP_FORK_NEST
13629 			new_entry = vm_map_last_entry(new_map);
13630 			if (new_entry == vm_map_to_entry(new_map)) {
13631 				/* unnest all that was pre-nested */
13632 				vm_map_fork_unnest(new_pmap,
13633 				    pre_nested_start, pre_nested_end,
13634 				    vm_map_min(new_map), vm_map_max(new_map));
13635 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13636 				/* unnest hole at the end, if pre-nested */
13637 				vm_map_fork_unnest(new_pmap,
13638 				    pre_nested_start, pre_nested_end,
13639 				    new_entry->vme_end, vm_map_max(new_map));
13640 			}
13641 #endif /* PMAP_FORK_NEST */
13642 			vm_map_corpse_footprint_collect_done(new_map);
13643 			vm_map_unlock(new_map);
13644 			vm_map_unlock(old_map);
13645 			vm_map_deallocate(new_map);
13646 			vm_map_deallocate(old_map);
13647 			printf("Aborting corpse map due to system shutdown\n");
13648 			return VM_MAP_NULL;
13649 		}
13650 
13651 		entry_size = old_entry->vme_end - old_entry->vme_start;
13652 
13653 #if PMAP_FORK_NEST
13654 		/*
13655 		 * Undo any unnecessary pre-nesting.
13656 		 */
13657 		vm_map_offset_t prev_end;
13658 		if (old_entry == vm_map_first_entry(old_map)) {
13659 			prev_end = vm_map_min(old_map);
13660 		} else {
13661 			prev_end = old_entry->vme_prev->vme_end;
13662 		}
13663 		if (prev_end < old_entry->vme_start) {
13664 			/* unnest hole before this entry, if pre-nested */
13665 			vm_map_fork_unnest(new_pmap,
13666 			    pre_nested_start, pre_nested_end,
13667 			    prev_end, old_entry->vme_start);
13668 		}
13669 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13670 			/* keep this entry nested in the child */
13671 #if PMAP_FORK_NEST_DEBUG
13672 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13673 #endif /* PMAP_FORK_NEST_DEBUG */
13674 		} else {
13675 			/* undo nesting for this entry, if pre-nested */
13676 			vm_map_fork_unnest(new_pmap,
13677 			    pre_nested_start, pre_nested_end,
13678 			    old_entry->vme_start, old_entry->vme_end);
13679 		}
13680 #endif /* PMAP_FORK_NEST */
13681 
13682 		old_entry_inheritance = old_entry->inheritance;
13683 
13684 		/*
13685 		 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13686 		 * share VM_INHERIT_NONE entries that are not backed by a
13687 		 * device pager.
13688 		 */
13689 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13690 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13691 		    (old_entry->protection & VM_PROT_READ) &&
13692 		    !(!old_entry->is_sub_map &&
13693 		    VME_OBJECT(old_entry) != NULL &&
13694 		    VME_OBJECT(old_entry)->pager != NULL &&
13695 		    is_device_pager_ops(
13696 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13697 			old_entry_inheritance = VM_INHERIT_SHARE;
13698 		}
13699 		if (old_entry_inheritance == VM_INHERIT_COPY &&
13700 		    (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13701 		    !old_entry->is_sub_map &&
13702 		    VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13703 			vm_object_t object;
13704 			task_t owner;
13705 			object = VME_OBJECT(old_entry);
13706 			owner = VM_OBJECT_OWNER(object);
13707 			if (owner != TASK_NULL &&
13708 			    owner->map == old_map) {
13709 				/*
13710 				 * This mapping points at a VM object owned
13711 				 * by the task being forked.
13712 				 * Some tools reporting memory accounting
13713 				 * info rely on the object ID, so share this
13714 				 * mapping instead of copying, to make the
13715 				 * corpse look exactly like the original
13716 				 * task in that respect.
13717 				 */
13718 				assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13719 				old_entry_inheritance = VM_INHERIT_SHARE;
13720 			}
13721 		}
13722 
13723 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13724 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13725 		    footprint_collect_kr == KERN_SUCCESS) {
13726 			/*
13727 			 * The corpse won't have old_map->pmap to query
13728 			 * footprint information, so collect that data now
13729 			 * and store it in new_map->vmmap_corpse_footprint
13730 			 * for later autopsy.
13731 			 */
13732 			footprint_collect_kr =
13733 			    vm_map_corpse_footprint_collect(old_map,
13734 			    old_entry,
13735 			    new_map);
13736 		}
13737 
13738 		switch (old_entry_inheritance) {
13739 		case VM_INHERIT_NONE:
13740 			break;
13741 
13742 		case VM_INHERIT_SHARE:
13743 			vm_map_fork_share(old_map, old_entry, new_map);
13744 			new_size += entry_size;
13745 			break;
13746 
13747 		case VM_INHERIT_COPY:
13748 
13749 			/*
13750 			 *	Inline the copy_quickly case;
13751 			 *	upon failure, fall back on a call
13752 			 *	to vm_map_fork_copy.
13753 			 */
13754 
13755 			if (old_entry->is_sub_map) {
13756 				break;
13757 			}
13758 			if ((old_entry->wired_count != 0) ||
13759 			    ((VME_OBJECT(old_entry) != NULL) &&
13760 			    (VME_OBJECT(old_entry)->true_share))) {
13761 				goto slow_vm_map_fork_copy;
13762 			}
13763 
13764 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13765 			vm_map_entry_copy(old_map, new_entry, old_entry);
13766 			if (old_entry->vme_permanent) {
13767 				/* inherit "permanent" on fork() */
13768 				new_entry->vme_permanent = TRUE;
13769 			}
13770 
13771 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13772 				new_map->jit_entry_exists = TRUE;
13773 			}
13774 
13775 			if (new_entry->is_sub_map) {
13776 				/* clear address space specifics */
13777 				new_entry->use_pmap = FALSE;
13778 			} else {
13779 				/*
13780 				 * We're dealing with a copy-on-write operation,
13781 				 * so the resulting mapping should not inherit
13782 				 * the original mapping's accounting settings.
13783 				 * "iokit_acct" should have been cleared in
13784 				 * vm_map_entry_copy().
13785 				 * "use_pmap" should be reset to its default
13786 				 * (TRUE) so that the new mapping gets
13787 				 * accounted for in the task's memory footprint.
13788 				 */
13789 				assert(!new_entry->iokit_acct);
13790 				new_entry->use_pmap = TRUE;
13791 			}
13792 
13793 			if (!vm_object_copy_quickly(
13794 				    VME_OBJECT(new_entry),
13795 				    VME_OFFSET(old_entry),
13796 				    (old_entry->vme_end -
13797 				    old_entry->vme_start),
13798 				    &src_needs_copy,
13799 				    &new_entry_needs_copy)) {
13800 				vm_map_entry_dispose(new_entry);
13801 				goto slow_vm_map_fork_copy;
13802 			}
13803 
13804 			/*
13805 			 *	Handle copy-on-write obligations
13806 			 */
13807 
13808 			if (src_needs_copy && !old_entry->needs_copy) {
13809 				vm_prot_t prot;
13810 
13811 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13812 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13813 					    __FUNCTION__,
13814 					    old_map, old_map->pmap, old_entry,
13815 					    (uint64_t)old_entry->vme_start,
13816 					    (uint64_t)old_entry->vme_end,
13817 					    old_entry->protection);
13818 				}
13819 
13820 				prot = old_entry->protection & ~VM_PROT_WRITE;
13821 
13822 				if (override_nx(old_map, VME_ALIAS(old_entry))
13823 				    && prot) {
13824 					prot |= VM_PROT_EXECUTE;
13825 				}
13826 
13827 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13828 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13829 					    __FUNCTION__,
13830 					    old_map, old_map->pmap, old_entry,
13831 					    (uint64_t)old_entry->vme_start,
13832 					    (uint64_t)old_entry->vme_end,
13833 					    prot);
13834 				}
13835 
13836 				vm_object_pmap_protect(
13837 					VME_OBJECT(old_entry),
13838 					VME_OFFSET(old_entry),
13839 					(old_entry->vme_end -
13840 					old_entry->vme_start),
13841 					((old_entry->is_shared
13842 					|| old_map->mapped_in_other_pmaps)
13843 					? PMAP_NULL :
13844 					old_map->pmap),
13845 					VM_MAP_PAGE_SIZE(old_map),
13846 					old_entry->vme_start,
13847 					prot);
13848 
13849 				assert(old_entry->wired_count == 0);
13850 				old_entry->needs_copy = TRUE;
13851 			}
13852 			new_entry->needs_copy = new_entry_needs_copy;
13853 
13854 			/*
13855 			 *	Insert the entry at the end
13856 			 *	of the map.
13857 			 */
13858 
13859 			vm_map_store_entry_link(new_map,
13860 			    vm_map_last_entry(new_map),
13861 			    new_entry,
13862 			    VM_MAP_KERNEL_FLAGS_NONE);
13863 			new_size += entry_size;
13864 			break;
13865 
13866 slow_vm_map_fork_copy:
13867 			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13868 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13869 				vm_map_copyin_flags |=
13870 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13871 			}
13872 			if (vm_map_fork_copy(old_map,
13873 			    &old_entry,
13874 			    new_map,
13875 			    vm_map_copyin_flags)) {
13876 				new_size += entry_size;
13877 			}
13878 			continue;
13879 		}
13880 		old_entry = old_entry->vme_next;
13881 	}
13882 
13883 #if PMAP_FORK_NEST
13884 	new_entry = vm_map_last_entry(new_map);
13885 	if (new_entry == vm_map_to_entry(new_map)) {
13886 		/* unnest all that was pre-nested */
13887 		vm_map_fork_unnest(new_pmap,
13888 		    pre_nested_start, pre_nested_end,
13889 		    vm_map_min(new_map), vm_map_max(new_map));
13890 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13891 		/* unnest hole at the end, if pre-nested */
13892 		vm_map_fork_unnest(new_pmap,
13893 		    pre_nested_start, pre_nested_end,
13894 		    new_entry->vme_end, vm_map_max(new_map));
13895 	}
13896 #endif /* PMAP_FORK_NEST */
13897 
13898 #if defined(__arm64__)
13899 	pmap_insert_commpage(new_map->pmap);
13900 #endif /* __arm64__ */
13901 
13902 	new_map->size = new_size;
13903 
13904 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13905 		vm_map_corpse_footprint_collect_done(new_map);
13906 	}
13907 
13908 	/* Propagate JIT entitlement for the pmap layer. */
13909 	if (pmap_get_jit_entitled(old_map->pmap)) {
13910 		/* Tell the pmap that it supports JIT. */
13911 		pmap_set_jit_entitled(new_map->pmap);
13912 	}
13913 
13914 	/* Propagate TPRO settings for the pmap layer */
13915 	if (pmap_get_tpro(old_map->pmap)) {
13916 		/* Tell the pmap that it supports TPRO */
13917 		pmap_set_tpro(new_map->pmap);
13918 	}
13919 
13920 
13921 	vm_map_unlock(new_map);
13922 	vm_map_unlock(old_map);
13923 	vm_map_deallocate(old_map);
13924 
13925 	return new_map;
13926 }
13927 
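/*
 * The inheritance values consumed by vm_map_fork() above are set from
 * userspace with mach_vm_inherit() (or the BSD minherit(2) wrapper).
 * A minimal userspace sketch, assuming two pages starting at "addr"
 * are already mapped in the calling task:
 *
 *	#include <mach/mach.h>
 *
 *	// Shared with the child after fork() (vm_map_fork_share() path).
 *	(void) mach_vm_inherit(mach_task_self(), addr, PAGE_SIZE,
 *	    VM_INHERIT_SHARE);
 *	// Not mapped at all in the child (VM_INHERIT_NONE is skipped by
 *	// the loop above, unless a corpse fork asks to share it read-only).
 *	(void) mach_vm_inherit(mach_task_self(), addr + PAGE_SIZE,
 *	    PAGE_SIZE, VM_INHERIT_NONE);
 *	// VM_INHERIT_COPY (the default for anonymous memory) gives the
 *	// child its own copy-on-write copy via the VM_INHERIT_COPY case.
 */
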
13928 /*
13929  * vm_map_exec:
13930  *
13931  *      Set up the "new_map" with the proper execution environment according
13932  *	to the type of executable (platform, 64bit, chroot environment).
13933  *	Map the comm page and shared region, etc...
13934  */
13935 kern_return_t
13936 vm_map_exec(
13937 	vm_map_t        new_map,
13938 	task_t          task,
13939 	boolean_t       is64bit,
13940 	void            *fsroot,
13941 	cpu_type_t      cpu,
13942 	cpu_subtype_t   cpu_subtype,
13943 	boolean_t       reslide,
13944 	boolean_t       is_driverkit,
13945 	uint32_t        rsr_version)
13946 {
13947 	SHARED_REGION_TRACE_DEBUG(
13948 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13949 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13950 		(void *)VM_KERNEL_ADDRPERM(new_map),
13951 		(void *)VM_KERNEL_ADDRPERM(task),
13952 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13953 		cpu,
13954 		cpu_subtype));
13955 	(void) vm_commpage_enter(new_map, task, is64bit);
13956 
13957 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13958 
13959 	SHARED_REGION_TRACE_DEBUG(
13960 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13961 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13962 		(void *)VM_KERNEL_ADDRPERM(new_map),
13963 		(void *)VM_KERNEL_ADDRPERM(task),
13964 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13965 		cpu,
13966 		cpu_subtype));
13967 
13968 	/*
13969 	 * Some devices have region(s) of memory that shouldn't get allocated by
13970 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13971 	 * of the regions that need to be reserved to prevent any allocations in
13972 	 * those regions.
13973 	 */
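	/*
	 * Each reserved region is entered as a fixed, permanent mapping with
	 * no backing object and VM_PROT_NONE for both the current and maximum
	 * protection; vmkf_beyond_max allows the entry to be placed even
	 * beyond the map's normal maximum offset.
	 */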
13974 	kern_return_t kr = KERN_FAILURE;
13975 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13976 	vmk_flags.vmkf_beyond_max = true;
13977 
13978 	const struct vm_reserved_region *regions = NULL;
13979 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13980 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13981 
13982 	for (size_t i = 0; i < num_regions; ++i) {
13983 		vm_map_offset_t address = regions[i].vmrr_addr;
13984 
13985 		kr = vm_map_enter(
13986 			new_map,
13987 			&address,
13988 			regions[i].vmrr_size,
13989 			(vm_map_offset_t)0,
13990 			vmk_flags,
13991 			VM_OBJECT_NULL,
13992 			(vm_object_offset_t)0,
13993 			FALSE,
13994 			VM_PROT_NONE,
13995 			VM_PROT_NONE,
13996 			VM_INHERIT_COPY);
13997 
13998 		if (kr != KERN_SUCCESS) {
13999 			os_log_error(OS_LOG_DEFAULT, "Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
14000 			return KERN_FAILURE;
14001 		}
14002 	}
14003 
14004 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
14005 
14006 	return KERN_SUCCESS;
14007 }
14008 
14009 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
14010 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
14011 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
14012 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
14013 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
14014 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
14015 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
14016 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
14017 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
14018 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
14019 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
14020 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
14021 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
14022 /*
14023  *	vm_map_lookup_and_lock_object:
14024  *
14025  *	Finds the VM object, offset, and
14026  *	protection for a given virtual address in the
14027  *	specified map, assuming a page fault of the
14028  *	type specified.
14029  *
14030  *	Returns the (object, offset, protection) for
14031  *	this address, whether it is wired down, and whether
14032  *	this map has the only reference to the data in question.
14033  *	In order to later verify this lookup, a "version"
14034  *	is returned.
14035  *	If contended != NULL, *contended will be set to
14036  *	true iff the thread had to spin or block to acquire
14037  *	an exclusive lock.
14038  *
14039  *	The map MUST be locked by the caller and WILL be
14040  *	locked on exit.  In order to guarantee the
14041  *	existence of the returned object, it is returned
14042  *	locked.
14043  *
14044  *	If a lookup is requested with "write protection"
14045  *	specified, the map may be changed to perform virtual
14046  *	copying operations, although the data referenced will
14047  *	remain the same.
14048  *
14049  *  If fault_info is provided, then the information is
14050  *  initialized according to the properties of the map entry.
14051  *  NB: only properties of the entry are initialized,
14052  *  namely:
14053  *    - user_tag
14054  *    - pmap_options
14055  *    - iokit_acct
14056  *    - behavior
14057  *    - lo_offset
14058  *    - hi_offset
14059  *    - no_cache
14060  *    - cs_bypass
14061  *    - csm_associated
14062  *    - resilient_media
14063  *    - vme_xnu_user_debug
14064  *    - vme_no_copy_on_read
14065  *    - used_for_tpro
14066  */
14067 kern_return_t
14068 vm_map_lookup_and_lock_object(
14069 	vm_map_t                *var_map,       /* IN/OUT */
14070 	vm_map_offset_t         vaddr,
14071 	vm_prot_t               fault_type,
14072 	int                     object_lock_type,
14073 	vm_map_version_t        *out_version,   /* OUT */
14074 	vm_object_t             *object,        /* OUT */
14075 	vm_object_offset_t      *offset,        /* OUT */
14076 	vm_prot_t               *out_prot,      /* OUT */
14077 	boolean_t               *wired,         /* OUT */
14078 	vm_object_fault_info_t  fault_info,     /* OUT */
14079 	vm_map_t                *real_map,      /* OUT */
14080 	bool                    *contended)     /* OUT */
14081 {
14082 	vm_map_entry_t                  entry;
14083 	vm_map_t                        map = *var_map;
14084 	vm_map_t                        old_map = *var_map;
14085 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
14086 	vm_map_offset_t                 cow_parent_vaddr = 0;
14087 	vm_map_offset_t                 old_start = 0;
14088 	vm_map_offset_t                 old_end = 0;
14089 	vm_prot_t                       prot;
14090 	boolean_t                       mask_protections;
14091 	boolean_t                       force_copy;
14092 	boolean_t                       no_force_copy_if_executable;
14093 	boolean_t                       submap_needed_copy;
14094 	vm_prot_t                       original_fault_type;
14095 	vm_map_size_t                   fault_page_mask;
14096 
14097 	/*
14098 	 * VM_PROT_IS_MASK means that the caller wants us to use "fault_type"
14099 	 * as a mask against the mapping's actual protections, not as an
14100 	 * absolute value.
14101 	 */
14102 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
14103 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
14104 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
14105 	fault_type &= VM_PROT_ALL;
14106 	original_fault_type = fault_type;
14107 	if (contended) {
14108 		*contended = false;
14109 	}
14110 
14111 	*real_map = map;
14112 
14113 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
14114 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
14115 
14116 RetryLookup:
14117 	fault_type = original_fault_type;
14118 
14119 	/*
14120 	 *	If the map has an interesting hint, try it before calling
14121 	 *	full blown lookup routine.
14122 	 *	the full-blown lookup routine.
14123 	entry = map->hint;
14124 
14125 	if ((entry == vm_map_to_entry(map)) ||
14126 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
14127 		vm_map_entry_t  tmp_entry;
14128 
14129 		/*
14130 		 *	Entry was either not a valid hint, or the vaddr
14131 		 *	was not contained in the entry, so do a full lookup.
14132 		 */
14133 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
14134 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14135 				vm_map_unlock(cow_sub_map_parent);
14136 			}
14137 			if ((*real_map != map)
14138 			    && (*real_map != cow_sub_map_parent)) {
14139 				vm_map_unlock(*real_map);
14140 			}
14141 			return KERN_INVALID_ADDRESS;
14142 		}
14143 
14144 		entry = tmp_entry;
14145 	}
14146 	if (map == old_map) {
14147 		old_start = entry->vme_start;
14148 		old_end = entry->vme_end;
14149 	}
14150 
14151 	/*
14152 	 *	Handle submaps.  Drop lock on upper map, submap is
14153 	 *	returned locked.
14154 	 */
14155 
14156 	submap_needed_copy = FALSE;
14157 submap_recurse:
14158 	if (entry->is_sub_map) {
14159 		vm_map_offset_t         local_vaddr;
14160 		vm_map_offset_t         end_delta;
14161 		vm_map_offset_t         start_delta;
14162 		vm_map_offset_t         top_entry_saved_start;
14163 		vm_object_offset_t      top_entry_saved_offset;
14164 		vm_map_entry_t          submap_entry, saved_submap_entry;
14165 		vm_object_offset_t      submap_entry_offset;
14166 		vm_object_size_t        submap_entry_size;
14167 		vm_prot_t               subentry_protection;
14168 		vm_prot_t               subentry_max_protection;
14169 		boolean_t               subentry_no_copy_on_read;
14170 		boolean_t               subentry_permanent;
14171 		boolean_t               subentry_csm_associated;
14172 #if __arm64e__
14173 		boolean_t               subentry_used_for_tpro;
14174 #endif /* __arm64e__ */
14175 		boolean_t               mapped_needs_copy = FALSE;
14176 		vm_map_version_t        version;
14177 
14178 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
14179 		    "map %p (%d) entry %p submap %p (%d)\n",
14180 		    map, VM_MAP_PAGE_SHIFT(map), entry,
14181 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14182 
14183 		local_vaddr = vaddr;
14184 		top_entry_saved_start = entry->vme_start;
14185 		top_entry_saved_offset = VME_OFFSET(entry);
14186 
14187 		if ((entry->use_pmap &&
14188 		    !((fault_type & VM_PROT_WRITE) ||
14189 		    force_copy))) {
14190 			/* if real_map equals map we unlock below */
14191 			if ((*real_map != map) &&
14192 			    (*real_map != cow_sub_map_parent)) {
14193 				vm_map_unlock(*real_map);
14194 			}
14195 			*real_map = VME_SUBMAP(entry);
14196 		}
14197 
14198 		if (entry->needs_copy &&
14199 		    ((fault_type & VM_PROT_WRITE) ||
14200 		    force_copy)) {
14201 			if (!mapped_needs_copy) {
14202 				if (vm_map_lock_read_to_write(map)) {
14203 					vm_map_lock_read(map);
14204 					*real_map = map;
14205 					goto RetryLookup;
14206 				}
14207 				vm_map_lock_read(VME_SUBMAP(entry));
14208 				*var_map = VME_SUBMAP(entry);
14209 				cow_sub_map_parent = map;
14210 				/* reset base to map before cow object */
14211 				/* this is the map which will accept   */
14212 				/* the new cow object */
14213 				old_start = entry->vme_start;
14214 				old_end = entry->vme_end;
14215 				cow_parent_vaddr = vaddr;
14216 				mapped_needs_copy = TRUE;
14217 			} else {
14218 				vm_map_lock_read(VME_SUBMAP(entry));
14219 				*var_map = VME_SUBMAP(entry);
14220 				if ((cow_sub_map_parent != map) &&
14221 				    (*real_map != map)) {
14222 					vm_map_unlock(map);
14223 				}
14224 			}
14225 		} else {
14226 			if (entry->needs_copy) {
14227 				submap_needed_copy = TRUE;
14228 			}
14229 			vm_map_lock_read(VME_SUBMAP(entry));
14230 			*var_map = VME_SUBMAP(entry);
14231 			/* leave map locked if it is a target  */
14232 			/* cow sub_map above; otherwise, just  */
14233 			/* follow the maps down to the object. */
14234 			/* Here we unlock, knowing we are not  */
14235 			/* revisiting the map.  */
14236 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
14237 				vm_map_unlock_read(map);
14238 			}
14239 		}
14240 
14241 		entry = NULL;
14242 		map = *var_map;
14243 
14244 		/* calculate the offset in the submap for vaddr */
14245 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14246 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14247 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14248 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14249 
14250 RetrySubMap:
14251 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14252 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14253 				vm_map_unlock(cow_sub_map_parent);
14254 			}
14255 			if ((*real_map != map)
14256 			    && (*real_map != cow_sub_map_parent)) {
14257 				vm_map_unlock(*real_map);
14258 			}
14259 			*real_map = map;
14260 			return KERN_INVALID_ADDRESS;
14261 		}
14262 
14263 		/* find the attenuated shadow of the underlying object */
14264 		/* on our target map */
14265 
14266 		/* In English: the submap object may extend beyond the    */
14267 		/* region mapped by the entry, or may only fill a portion */
14268 		/* of it.  For our purposes, we only care if the object   */
14269 		/* doesn't fill it.  In this case the area which will     */
14270 		/* ultimately be clipped in the top map will only need    */
14271 		/* to be as big as the portion of the underlying entry    */
14272 		/* which is mapped */
14273 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14274 		    submap_entry->vme_start - top_entry_saved_offset : 0;
14275 
14276 		end_delta =
14277 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14278 		    submap_entry->vme_end ?
14279 		    0 : (top_entry_saved_offset +
14280 		    (old_end - old_start))
14281 		    - submap_entry->vme_end;
14282 
14283 		old_start += start_delta;
14284 		old_end -= end_delta;
14285 
14286 		if (submap_entry->is_sub_map) {
14287 			entry = submap_entry;
14288 			vaddr = local_vaddr;
14289 			goto submap_recurse;
14290 		}
14291 
14292 		if (((fault_type & VM_PROT_WRITE) ||
14293 		    force_copy)
14294 		    && cow_sub_map_parent) {
14295 			vm_object_t     sub_object, copy_object;
14296 			vm_object_offset_t copy_offset;
14297 			vm_map_offset_t local_start;
14298 			vm_map_offset_t local_end;
14299 			boolean_t       object_copied = FALSE;
14300 			vm_object_offset_t object_copied_offset = 0;
14301 			boolean_t       object_copied_needs_copy = FALSE;
14302 			kern_return_t   kr = KERN_SUCCESS;
14303 
14304 			if (vm_map_lock_read_to_write(map)) {
14305 				vm_map_lock_read(map);
14306 				old_start -= start_delta;
14307 				old_end += end_delta;
14308 				goto RetrySubMap;
14309 			}
14310 
14311 
14312 			sub_object = VME_OBJECT(submap_entry);
14313 			if (sub_object == VM_OBJECT_NULL) {
14314 				sub_object =
14315 				    vm_object_allocate(
14316 					(vm_map_size_t)
14317 					(submap_entry->vme_end -
14318 					submap_entry->vme_start), map->serial_id);
14319 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14320 				VME_OFFSET_SET(submap_entry, 0);
14321 				assert(!submap_entry->is_sub_map);
14322 				assert(submap_entry->use_pmap);
14323 			}
14324 			local_start =  local_vaddr -
14325 			    (cow_parent_vaddr - old_start);
14326 			local_end = local_vaddr +
14327 			    (old_end - cow_parent_vaddr);
14328 			vm_map_clip_start(map, submap_entry, local_start);
14329 			vm_map_clip_end(map, submap_entry, local_end);
14330 			if (submap_entry->is_sub_map) {
14331 				/* unnesting was done when clipping */
14332 				assert(!submap_entry->use_pmap);
14333 			}
14334 
14335 			/* This is the COW case, let's connect */
14336 			/* an entry in our space to the underlying */
14337 			/* object in the submap, bypassing the  */
14338 			/* submap. */
14339 			submap_entry_offset = VME_OFFSET(submap_entry);
14340 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14341 
14342 			if ((submap_entry->wired_count != 0 ||
14343 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14344 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
14345 			    no_force_copy_if_executable) {
14346 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14347 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14348 					vm_map_unlock(cow_sub_map_parent);
14349 				}
14350 				if ((*real_map != map)
14351 				    && (*real_map != cow_sub_map_parent)) {
14352 					vm_map_unlock(*real_map);
14353 				}
14354 				*real_map = map;
14355 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14356 				vm_map_lock_write_to_read(map);
14357 				kr = KERN_PROTECTION_FAILURE;
14358 				DTRACE_VM4(submap_no_copy_executable,
14359 				    vm_map_t, map,
14360 				    vm_object_offset_t, submap_entry_offset,
14361 				    vm_object_size_t, submap_entry_size,
14362 				    int, kr);
14363 				return kr;
14364 			}
14365 
14366 			if (submap_entry->wired_count != 0) {
14367 				vm_object_reference(sub_object);
14368 
14369 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14370 				    "submap_entry %p offset 0x%llx\n",
14371 				    submap_entry, VME_OFFSET(submap_entry));
14372 
14373 				DTRACE_VM6(submap_copy_slowly,
14374 				    vm_map_t, cow_sub_map_parent,
14375 				    vm_map_offset_t, vaddr,
14376 				    vm_map_t, map,
14377 				    vm_object_size_t, submap_entry_size,
14378 				    int, submap_entry->wired_count,
14379 				    int, sub_object->copy_strategy);
14380 
14381 				saved_submap_entry = submap_entry;
14382 				version.main_timestamp = map->timestamp;
14383 				vm_map_unlock(map); /* Increments timestamp by 1 */
14384 				submap_entry = VM_MAP_ENTRY_NULL;
14385 
14386 				vm_object_lock(sub_object);
14387 				kr = vm_object_copy_slowly(sub_object,
14388 				    submap_entry_offset,
14389 				    submap_entry_size,
14390 				    FALSE, /* interruptible */
14391 				    &copy_object);
14392 				object_copied = TRUE;
14393 				object_copied_offset = 0;
14394 				/* 4k: account for extra offset in physical page */
14395 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14396 				object_copied_needs_copy = FALSE;
14397 				vm_object_deallocate(sub_object);
14398 
14399 				vm_map_lock(map);
14400 
14401 				if (kr != KERN_SUCCESS &&
14402 				    kr != KERN_MEMORY_RESTART_COPY) {
14403 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14404 						vm_map_unlock(cow_sub_map_parent);
14405 					}
14406 					if ((*real_map != map)
14407 					    && (*real_map != cow_sub_map_parent)) {
14408 						vm_map_unlock(*real_map);
14409 					}
14410 					*real_map = map;
14411 					vm_object_deallocate(copy_object);
14412 					copy_object = VM_OBJECT_NULL;
14413 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14414 					vm_map_lock_write_to_read(map);
14415 					DTRACE_VM4(submap_copy_error_slowly,
14416 					    vm_object_t, sub_object,
14417 					    vm_object_offset_t, submap_entry_offset,
14418 					    vm_object_size_t, submap_entry_size,
14419 					    int, kr);
14420 					vm_map_lookup_and_lock_object_copy_slowly_error++;
14421 					return kr;
14422 				}
14423 
14424 				if ((kr == KERN_SUCCESS) &&
14425 				    (version.main_timestamp + 1) == map->timestamp) {
14426 					submap_entry = saved_submap_entry;
14427 				} else {
14428 					saved_submap_entry = NULL;
14429 					old_start -= start_delta;
14430 					old_end += end_delta;
14431 					vm_object_deallocate(copy_object);
14432 					copy_object = VM_OBJECT_NULL;
14433 					vm_map_lock_write_to_read(map);
14434 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
14435 					goto RetrySubMap;
14436 				}
14437 				vm_map_lookup_and_lock_object_copy_slowly_count++;
14438 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14439 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14440 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14441 				}
14442 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14443 				submap_entry_offset = VME_OFFSET(submap_entry);
14444 				copy_object = VM_OBJECT_NULL;
14445 				object_copied_offset = submap_entry_offset;
14446 				object_copied_needs_copy = FALSE;
14447 				DTRACE_VM6(submap_copy_strategically,
14448 				    vm_map_t, cow_sub_map_parent,
14449 				    vm_map_offset_t, vaddr,
14450 				    vm_map_t, map,
14451 				    vm_object_size_t, submap_entry_size,
14452 				    int, submap_entry->wired_count,
14453 				    int, sub_object->copy_strategy);
14454 				kr = vm_object_copy_strategically(
14455 					sub_object,
14456 					submap_entry_offset,
14457 					submap_entry->vme_end - submap_entry->vme_start,
14458 					false, /* forking */
14459 					&copy_object,
14460 					&object_copied_offset,
14461 					&object_copied_needs_copy);
14462 				if (kr == KERN_MEMORY_RESTART_COPY) {
14463 					old_start -= start_delta;
14464 					old_end += end_delta;
14465 					vm_object_deallocate(copy_object);
14466 					copy_object = VM_OBJECT_NULL;
14467 					vm_map_lock_write_to_read(map);
14468 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14469 					goto RetrySubMap;
14470 				}
14471 				if (kr != KERN_SUCCESS) {
14472 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14473 						vm_map_unlock(cow_sub_map_parent);
14474 					}
14475 					if ((*real_map != map)
14476 					    && (*real_map != cow_sub_map_parent)) {
14477 						vm_map_unlock(*real_map);
14478 					}
14479 					*real_map = map;
14480 					vm_object_deallocate(copy_object);
14481 					copy_object = VM_OBJECT_NULL;
14482 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14483 					vm_map_lock_write_to_read(map);
14484 					DTRACE_VM4(submap_copy_error_strategically,
14485 					    vm_object_t, sub_object,
14486 					    vm_object_offset_t, submap_entry_offset,
14487 					    vm_object_size_t, submap_entry_size,
14488 					    int, kr);
14489 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14490 					return kr;
14491 				}
14492 				assert(copy_object != VM_OBJECT_NULL);
14493 				assert(copy_object != sub_object);
14494 				object_copied = TRUE;
14495 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14496 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14497 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14498 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14499 				}
14500 			} else {
14501 				/* set up shadow object */
14502 				object_copied = FALSE;
14503 				copy_object = sub_object;
14504 				vm_object_lock(sub_object);
14505 				vm_object_reference_locked(sub_object);
14506 				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14507 				vm_object_unlock(sub_object);
14508 
14509 				assert(submap_entry->wired_count == 0);
14510 				submap_entry->needs_copy = TRUE;
14511 
14512 				prot = submap_entry->protection;
14513 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14514 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14515 					    __FUNCTION__,
14516 					    map, map->pmap, submap_entry,
14517 					    (uint64_t)submap_entry->vme_start,
14518 					    (uint64_t)submap_entry->vme_end,
14519 					    prot);
14520 				}
14521 				prot = prot & ~VM_PROT_WRITE;
14522 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14523 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14524 					    __FUNCTION__,
14525 					    map, map->pmap, submap_entry,
14526 					    (uint64_t)submap_entry->vme_start,
14527 					    (uint64_t)submap_entry->vme_end,
14528 					    prot);
14529 				}
14530 
14531 				if (override_nx(old_map,
14532 				    VME_ALIAS(submap_entry))
14533 				    && prot) {
14534 					prot |= VM_PROT_EXECUTE;
14535 				}
14536 
14537 				vm_object_pmap_protect(
14538 					sub_object,
14539 					VME_OFFSET(submap_entry),
14540 					submap_entry->vme_end -
14541 					submap_entry->vme_start,
14542 					(submap_entry->is_shared
14543 					|| map->mapped_in_other_pmaps) ?
14544 					PMAP_NULL : map->pmap,
14545 					VM_MAP_PAGE_SIZE(map),
14546 					submap_entry->vme_start,
14547 					prot);
14548 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14549 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14550 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14551 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14552 				}
14553 			}
14554 
14555 			/*
14556 			 * Adjust the fault offset to the submap entry.
14557 			 */
14558 			copy_offset = (local_vaddr -
14559 			    submap_entry->vme_start +
14560 			    VME_OFFSET(submap_entry));
14561 
14562 			/* This works differently from the */
14563 			/* normal submap case. We go back  */
14564 			/* to the parent of the cow map and */
14565 			/* clip out the target portion of  */
14566 			/* the sub_map, substituting the   */
14567 			/* new copy object.                */
14568 
14569 			subentry_protection = submap_entry->protection;
14570 			subentry_max_protection = submap_entry->max_protection;
14571 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14572 			subentry_permanent = submap_entry->vme_permanent;
14573 			subentry_csm_associated = submap_entry->csm_associated;
14574 #if __arm64e__
14575 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14576 #endif // __arm64e__
14577 			vm_map_unlock(map);
14578 			submap_entry = NULL; /* not valid after map unlock */
14579 
14580 			local_start = old_start;
14581 			local_end = old_end;
14582 			map = cow_sub_map_parent;
14583 			*var_map = cow_sub_map_parent;
14584 			vaddr = cow_parent_vaddr;
14585 			cow_sub_map_parent = NULL;
14586 
14587 			if (!vm_map_lookup_entry(map,
14588 			    vaddr, &entry)) {
14589 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14590 					vm_map_unlock(cow_sub_map_parent);
14591 				}
14592 				if ((*real_map != map)
14593 				    && (*real_map != cow_sub_map_parent)) {
14594 					vm_map_unlock(*real_map);
14595 				}
14596 				*real_map = map;
14597 				vm_object_deallocate(
14598 					copy_object);
14599 				copy_object = VM_OBJECT_NULL;
14600 				vm_map_lock_write_to_read(map);
14601 				DTRACE_VM4(submap_lookup_post_unlock,
14602 				    uint64_t, (uint64_t)entry->vme_start,
14603 				    uint64_t, (uint64_t)entry->vme_end,
14604 				    vm_map_offset_t, vaddr,
14605 				    int, object_copied);
14606 				return KERN_INVALID_ADDRESS;
14607 			}
14608 
14609 			/* clip out the portion of space */
14610 			/* mapped by the sub map which   */
14611 			/* corresponds to the underlying */
14612 			/* object */
14613 
14614 			/*
14615 			 * Clip (and unnest) the smallest nested chunk
14616 			 * possible around the faulting address...
14617 			 */
14618 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14619 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14620 			/*
14621 			 * ... but don't go beyond the "old_start" to "old_end"
14622 			 * range, to avoid spanning over another VM region
14623 			 * with a possibly different VM object and/or offset.
14624 			 */
14625 			if (local_start < old_start) {
14626 				local_start = old_start;
14627 			}
14628 			if (local_end > old_end) {
14629 				local_end = old_end;
14630 			}
14631 			/*
14632 			 * Adjust copy_offset to the start of the range.
14633 			 */
14634 			copy_offset -= (vaddr - local_start);
14635 
14636 			vm_map_clip_start(map, entry, local_start);
14637 			vm_map_clip_end(map, entry, local_end);
14638 			if (entry->is_sub_map) {
14639 				/* unnesting was done when clipping */
14640 				assert(!entry->use_pmap);
14641 			}
14642 
14643 			/* substitute copy object for */
14644 			/* shared map entry           */
14645 			vm_map_deallocate(VME_SUBMAP(entry));
14646 			assert(!entry->iokit_acct);
14647 			entry->use_pmap = TRUE;
14648 			VME_OBJECT_SET(entry, copy_object, false, 0);
14649 
14650 			/* propagate the submap entry's protections */
14651 			if (entry->protection != VM_PROT_READ) {
14652 				/*
14653 				 * Someone has already altered the top entry's
14654 				 * protections via vm_protect(VM_PROT_COPY).
14655 				 * Respect these new values and ignore the
14656 				 * submap entry's protections.
14657 				 */
14658 			} else {
14659 				/*
14660 				 * Regular copy-on-write: propagate the submap
14661 				 * entry's protections to the top map entry.
14662 				 */
14663 				entry->protection |= subentry_protection;
14664 			}
14665 			entry->max_protection |= subentry_max_protection;
14666 			/* propagate some attributes from subentry */
14667 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14668 			entry->vme_permanent = subentry_permanent;
14669 			entry->csm_associated = subentry_csm_associated;
14670 #if __arm64e__
14671 			/* propagate TPRO iff the destination map has TPRO enabled */
14672 			if (subentry_used_for_tpro) {
14673 				if (vm_map_tpro(map)) {
14674 					entry->used_for_tpro = subentry_used_for_tpro;
14675 				} else {
14676 					/* "permanent" came from being TPRO */
14677 					entry->vme_permanent = FALSE;
14678 				}
14679 			}
14680 #endif /* __arm64e */
14681 			if ((entry->protection & VM_PROT_WRITE) &&
14682 			    (entry->protection & VM_PROT_EXECUTE) &&
14683 #if XNU_TARGET_OS_OSX
14684 			    map->pmap != kernel_pmap &&
14685 			    (vm_map_cs_enforcement(map)
14686 #if __arm64__
14687 			    || !VM_MAP_IS_EXOTIC(map)
14688 #endif /* __arm64__ */
14689 			    ) &&
14690 #endif /* XNU_TARGET_OS_OSX */
14691 #if CODE_SIGNING_MONITOR
14692 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14693 #endif
14694 			    !(entry->used_for_jit) &&
14695 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14696 				DTRACE_VM3(cs_wx,
14697 				    uint64_t, (uint64_t)entry->vme_start,
14698 				    uint64_t, (uint64_t)entry->vme_end,
14699 				    vm_prot_t, entry->protection);
14700 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14701 				    proc_selfpid(),
14702 				    (get_bsdtask_info(current_task())
14703 				    ? proc_name_address(get_bsdtask_info(current_task()))
14704 				    : "?"),
14705 				    __FUNCTION__, __LINE__,
14706 #if DEVELOPMENT || DEBUG
14707 				    (uint64_t)entry->vme_start,
14708 				    (uint64_t)entry->vme_end,
14709 #else /* DEVELOPMENT || DEBUG */
14710 				    (uint64_t)0,
14711 				    (uint64_t)0,
14712 #endif /* DEVELOPMENT || DEBUG */
14713 				    entry->protection);
14714 				entry->protection &= ~VM_PROT_EXECUTE;
14715 			}
14716 
14717 			if (object_copied) {
14718 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14719 				entry->needs_copy = object_copied_needs_copy;
14720 				entry->is_shared = FALSE;
14721 			} else {
14722 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14723 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14724 				assert(entry->wired_count == 0);
14725 				VME_OFFSET_SET(entry, copy_offset);
14726 				entry->needs_copy = TRUE;
14727 				if (map != old_map) {
14728 					entry->is_shared = TRUE;
14729 				}
14730 			}
14731 			if (entry->inheritance == VM_INHERIT_SHARE) {
14732 				entry->inheritance = VM_INHERIT_COPY;
14733 			}
14734 
14735 			vm_map_lock_write_to_read(map);
14736 		} else {
14737 			if ((cow_sub_map_parent)
14738 			    && (cow_sub_map_parent != *real_map)
14739 			    && (cow_sub_map_parent != map)) {
14740 				vm_map_unlock(cow_sub_map_parent);
14741 			}
14742 			entry = submap_entry;
14743 			vaddr = local_vaddr;
14744 		}
14745 	}
14746 
14747 	/*
14748 	 *	Check whether this task is allowed to have
14749 	 *	this page.
14750 	 */
14751 
14752 	prot = entry->protection;
14753 
14754 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14755 		/*
14756 		 * HACK -- if not a stack, then allow execution
14757 		 */
14758 		prot |= VM_PROT_EXECUTE;
14759 	}
14760 
14761 #if __arm64e__
14762 	/*
14763 	 * If the entry we're dealing with is TPRO and we have a write
14764 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14765 	 * to maintain RO permissions when not marked as TPRO.
14766 	 */
14767 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14768 		prot |= VM_PROT_WRITE;
14769 	}
14770 #endif /* __arm64e__ */
14771 	if (mask_protections) {
14772 		fault_type &= prot;
14773 		if (fault_type == VM_PROT_NONE) {
14774 			goto protection_failure;
14775 		}
14776 	}
14777 	if (((fault_type & prot) != fault_type)
14778 #if __arm64__
14779 	    /* prefetch abort in execute-only page */
14780 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14781 #elif defined(__x86_64__)
14782 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14783 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14784 #endif
14785 	    ) {
14786 protection_failure:
14787 		if (*real_map != map) {
14788 			vm_map_unlock(*real_map);
14789 		}
14790 		*real_map = map;
14791 
14792 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14793 			log_stack_execution_failure((addr64_t)vaddr, prot);
14794 		}
14795 
14796 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14797 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14798 		/*
14799 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14800 		 *
14801 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14802 		 */
14803 		return KERN_PROTECTION_FAILURE;
14804 	}
14805 
14806 	/*
14807 	 *	If this page is not pageable, we have to get
14808 	 *	it for all possible accesses.
14809 	 */
14810 
14811 	*wired = (entry->wired_count != 0);
14812 	if (*wired) {
14813 		fault_type = prot;
14814 	}
14815 
14816 	/*
14817 	 *	If the entry was copy-on-write, we either ...
14818 	 */
14819 
14820 	if (entry->needs_copy) {
14821 		/*
14822 		 *	If we want to write the page, we may as well
14823 		 *	handle that now since we've got the map locked.
14824 		 *
14825 		 *	If we don't need to write the page, we just
14826 		 *	demote the permissions allowed.
14827 		 */
14828 
14829 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14830 			/*
14831 			 *	Make a new object, and place it in the
14832 			 *	object chain.  Note that no new references
14833 			 *	have appeared -- one just moved from the
14834 			 *	map to the new object.
14835 			 */
14836 
14837 			if (vm_map_lock_read_to_write(map)) {
14838 				vm_map_lock_read(map);
14839 				goto RetryLookup;
14840 			}
14841 
14842 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14843 				vm_object_lock(VME_OBJECT(entry));
14844 				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14845 				vm_object_unlock(VME_OBJECT(entry));
14846 			}
14847 			VME_OBJECT_SHADOW(entry,
14848 			    (vm_map_size_t) (entry->vme_end -
14849 			    entry->vme_start),
14850 			    vm_map_always_shadow(map));
14851 			entry->needs_copy = FALSE;
14852 
14853 			vm_map_lock_write_to_read(map);
14854 		}
14855 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14856 			/*
14857 			 *	We're attempting to read a copy-on-write
14858 			 *	page -- don't allow writes.
14859 			 */
14860 
14861 			prot &= (~VM_PROT_WRITE);
14862 		}
14863 	}
14864 
14865 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14866 		/*
14867 		 * We went through a "needs_copy" submap without triggering
14868 		 * a copy, so granting write access to the page would bypass
14869 		 * that submap's "needs_copy".
14870 		 */
14871 		assert(!(fault_type & VM_PROT_WRITE));
14872 		assert(!*wired);
14873 		assert(!force_copy);
14874 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14875 		prot &= ~VM_PROT_WRITE;
14876 	}
14877 
14878 	/*
14879 	 *	Create an object if necessary.
14880 	 */
14881 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14882 		if (vm_map_lock_read_to_write(map)) {
14883 			vm_map_lock_read(map);
14884 			goto RetryLookup;
14885 		}
14886 
14887 		VME_OBJECT_SET(entry,
14888 		    vm_object_allocate(
14889 			    (vm_map_size_t)(entry->vme_end -
14890 			    entry->vme_start),
14891 			    map->serial_id
14892 			    ), false, 0);
14893 		VME_OFFSET_SET(entry, 0);
14894 		assert(entry->use_pmap);
14895 		vm_map_lock_write_to_read(map);
14896 	}
14897 
14898 	/*
14899 	 *	Return the object/offset from this entry.  If the entry
14900 	 *	was copy-on-write or empty, it has been fixed up.  Also
14901 	 *	return the protection.
14902 	 */
14903 
14904 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14905 	*object = VME_OBJECT(entry);
14906 	*out_prot = prot;
14907 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14908 
14909 	if (fault_info) {
14910 		/*
14911 		 * Initialize fault information according to the entry being faulted
14912 		 * from.
14913 		 */
14914 		fault_info->user_tag = VME_ALIAS(entry);
14915 		fault_info->pmap_options = 0;
14916 		if (entry->iokit_acct ||
14917 		    (!entry->is_sub_map && !entry->use_pmap)) {
14918 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14919 		}
14920 		if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14921 			fault_info->behavior = entry->behavior;
14922 		}
14923 		fault_info->lo_offset = VME_OFFSET(entry);
14924 		fault_info->hi_offset =
14925 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14926 		fault_info->no_cache  = entry->no_cache;
14927 		fault_info->io_sync = FALSE;
14928 		fault_info->cs_bypass = (entry->used_for_jit ||
14929 #if CODE_SIGNING_MONITOR
14930 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14931 #endif
14932 		    entry->vme_resilient_codesign);
14933 		fault_info->mark_zf_absent = FALSE;
14934 		fault_info->batch_pmap_op = FALSE;
14935 		/*
14936 		 * The pmap layer will validate this page
14937 		 * before allowing it to be executed from.
14938 		 */
14939 #if CODE_SIGNING_MONITOR
14940 		fault_info->csm_associated = entry->csm_associated;
14941 #else
14942 		fault_info->csm_associated = FALSE;
14943 #endif
14944 
14945 		fault_info->resilient_media = entry->vme_resilient_media;
14946 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14947 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14948 #if __arm64e__
14949 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14950 #else /* __arm64e__ */
14951 		fault_info->fi_used_for_tpro = FALSE;
14952 #endif
14953 		if (entry->translated_allow_execute) {
14954 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14955 		}
14956 	}
14957 
14958 	/*
14959 	 *	Lock the object to prevent it from disappearing
14960 	 */
14961 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14962 		if (contended == NULL) {
14963 			vm_object_lock(*object);
14964 		} else {
14965 			*contended = vm_object_lock_check_contended(*object);
14966 		}
14967 	} else {
14968 		vm_object_lock_shared(*object);
14969 	}
14970 
14971 	/*
14972 	 *	Save the version number
14973 	 */
14974 
14975 	out_version->main_timestamp = map->timestamp;
14976 
14977 	return KERN_SUCCESS;
14978 }
14979 
14980 
14981 /*
14982  *	vm_map_verify:
14983  *
14984  *	Verifies that the map in question has not changed
14985  *	since the given version. The map has to be locked
14986  *	("shared" mode is fine) before calling this function
14987  *	and it will be returned locked too.
14988  */
14989 boolean_t
14990 vm_map_verify(
14991 	vm_map_t                map,
14992 	vm_map_version_t        *version)       /* REF */
14993 {
14994 	boolean_t       result;
14995 
14996 	vm_map_lock_assert_held(map);
14997 	result = (map->timestamp == version->main_timestamp);
14998 
14999 	return result;
15000 }
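/*
 * Illustrative usage sketch (assumed, not taken from this file):
 * vm_map_verify() pairs with the version saved by the lookup routine
 * above (which stores out_version->main_timestamp = map->timestamp),
 * following the usual lookup / unlock / relock / verify pattern:
 *
 *	vm_map_version_t version;
 *
 *	// lookup fills in "version" while the map is read-locked
 *	// ... drop the map lock, do the slow work ...
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		// the map changed while unlocked: redo the lookup
 *	}
 *
 * The exact callers and locking order are assumptions; the fault
 * handling paths are the authoritative users.
 */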
15001 
15002 
15003 /*
15004  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
15005  *	Goes away after regular vm_region_recurse function migrates to
15006  *	64 bits
15007  *	vm_region_recurse: A form of vm_region which follows the
15008  *	submaps in a target map
15009  *
15010  */
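/*
 * Hedged user-space sketch (assumed typical usage, not part of this
 * file): this routine backs the mach_vm_region_recurse() call.  A
 * walker over a task's address space commonly looks like:
 *
 *	mach_vm_address_t addr = 0;
 *	mach_vm_size_t size;
 *	natural_t depth = 0;
 *	vm_region_submap_info_data_64_t info;
 *	mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
 *
 *	while (mach_vm_region_recurse(task, &addr, &size, &depth,
 *	    (vm_region_recurse_info_t)&info, &count) == KERN_SUCCESS) {
 *		if (info.is_submap) {
 *			depth++;	// descend into the submap
 *			continue;	// re-query the same address
 *		}
 *		// ... consume [addr, addr + size) ...
 *		addr += size;
 *		count = VM_REGION_SUBMAP_INFO_COUNT_64;
 *	}
 */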
15011 
15012 kern_return_t
15013 vm_map_region_recurse_64(
15014 	vm_map_t                map,
15015 	vm_map_offset_ut       *address_u,      /* IN/OUT */
15016 	vm_map_size_ut         *size_u,         /* OUT */
15017 	natural_t              *nesting_depth,  /* IN/OUT */
15018 	vm_region_submap_info_64_t submap_info, /* IN/OUT */
15019 	mach_msg_type_number_t *count)          /* IN/OUT */
15020 {
15021 	mach_msg_type_number_t  original_count;
15022 	vm_region_extended_info_data_t  extended;
15023 	vm_map_entry_t                  tmp_entry;
15024 	vm_map_offset_t                 user_address;
15025 	unsigned int                    user_max_depth;
15026 
15027 	/*
15028 	 * "curr_entry" is the VM map entry preceding or including the
15029 	 * address we're looking for.
15030 	 * "curr_map" is the map or sub-map containing "curr_entry".
15031 	 * "curr_address" is the equivalent of the top map's "user_address"
15032 	 * in the current map.
15033 	 * "curr_offset" is the cumulative offset of "curr_map" in the
15034 	 * target task's address space.
15035 	 * "curr_depth" is the depth of "curr_map" in the chain of
15036 	 * sub-maps.
15037 	 *
15038 	 * "curr_max_below" and "curr_max_above" limit the range (around
15039 	 * "curr_address") we should take into account in the current (sub)map.
15040 	 * They limit the range to what's visible through the map entries
15041 	 * we've traversed from the top map to the current map.
15042 	 *
15043 	 */
15044 	vm_map_entry_t                  curr_entry;
15045 	vm_map_t                        curr_entry_submap;
15046 	vm_map_address_t                curr_entry_start;
15047 	vm_object_offset_t              curr_entry_offset;
15048 	vm_map_address_t                curr_address;
15049 	vm_map_offset_t                 curr_offset;
15050 	vm_map_t                        curr_map;
15051 	unsigned int                    curr_depth;
15052 	vm_map_offset_t                 curr_max_below, curr_max_above;
15053 	vm_map_offset_t                 curr_skip;
15054 
15055 	/*
15056 	 * "next_" is the same as "curr_" but for the VM region immediately
15057 	 * after the address we're looking for.  We need to keep track of this
15058 	 * too because we want to return info about that region if the
15059 	 * address we're looking for is not mapped.
15060 	 */
15061 	vm_map_entry_t                  next_entry;
15062 	vm_map_offset_t                 next_offset;
15063 	vm_map_offset_t                 next_address;
15064 	vm_map_t                        next_map;
15065 	unsigned int                    next_depth;
15066 	vm_map_offset_t                 next_max_below, next_max_above;
15067 	vm_map_offset_t                 next_skip;
15068 
15069 	boolean_t                       look_for_pages;
15070 	vm_region_submap_short_info_64_t short_info;
15071 	boolean_t                       do_region_footprint;
15072 	int                             effective_page_size, effective_page_shift;
15073 	boolean_t                       submap_needed_copy;
15074 
15075 	if (map == VM_MAP_NULL) {
15076 		/* no address space to work on */
15077 		return KERN_INVALID_ARGUMENT;
15078 	}
15079 
15080 	user_address = vm_sanitize_addr(map, *address_u);
15081 
15082 	effective_page_shift = vm_self_region_page_shift(map);
15083 	effective_page_size = (1 << effective_page_shift);
15084 
15085 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
15086 		/*
15087 		 * "info" structure is not big enough and
15088 		 * would overflow
15089 		 */
15090 		return KERN_INVALID_ARGUMENT;
15091 	}
15092 
15093 	do_region_footprint = task_self_region_footprint();
15094 	original_count = *count;
15095 
15096 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
15097 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
15098 		look_for_pages = FALSE;
15099 		short_info = (vm_region_submap_short_info_64_t) submap_info;
15100 		submap_info = NULL;
15101 	} else {
15102 		look_for_pages = TRUE;
15103 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
15104 		short_info = NULL;
15105 
15106 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15107 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
15108 		}
15109 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15110 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
15111 		}
15112 	}
15113 
15114 	user_max_depth = *nesting_depth;
15115 	submap_needed_copy = FALSE;
15116 
15117 	if (not_in_kdp) {
15118 		vm_map_lock_read(map);
15119 	}
15120 
15121 recurse_again:
15122 	curr_entry = NULL;
15123 	curr_map = map;
15124 	curr_address = user_address;
15125 	curr_offset = 0;
15126 	curr_skip = 0;
15127 	curr_depth = 0;
15128 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
15129 	curr_max_below = curr_address;
15130 
15131 	next_entry = NULL;
15132 	next_map = NULL;
15133 	next_address = 0;
15134 	next_offset = 0;
15135 	next_skip = 0;
15136 	next_depth = 0;
15137 	next_max_above = (vm_map_offset_t) -1;
15138 	next_max_below = (vm_map_offset_t) -1;
15139 
15140 	for (;;) {
15141 		if (vm_map_lookup_entry(curr_map,
15142 		    curr_address,
15143 		    &tmp_entry)) {
15144 			/* tmp_entry contains the address we're looking for */
15145 			curr_entry = tmp_entry;
15146 		} else {
15147 			vm_map_offset_t skip;
15148 			/*
15149 			 * The address is not mapped.  "tmp_entry" is the
15150 			 * map entry preceding the address.  We want the next
15151 			 * one, if it exists.
15152 			 */
15153 			curr_entry = tmp_entry->vme_next;
15154 
15155 			if (curr_entry == vm_map_to_entry(curr_map) ||
15156 			    (curr_entry->vme_start >=
15157 			    curr_address + curr_max_above)) {
15158 				/* no next entry at this level: stop looking */
15159 				if (not_in_kdp) {
15160 					vm_map_unlock_read(curr_map);
15161 				}
15162 				curr_entry = NULL;
15163 				curr_map = NULL;
15164 				curr_skip = 0;
15165 				curr_offset = 0;
15166 				curr_depth = 0;
15167 				curr_max_above = 0;
15168 				curr_max_below = 0;
15169 				break;
15170 			}
15171 
15172 			/* adjust current address and offset */
15173 			skip = curr_entry->vme_start - curr_address;
15174 			curr_address = curr_entry->vme_start;
15175 			curr_skip += skip;
15176 			curr_offset += skip;
15177 			curr_max_above -= skip;
15178 			curr_max_below = 0;
15179 		}
15180 
15181 		/*
15182 		 * Is the next entry at this level closer to the address (or
15183 		 * deeper in the submap chain) than the one we had
15184 		 * so far?
15185 		 */
15186 		tmp_entry = curr_entry->vme_next;
15187 		if (tmp_entry == vm_map_to_entry(curr_map)) {
15188 			/* no next entry at this level */
15189 		} else if (tmp_entry->vme_start >=
15190 		    curr_address + curr_max_above) {
15191 			/*
15192 			 * tmp_entry is beyond the scope of what we mapped of
15193 			 * this submap in the upper level: ignore it.
15194 			 */
15195 		} else if ((next_entry == NULL) ||
15196 		    (tmp_entry->vme_start + curr_offset <=
15197 		    next_entry->vme_start + next_offset)) {
15198 			/*
15199 			 * We didn't have a "next_entry" or this one is
15200 			 * closer to the address we're looking for:
15201 			 * use this "tmp_entry" as the new "next_entry".
15202 			 */
15203 			if (next_entry != NULL) {
15204 				/* unlock the last "next_map" */
15205 				if (next_map != curr_map && not_in_kdp) {
15206 					vm_map_unlock_read(next_map);
15207 				}
15208 			}
15209 			next_entry = tmp_entry;
15210 			next_map = curr_map;
15211 			next_depth = curr_depth;
15212 			next_address = next_entry->vme_start;
15213 			next_skip = curr_skip;
15214 			next_skip += (next_address - curr_address);
15215 			next_offset = curr_offset;
15216 			next_offset += (next_address - curr_address);
15217 			next_max_above = MIN(next_max_above, curr_max_above);
15218 			next_max_above = MIN(next_max_above,
15219 			    next_entry->vme_end - next_address);
15220 			next_max_below = MIN(next_max_below, curr_max_below);
15221 			next_max_below = MIN(next_max_below,
15222 			    next_address - next_entry->vme_start);
15223 		}
15224 
15225 		/*
15226 		 * "curr_max_{above,below}" allow us to keep track of the
15227 		 * portion of the submap that is actually mapped at this level:
15228 		 * the rest of that submap is irrelevant to us, since it's not
15229 		 * mapped here.
15230 		 * The relevant portion of the map starts at
15231 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15232 		 */
15233 		curr_max_above = MIN(curr_max_above,
15234 		    curr_entry->vme_end - curr_address);
15235 		curr_max_below = MIN(curr_max_below,
15236 		    curr_address - curr_entry->vme_start);
15237 
15238 		if (!curr_entry->is_sub_map ||
15239 		    curr_depth >= user_max_depth) {
15240 			/*
15241 			 * We hit a leaf map or we reached the maximum depth
15242 			 * we could, so stop looking.  Keep the current map
15243 			 * locked.
15244 			 */
15245 			break;
15246 		}
15247 
15248 		/*
15249 		 * Get down to the next submap level.
15250 		 */
15251 
15252 		if (curr_entry->needs_copy) {
15253 			/* everything below this is effectively copy-on-write */
15254 			submap_needed_copy = TRUE;
15255 		}
15256 
15257 		/*
15258 		 * Lock the next level and unlock the current level,
15259 		 * unless we need to keep it locked to access the "next_entry"
15260 		 * later.
15261 		 */
15262 		curr_entry_submap = VME_SUBMAP(curr_entry);
15263 		curr_entry_start = curr_entry->vme_start;
15264 		curr_entry_offset = VME_OFFSET(curr_entry);
15265 		curr_entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking the map */
15266 		if (not_in_kdp) {
15267 			vm_map_lock_read(curr_entry_submap);
15268 		}
15269 		if (curr_map == next_map) {
15270 			/* keep "next_map" locked in case we need it */
15271 		} else {
15272 			/* release this map */
15273 			if (not_in_kdp) {
15274 				vm_map_unlock_read(curr_map);
15275 			}
15276 		}
15277 
15278 		/*
15279 		 * Adjust the offset.  "curr_entry" mapped the submap
15280 		 * at relative address "curr_entry_start" in the
15281 		 * curr_map but skips the first "curr_entry_offset"
15282 		 * bytes of the submap.
15283 		 * "curr_offset" always represents the offset of a virtual
15284 		 * address in the curr_map relative to the absolute address
15285 		 * space (i.e. the top-level VM map).
15286 		 */
15287 		curr_offset += curr_entry_offset - curr_entry_start;
15288 		curr_address = user_address + curr_offset;
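		/*
		 * For example (hypothetical numbers): a submap mapped at
		 * parent address 0x1000 ("curr_entry_start") with submap
		 * offset 0 ("curr_entry_offset") decreases "curr_offset"
		 * by 0x1000, so a top-level "user_address" of 0x1500
		 * yields a "curr_address" of 0x500 inside the submap.
		 */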
15289 		/* switch to the submap */
15290 		curr_map = curr_entry_submap;
15291 		curr_depth++;
15292 	}
15293 
15294 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
15295 // so probably should be a real 32b ID vs. ptr.
15296 // Current users just check for equality
15297 
15298 	if (curr_entry == NULL) {
15299 		/* no VM region contains the address... */
15300 
15301 		if (do_region_footprint && /* we want footprint numbers */
15302 		    next_entry == NULL && /* & there are no more regions */
15303 		    /* & we haven't already provided our fake region: */
15304 		    user_address <= vm_map_last_entry(map)->vme_end) {
15305 			ledger_amount_t ledger_resident, ledger_compressed;
15306 
15307 			/*
15308 			 * Add a fake memory region to account for
15309 			 * purgeable and/or ledger-tagged memory that
15310 			 * counts towards this task's memory footprint,
15311 			 * i.e. the resident/compressed pages of non-volatile
15312 			 * objects owned by that task.
15313 			 */
15314 			task_ledgers_footprint(map->pmap->ledger,
15315 			    &ledger_resident,
15316 			    &ledger_compressed);
15317 			if (ledger_resident + ledger_compressed == 0) {
15318 				/* no purgeable memory usage to report */
15319 				return KERN_INVALID_ADDRESS;
15320 			}
15321 			/* fake region to show nonvolatile footprint */
15322 			if (look_for_pages) {
15323 				submap_info->protection = VM_PROT_DEFAULT;
15324 				submap_info->max_protection = VM_PROT_DEFAULT;
15325 				submap_info->inheritance = VM_INHERIT_DEFAULT;
15326 				submap_info->offset = 0;
15327 				submap_info->user_tag = -1;
15328 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15329 				submap_info->pages_shared_now_private = 0;
15330 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15331 				submap_info->pages_dirtied = submap_info->pages_resident;
15332 				submap_info->ref_count = 1;
15333 				submap_info->shadow_depth = 0;
15334 				submap_info->external_pager = 0;
15335 				submap_info->share_mode = SM_PRIVATE;
15336 				if (submap_needed_copy) {
15337 					submap_info->share_mode = SM_COW;
15338 				}
15339 				submap_info->is_submap = 0;
15340 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15341 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15342 				submap_info->user_wired_count = 0;
15343 				submap_info->pages_reusable = 0;
15344 			} else {
15345 				short_info->user_tag = -1;
15346 				short_info->offset = 0;
15347 				short_info->protection = VM_PROT_DEFAULT;
15348 				short_info->inheritance = VM_INHERIT_DEFAULT;
15349 				short_info->max_protection = VM_PROT_DEFAULT;
15350 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
15351 				short_info->user_wired_count = 0;
15352 				short_info->is_submap = 0;
15353 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15354 				short_info->external_pager = 0;
15355 				short_info->shadow_depth = 0;
15356 				short_info->share_mode = SM_PRIVATE;
15357 				if (submap_needed_copy) {
15358 					short_info->share_mode = SM_COW;
15359 				}
15360 				short_info->ref_count = 1;
15361 			}
15362 			*nesting_depth = 0;
15363 			*address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15364 			*size_u    = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15365 			return KERN_SUCCESS;
15366 		}
15367 
15368 		if (next_entry == NULL) {
15369 			/* ... and no VM region follows it either */
15370 			return KERN_INVALID_ADDRESS;
15371 		}
15372 		/* ... gather info about the next VM region */
15373 		curr_entry = next_entry;
15374 		curr_map = next_map;    /* still locked ... */
15375 		curr_address = next_address;
15376 		curr_skip = next_skip;
15377 		curr_offset = next_offset;
15378 		curr_depth = next_depth;
15379 		curr_max_above = next_max_above;
15380 		curr_max_below = next_max_below;
15381 	} else {
15382 		/* we won't need "next_entry" after all */
15383 		if (next_entry != NULL) {
15384 			/* release "next_map" */
15385 			if (next_map != curr_map && not_in_kdp) {
15386 				vm_map_unlock_read(next_map);
15387 			}
15388 		}
15389 	}
15390 	next_entry = NULL;
15391 	next_map = NULL;
15392 	next_offset = 0;
15393 	next_skip = 0;
15394 	next_depth = 0;
15395 	next_max_below = -1;
15396 	next_max_above = -1;
15397 
15398 	if (curr_entry->is_sub_map &&
15399 	    curr_depth < user_max_depth) {
15400 		/*
15401 		 * We're not as deep as we could be:  we must have
15402 		 * gone back up after not finding anything mapped
15403 		 * below the original top-level map entry's range.
15404 		 * Let's move "curr_address" forward and recurse again.
15405 		 */
15406 		user_address = curr_address;
15407 		goto recurse_again;
15408 	}
15409 
15410 	*nesting_depth = curr_depth;
15411 	*address_u = vm_sanitize_wrap_addr(
15412 		user_address + curr_skip - curr_max_below);
15413 	*size_u    = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15414 
15415 	if (look_for_pages) {
15416 		submap_info->user_tag = VME_ALIAS(curr_entry);
15417 		submap_info->offset = VME_OFFSET(curr_entry);
15418 		submap_info->protection = curr_entry->protection;
15419 		submap_info->inheritance = curr_entry->inheritance;
15420 		submap_info->max_protection = curr_entry->max_protection;
15421 		submap_info->behavior = curr_entry->behavior;
15422 		submap_info->user_wired_count = curr_entry->user_wired_count;
15423 		submap_info->is_submap = curr_entry->is_sub_map;
15424 		if (curr_entry->is_sub_map) {
15425 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15426 		} else {
15427 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15428 		}
15429 	} else {
15430 		short_info->user_tag = VME_ALIAS(curr_entry);
15431 		short_info->offset = VME_OFFSET(curr_entry);
15432 		short_info->protection = curr_entry->protection;
15433 		short_info->inheritance = curr_entry->inheritance;
15434 		short_info->max_protection = curr_entry->max_protection;
15435 		short_info->behavior = curr_entry->behavior;
15436 		short_info->user_wired_count = curr_entry->user_wired_count;
15437 		short_info->is_submap = curr_entry->is_sub_map;
15438 		if (curr_entry->is_sub_map) {
15439 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15440 		} else {
15441 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15442 		}
15443 	}
15444 
15445 	extended.pages_resident = 0;
15446 	extended.pages_swapped_out = 0;
15447 	extended.pages_shared_now_private = 0;
15448 	extended.pages_dirtied = 0;
15449 	extended.pages_reusable = 0;
15450 	extended.external_pager = 0;
15451 	extended.shadow_depth = 0;
15452 	extended.share_mode = SM_EMPTY;
15453 	extended.ref_count = 0;
15454 
15455 	if (not_in_kdp) {
15456 		if (!curr_entry->is_sub_map) {
15457 			vm_map_offset_t range_start, range_end;
15458 			range_start = MAX((curr_address - curr_max_below),
15459 			    curr_entry->vme_start);
15460 			range_end = MIN((curr_address + curr_max_above),
15461 			    curr_entry->vme_end);
15462 			vm_map_region_walk(curr_map,
15463 			    range_start,
15464 			    curr_entry,
15465 			    (VME_OFFSET(curr_entry) +
15466 			    (range_start -
15467 			    curr_entry->vme_start)),
15468 			    range_end - range_start,
15469 			    &extended,
15470 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15471 			if (submap_needed_copy) {
15472 				extended.share_mode = SM_COW;
15473 			}
15474 		} else {
15475 			if (curr_entry->use_pmap) {
15476 				extended.share_mode = SM_TRUESHARED;
15477 			} else {
15478 				extended.share_mode = SM_PRIVATE;
15479 			}
15480 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15481 		}
15482 	}
15483 
15484 	if (look_for_pages) {
15485 		submap_info->pages_resident = extended.pages_resident;
15486 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15487 		submap_info->pages_shared_now_private =
15488 		    extended.pages_shared_now_private;
15489 		submap_info->pages_dirtied = extended.pages_dirtied;
15490 		submap_info->external_pager = extended.external_pager;
15491 		submap_info->shadow_depth = extended.shadow_depth;
15492 		submap_info->share_mode = extended.share_mode;
15493 		submap_info->ref_count = extended.ref_count;
15494 
15495 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15496 			submap_info->pages_reusable = extended.pages_reusable;
15497 		}
15498 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15499 			if (curr_entry->is_sub_map) {
15500 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15501 			} else if (VME_OBJECT(curr_entry)) {
15502 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15503 			} else {
15504 				submap_info->object_id_full = 0ull;
15505 			}
15506 		}
15507 	} else {
15508 		short_info->external_pager = extended.external_pager;
15509 		short_info->shadow_depth = extended.shadow_depth;
15510 		short_info->share_mode = extended.share_mode;
15511 		short_info->ref_count = extended.ref_count;
15512 	}
15513 
15514 	if (not_in_kdp) {
15515 		vm_map_unlock_read(curr_map);
15516 	}
15517 
15518 	return KERN_SUCCESS;
15519 }
15520 
15521 /*
15522  *	vm_region:
15523  *
15524  *	User call to obtain information about a region in
15525  *	a task's address map. Currently, only one flavor is
15526  *	supported.
15527  *
15528  *	XXX The reserved and behavior fields cannot be filled
15529  *	    in until the vm merge from the IK is completed, and
15530  *	    vm_reserve is implemented.
15531  */
15532 
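/*
 * Hedged user-space sketch (assumed typical usage, not part of this
 * file): this routine backs mach_vm_region().  Querying a single
 * region might look like:
 *
 *	mach_vm_address_t addr = start_address;	// placeholder
 *	mach_vm_size_t size;
 *	vm_region_basic_info_data_64_t info;
 *	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
 *	mach_port_t object_name;
 *
 *	kern_return_t kr = mach_vm_region(task, &addr, &size,
 *	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info,
 *	    &count, &object_name);
 *	// on success, [addr, addr + size) is the region containing (or
 *	// following) start_address; info.protection describes it
 */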
15533 kern_return_t
15534 vm_map_region(
15535 	vm_map_t                map,
15536 	vm_map_offset_ut       *address_u,      /* IN/OUT */
15537 	vm_map_size_ut         *size_u,         /* OUT */
15538 	vm_region_flavor_t      flavor,         /* IN */
15539 	vm_region_info_t        info,           /* OUT */
15540 	mach_msg_type_number_t *count,          /* IN/OUT */
15541 	mach_port_t            *object_name)    /* OUT */
15542 {
15543 	vm_map_entry_t          tmp_entry;
15544 	vm_map_entry_t          entry;
15545 	vm_map_offset_t         start;
15546 
15547 	if (map == VM_MAP_NULL) {
15548 		return KERN_INVALID_ARGUMENT;
15549 	}
15550 
15551 	start = vm_sanitize_addr(map, *address_u);
15552 
15553 	switch (flavor) {
15554 	case VM_REGION_BASIC_INFO:
15555 		/* legacy for old 32-bit objects info */
15556 	{
15557 		vm_region_basic_info_t  basic;
15558 
15559 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
15560 			return KERN_INVALID_ARGUMENT;
15561 		}
15562 
15563 		basic = (vm_region_basic_info_t) info;
15564 		*count = VM_REGION_BASIC_INFO_COUNT;
15565 
15566 		vm_map_lock_read(map);
15567 
15568 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15569 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15570 				vm_map_unlock_read(map);
15571 				return KERN_INVALID_ADDRESS;
15572 			}
15573 		} else {
15574 			entry = tmp_entry;
15575 		}
15576 
15577 		start = entry->vme_start;
15578 
15579 		basic->offset = (uint32_t)VME_OFFSET(entry);
15580 		basic->protection = entry->protection;
15581 		basic->inheritance = entry->inheritance;
15582 		basic->max_protection = entry->max_protection;
15583 		basic->behavior = entry->behavior;
15584 		basic->user_wired_count = entry->user_wired_count;
15585 		basic->reserved = entry->is_sub_map;
15586 
15587 		*address_u = vm_sanitize_wrap_addr(start);
15588 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15589 
15590 		if (object_name) {
15591 			*object_name = IP_NULL;
15592 		}
15593 		if (entry->is_sub_map) {
15594 			basic->shared = FALSE;
15595 		} else {
15596 			basic->shared = entry->is_shared;
15597 		}
15598 
15599 		vm_map_unlock_read(map);
15600 		return KERN_SUCCESS;
15601 	}
15602 
15603 	case VM_REGION_BASIC_INFO_64:
15604 	{
15605 		vm_region_basic_info_64_t       basic;
15606 
15607 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15608 			return KERN_INVALID_ARGUMENT;
15609 		}
15610 
15611 		basic = (vm_region_basic_info_64_t) info;
15612 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15613 
15614 		vm_map_lock_read(map);
15615 
15616 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15617 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15618 				vm_map_unlock_read(map);
15619 				return KERN_INVALID_ADDRESS;
15620 			}
15621 		} else {
15622 			entry = tmp_entry;
15623 		}
15624 
15625 		start = entry->vme_start;
15626 
15627 		basic->offset = VME_OFFSET(entry);
15628 		basic->protection = entry->protection;
15629 		basic->inheritance = entry->inheritance;
15630 		basic->max_protection = entry->max_protection;
15631 		basic->behavior = entry->behavior;
15632 		basic->user_wired_count = entry->user_wired_count;
15633 		basic->reserved = entry->is_sub_map;
15634 
15635 		*address_u = vm_sanitize_wrap_addr(start);
15636 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15637 
15638 		if (object_name) {
15639 			*object_name = IP_NULL;
15640 		}
15641 		if (entry->is_sub_map) {
15642 			basic->shared = FALSE;
15643 		} else {
15644 			basic->shared = entry->is_shared;
15645 		}
15646 
15647 		vm_map_unlock_read(map);
15648 		return KERN_SUCCESS;
15649 	}
15650 	case VM_REGION_EXTENDED_INFO:
15651 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15652 			return KERN_INVALID_ARGUMENT;
15653 		}
15654 		OS_FALLTHROUGH;
15655 	case VM_REGION_EXTENDED_INFO__legacy:
15656 	{
15657 		vm_region_extended_info_t       extended;
15658 		mach_msg_type_number_t original_count;
15659 		int effective_page_size, effective_page_shift;
15660 
15661 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15662 			return KERN_INVALID_ARGUMENT;
15663 		}
15664 
15665 		extended = (vm_region_extended_info_t) info;
15666 
15667 		effective_page_shift = vm_self_region_page_shift(map);
15668 		effective_page_size = (1 << effective_page_shift);
15669 
15670 		vm_map_lock_read(map);
15671 
15672 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15673 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15674 				vm_map_unlock_read(map);
15675 				return KERN_INVALID_ADDRESS;
15676 			}
15677 		} else {
15678 			entry = tmp_entry;
15679 		}
15680 		start = entry->vme_start;
15681 
15682 		extended->protection = entry->protection;
15683 		extended->user_tag = VME_ALIAS(entry);
15684 		extended->pages_resident = 0;
15685 		extended->pages_swapped_out = 0;
15686 		extended->pages_shared_now_private = 0;
15687 		extended->pages_dirtied = 0;
15688 		extended->external_pager = 0;
15689 		extended->shadow_depth = 0;
15690 
15691 		original_count = *count;
15692 		if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15693 			*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15694 		} else {
15695 			extended->pages_reusable = 0;
15696 			*count = VM_REGION_EXTENDED_INFO_COUNT;
15697 		}
15698 
15699 		vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15700 
15701 		if (object_name) {
15702 			*object_name = IP_NULL;
15703 		}
15704 
15705 		*address_u = vm_sanitize_wrap_addr(start);
15706 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15707 
15708 		vm_map_unlock_read(map);
15709 		return KERN_SUCCESS;
15710 	}
15711 	case VM_REGION_TOP_INFO:
15712 	{
15713 		vm_region_top_info_t    top;
15714 
15715 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15716 			return KERN_INVALID_ARGUMENT;
15717 		}
15718 
15719 		top = (vm_region_top_info_t) info;
15720 		*count = VM_REGION_TOP_INFO_COUNT;
15721 
15722 		vm_map_lock_read(map);
15723 
15724 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15725 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15726 				vm_map_unlock_read(map);
15727 				return KERN_INVALID_ADDRESS;
15728 			}
15729 		} else {
15730 			entry = tmp_entry;
15731 		}
15732 		start = entry->vme_start;
15733 
15734 		top->private_pages_resident = 0;
15735 		top->shared_pages_resident = 0;
15736 
15737 		vm_map_region_top_walk(entry, top);
15738 
15739 		if (object_name) {
15740 			*object_name = IP_NULL;
15741 		}
15742 
15743 		*address_u = vm_sanitize_wrap_addr(start);
15744 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15745 
15746 		vm_map_unlock_read(map);
15747 		return KERN_SUCCESS;
15748 	}
15749 	default:
15750 		return KERN_INVALID_ARGUMENT;
15751 	}
15752 }
15753 
15754 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
15755 	MIN((entry_size),                                               \
15756 	    ((obj)->all_reusable ?                                      \
15757 	     (obj)->wired_page_count :                                  \
15758 	     (obj)->resident_page_count - (obj)->reusable_page_count))
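/*
 * Worked example (hypothetical numbers): for an entry spanning 10 pages
 * whose object has 6 resident pages, 2 of them reusable, and
 * all_reusable == FALSE, OBJ_RESIDENT_COUNT yields MIN(10, 6 - 2) == 4.
 * If the object were marked all_reusable, only its wired pages would be
 * counted instead.
 */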
15759 
15760 void
15761 vm_map_region_top_walk(
15762 	vm_map_entry_t             entry,
15763 	vm_region_top_info_t       top)
15764 {
15765 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15766 		top->share_mode = SM_EMPTY;
15767 		top->ref_count = 0;
15768 		top->obj_id = 0;
15769 		return;
15770 	}
15771 
15772 	{
15773 		struct  vm_object *obj, *tmp_obj;
15774 		int             ref_count;
15775 		uint32_t        entry_size;
15776 
15777 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15778 
15779 		obj = VME_OBJECT(entry);
15780 
15781 		vm_object_lock(obj);
15782 
15783 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15784 		    obj->paging_in_progress) {
15785 			ref_count--;
15786 		}
15787 
15788 		assert(obj->reusable_page_count <= obj->resident_page_count);
15789 		if (obj->shadow) {
15790 			if (ref_count == 1) {
15791 				top->private_pages_resident =
15792 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15793 			} else {
15794 				top->shared_pages_resident =
15795 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15796 			}
15797 			top->ref_count  = ref_count;
15798 			top->share_mode = SM_COW;
15799 
15800 			while ((tmp_obj = obj->shadow)) {
15801 				vm_object_lock(tmp_obj);
15802 				vm_object_unlock(obj);
15803 				obj = tmp_obj;
15804 
15805 				if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15806 				    obj->paging_in_progress) {
15807 					ref_count--;
15808 				}
15809 
15810 				assert(obj->reusable_page_count <= obj->resident_page_count);
15811 				top->shared_pages_resident +=
15812 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15813 				top->ref_count += ref_count - 1;
15814 			}
15815 		} else {
15816 			if (entry->superpage_size) {
15817 				top->share_mode = SM_LARGE_PAGE;
15818 				top->shared_pages_resident = 0;
15819 				top->private_pages_resident = entry_size;
15820 			} else if (entry->needs_copy) {
15821 				top->share_mode = SM_COW;
15822 				top->shared_pages_resident =
15823 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15824 			} else {
15825 				if (ref_count == 1 ||
15826 				    (ref_count == 2 && obj->named)) {
15827 					top->share_mode = SM_PRIVATE;
15828 					top->private_pages_resident =
15829 					    OBJ_RESIDENT_COUNT(obj,
15830 					    entry_size);
15831 				} else {
15832 					top->share_mode = SM_SHARED;
15833 					top->shared_pages_resident =
15834 					    OBJ_RESIDENT_COUNT(obj,
15835 					    entry_size);
15836 				}
15837 			}
15838 			top->ref_count = ref_count;
15839 		}
15840 
15841 		vm_object_unlock(obj);
15842 
15843 		/* XXX K64: obj_id will be truncated */
15844 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15845 	}
15846 }
15847 
15848 void
15849 vm_map_region_walk(
15850 	vm_map_t                        map,
15851 	vm_map_offset_t                 va,
15852 	vm_map_entry_t                  entry,
15853 	vm_object_offset_t              offset,
15854 	vm_object_size_t                range,
15855 	vm_region_extended_info_t       extended,
15856 	boolean_t                       look_for_pages,
15857 	mach_msg_type_number_t count)
15858 {
15859 	struct vm_object *obj, *tmp_obj;
15860 	vm_map_offset_t       last_offset;
15861 	int               i;
15862 	int               ref_count;
15863 	struct vm_object        *shadow_object;
15864 	unsigned short          shadow_depth;
15865 	boolean_t         do_region_footprint;
15866 	int                     effective_page_size, effective_page_shift;
15867 	vm_map_offset_t         effective_page_mask;
15868 
15869 	do_region_footprint = task_self_region_footprint();
15870 
15871 	if ((entry->is_sub_map) ||
15872 	    (VME_OBJECT(entry) == 0) ||
15873 	    (VME_OBJECT(entry)->phys_contiguous &&
15874 	    !entry->superpage_size)) {
15875 		extended->share_mode = SM_EMPTY;
15876 		extended->ref_count = 0;
15877 		return;
15878 	}
15879 
15880 	if (entry->superpage_size) {
15881 		extended->shadow_depth = 0;
15882 		extended->share_mode = SM_LARGE_PAGE;
15883 		extended->ref_count = 1;
15884 		extended->external_pager = 0;
15885 
15886 		/* TODO4K: Superpage in 4k mode? */
15887 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15888 		extended->shadow_depth = 0;
15889 		return;
15890 	}
15891 
15892 	effective_page_shift = vm_self_region_page_shift(map);
15893 	effective_page_size = (1 << effective_page_shift);
15894 	effective_page_mask = effective_page_size - 1;
15895 
15896 	offset = vm_map_trunc_page(offset, effective_page_mask);
15897 
15898 	obj = VME_OBJECT(entry);
15899 
15900 	vm_object_lock(obj);
15901 
15902 	if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15903 	    obj->paging_in_progress) {
15904 		ref_count--;
15905 	}
15906 
15907 	if (look_for_pages) {
15908 		for (last_offset = offset + range;
15909 		    offset < last_offset;
15910 		    offset += effective_page_size, va += effective_page_size) {
15911 			if (do_region_footprint) {
15912 				int disp;
15913 
15914 				disp = 0;
15915 				if (map->has_corpse_footprint) {
15916 					/*
15917 					 * Query the page info data we saved
15918 					 * while forking the corpse.
15919 					 */
15920 					vm_map_corpse_footprint_query_page_info(
15921 						map,
15922 						va,
15923 						&disp);
15924 				} else {
15925 					/*
15926 					 * Query the pmap.
15927 					 */
15928 					vm_map_footprint_query_page_info(
15929 						map,
15930 						entry,
15931 						va,
15932 						&disp);
15933 				}
15934 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15935 					extended->pages_resident++;
15936 				}
15937 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15938 					extended->pages_reusable++;
15939 				}
15940 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15941 					extended->pages_dirtied++;
15942 				}
15943 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15944 					extended->pages_swapped_out++;
15945 				}
15946 				continue;
15947 			}
15948 
15949 			vm_map_region_look_for_page(map, va, obj,
15950 			    vm_object_trunc_page(offset), ref_count,
15951 			    0, extended, count);
15952 		}
15953 
15954 		if (do_region_footprint) {
15955 			goto collect_object_info;
15956 		}
15957 	} else {
15958 collect_object_info:
15959 		shadow_object = obj->shadow;
15960 		shadow_depth = 0;
15961 
15962 		if (!(obj->internal)) {
15963 			extended->external_pager = 1;
15964 		}
15965 
15966 		if (shadow_object != VM_OBJECT_NULL) {
15967 			vm_object_lock(shadow_object);
15968 			for (;
15969 			    shadow_object != VM_OBJECT_NULL;
15970 			    shadow_depth++) {
15971 				vm_object_t     next_shadow;
15972 
15973 				if (!(shadow_object->internal)) {
15974 					extended->external_pager = 1;
15975 				}
15976 
15977 				next_shadow = shadow_object->shadow;
15978 				if (next_shadow) {
15979 					vm_object_lock(next_shadow);
15980 				}
15981 				vm_object_unlock(shadow_object);
15982 				shadow_object = next_shadow;
15983 			}
15984 		}
15985 		extended->shadow_depth = shadow_depth;
15986 	}
15987 
15988 	if (extended->shadow_depth || entry->needs_copy) {
15989 		extended->share_mode = SM_COW;
15990 	} else {
15991 		if (ref_count == 1) {
15992 			extended->share_mode = SM_PRIVATE;
15993 		} else {
15994 			if (obj->true_share) {
15995 				extended->share_mode = SM_TRUESHARED;
15996 			} else {
15997 				extended->share_mode = SM_SHARED;
15998 			}
15999 		}
16000 	}
16001 	extended->ref_count = ref_count - extended->shadow_depth;
16002 
16003 	for (i = 0; i < extended->shadow_depth; i++) {
16004 		if ((tmp_obj = obj->shadow) == 0) {
16005 			break;
16006 		}
16007 		vm_object_lock(tmp_obj);
16008 		vm_object_unlock(obj);
16009 
16010 		if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
16011 		    tmp_obj->paging_in_progress) {
16012 			ref_count--;
16013 		}
16014 
16015 		extended->ref_count += ref_count;
16016 		obj = tmp_obj;
16017 	}
16018 	vm_object_unlock(obj);
16019 
16020 	if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
16021 		extended->share_mode = SM_PRIVATE;
16022 	} else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
16023 		vm_map_entry_t       cur;
16024 		vm_map_entry_t       last;
16025 		int      my_refs;
16026 
16027 		obj = VME_OBJECT(entry);
16028 		last = vm_map_to_entry(map);
16029 		my_refs = 0;
16030 
16031 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
16032 		    obj->paging_in_progress) {
16033 			ref_count--;
16034 		}
16035 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
16036 			if (vm_map_region_has_obj_ref(cur, obj)) {
16037 				my_refs++;
16038 			}
16039 		}
16040 
16041 		if (my_refs == ref_count) {
16042 			extended->share_mode = SM_PRIVATE_ALIASED;
16043 		} else if (my_refs > 1) {
16044 			extended->share_mode = SM_SHARED_ALIASED;
16045 		}
16046 	}
16047 }
16048 
16049 
16050 /* object is locked on entry and locked on return */
16051 
16052 
16053 static void
16054 vm_map_region_look_for_page(
16055 	__unused vm_map_t               map,
16056 	__unused vm_map_offset_t        va,
16057 	vm_object_t                     object,
16058 	vm_object_offset_t              offset,
16059 	int                             max_refcnt,
16060 	unsigned short                  depth,
16061 	vm_region_extended_info_t       extended,
16062 	mach_msg_type_number_t count)
16063 {
16064 	vm_page_t       p;
16065 	vm_object_t     shadow;
16066 	int             ref_count;
16067 	vm_object_t     caller_object;
16068 
16069 	shadow = object->shadow;
16070 	caller_object = object;
16071 
16072 
16073 	while (TRUE) {
16074 		if (!(object->internal)) {
16075 			extended->external_pager = 1;
16076 		}
16077 
16078 		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
16079 			if (shadow && (max_refcnt == 1)) {
16080 				extended->pages_shared_now_private++;
16081 			}
16082 
16083 			if (!vm_page_is_fictitious(p) &&
16084 			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
16085 				extended->pages_dirtied++;
16086 			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
16087 				if (p->vmp_reusable || object->all_reusable) {
16088 					extended->pages_reusable++;
16089 				}
16090 			}
16091 
16092 			extended->pages_resident++;
16093 
16094 			if (object != caller_object) {
16095 				vm_object_unlock(object);
16096 			}
16097 
16098 			return;
16099 		}
16100 		if (object->internal &&
16101 		    object->alive &&
16102 		    !object->terminating &&
16103 		    object->pager_ready) {
16104 			if (vm_object_compressor_pager_state_get(object, offset)
16105 			    == VM_EXTERNAL_STATE_EXISTS) {
16106 				/* the pager has that page */
16107 				extended->pages_swapped_out++;
16108 				if (object != caller_object) {
16109 					vm_object_unlock(object);
16110 				}
16111 				return;
16112 			}
16113 		}
16114 
16115 		if (shadow) {
16116 			vm_object_lock(shadow);
16117 			if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
16118 			    shadow->paging_in_progress) {
16119 				ref_count--;
16120 			}
16121 
16122 			if (++depth > extended->shadow_depth) {
16123 				extended->shadow_depth = depth;
16124 			}
16125 
16126 			if (ref_count > max_refcnt) {
16127 				max_refcnt = ref_count;
16128 			}
16129 
16130 			if (object != caller_object) {
16131 				vm_object_unlock(object);
16132 			}
16133 
16134 			offset = offset + object->vo_shadow_offset;
16135 			object = shadow;
16136 			shadow = object->shadow;
16137 			continue;
16138 		}
16139 		if (object != caller_object) {
16140 			vm_object_unlock(object);
16141 		}
16142 		break;
16143 	}
16144 }
16145 
16146 static inline boolean_t
16147 vm_map_region_has_obj_ref(
16148 	vm_map_entry_t    entry,
16149 	vm_object_t       object)
16150 {
16151 	vm_object_t cur_obj;
16152 	vm_object_t shadow_obj;
16153 
16154 	if (entry->is_sub_map) {
16155 		return FALSE;
16156 	}
16157 
16158 	cur_obj = VME_OBJECT(entry);
16159 	if (cur_obj == VM_OBJECT_NULL) {
16160 		return FALSE;
16161 	} else if (cur_obj == object) {
16162 		return TRUE;
16163 	}
16164 
16165 	/*
16166 	 * Avoid locks for first shadow check, otherwise diagnostic tools will
16167 	 * spend most of their time obtaining locks in this function when analyzing
16168 	 * processes with many VM entries which may commonly have no shadow chain.
16169 	 *
16170 	 * This is acceptable because:
16171 	 *  - Shadow's fields are not accessed outside of its lock
16172 	 *  - Objects are unlikely to be modified due to:
16173 	 *	  - Many diagnostic tools suspend the task
16174 	 *	  - VM map is locked
16175 	 *	- The rare incorrect return from this function turns a guess into a
16176 	 *	  slightly worse guess
16177 	 *	- Entire shadow chain is not locked as a whole, so can still change
16178 	 *	  while traversing, resulting in incorrect guess even with locking
16179 	 */
16180 	shadow_obj = cur_obj->shadow;
16181 	if (shadow_obj == VM_OBJECT_NULL) {
16182 		return FALSE;
16183 	} else if (shadow_obj == object) {
16184 		return TRUE;
16185 	}
16186 
16187 	vm_object_lock(cur_obj);
16188 
16189 	while ((shadow_obj = cur_obj->shadow)) {
16190 		/* check if object was found before grabbing a lock */
16191 		if (shadow_obj == object) {
16192 			vm_object_unlock(cur_obj);
16193 			return TRUE;
16194 		}
16195 
16196 		vm_object_lock(shadow_obj);
16197 		vm_object_unlock(cur_obj);
16198 		cur_obj = shadow_obj;
16199 	}
16200 
16201 	/* exhausted the shadow chain */
16202 	vm_object_unlock(cur_obj);
16203 	return FALSE;
16204 }
16205 
16206 
16207 /*
16208  *	Routine:	vm_map_simplify
16209  *
16210  *	Description:
16211  *		Attempt to simplify the map representation in
16212  *		the vicinity of the given starting address.
16213  *	Note:
16214  *		This routine is intended primarily to keep the
16215  *		kernel maps more compact -- they generally don't
16216  *		benefit from the "expand a map entry" technology
16217  *		at allocation time because the adjacent entry
16218  *		is often wired down.
16219  */
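/*
 * Worked example (hypothetical addresses): adjacent entries
 * [0x1000, 0x3000) and [0x3000, 0x5000) mapping the same object at
 * offsets 0x0 and 0x2000, with identical protections, inheritance,
 * wiring and flags, are coalesced by vm_map_simplify_entry() into a
 * single entry [0x1000, 0x5000) at offset 0x0.
 */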
16220 void
16221 vm_map_simplify_entry(
16222 	vm_map_t        map,
16223 	vm_map_entry_t  this_entry)
16224 {
16225 	vm_map_entry_t  prev_entry;
16226 
16227 	prev_entry = this_entry->vme_prev;
16228 
16229 	if ((this_entry != vm_map_to_entry(map)) &&
16230 	    (prev_entry != vm_map_to_entry(map)) &&
16231 
16232 	    (prev_entry->vme_end == this_entry->vme_start) &&
16233 
16234 	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16235 	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16236 	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16237 	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16238 	    prev_entry->vme_start))
16239 	    == VME_OFFSET(this_entry)) &&
16240 
16241 	    (prev_entry->behavior == this_entry->behavior) &&
16242 	    (prev_entry->needs_copy == this_entry->needs_copy) &&
16243 	    (prev_entry->protection == this_entry->protection) &&
16244 	    (prev_entry->max_protection == this_entry->max_protection) &&
16245 	    (prev_entry->inheritance == this_entry->inheritance) &&
16246 	    (prev_entry->use_pmap == this_entry->use_pmap) &&
16247 	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16248 	    (prev_entry->no_cache == this_entry->no_cache) &&
16249 	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16250 	    (prev_entry->map_aligned == this_entry->map_aligned) &&
16251 	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16252 	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16253 #if __arm64e__
16254 	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16255 #endif
16256 	    (prev_entry->csm_associated == this_entry->csm_associated) &&
16257 	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16258 	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16259 	    (prev_entry->vme_resilient_codesign ==
16260 	    this_entry->vme_resilient_codesign) &&
16261 	    (prev_entry->vme_resilient_media ==
16262 	    this_entry->vme_resilient_media) &&
16263 	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16264 	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16265 
16266 	    (prev_entry->wired_count == this_entry->wired_count) &&
16267 	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16268 
16269 	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16270 	    (prev_entry->in_transition == FALSE) &&
16271 	    (this_entry->in_transition == FALSE) &&
16272 	    (prev_entry->needs_wakeup == FALSE) &&
16273 	    (this_entry->needs_wakeup == FALSE) &&
16274 	    (prev_entry->is_shared == this_entry->is_shared) &&
16275 	    (prev_entry->superpage_size == FALSE) &&
16276 	    (this_entry->superpage_size == FALSE)
16277 	    ) {
16278 		if (prev_entry->vme_permanent) {
16279 			assert(this_entry->vme_permanent);
16280 			prev_entry->vme_permanent = false;
16281 		}
16282 		vm_map_store_entry_unlink(map, prev_entry, true);
16283 		assert(prev_entry->vme_start < this_entry->vme_end);
16284 		if (prev_entry->map_aligned) {
16285 			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16286 			    VM_MAP_PAGE_MASK(map)));
16287 		}
16288 		this_entry->vme_start = prev_entry->vme_start;
16289 		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16290 
16291 		if (map->holelistenabled) {
16292 			vm_map_store_update_first_free(map, this_entry, TRUE);
16293 		}
16294 
16295 		if (prev_entry->is_sub_map) {
16296 			vm_map_deallocate(VME_SUBMAP(prev_entry));
16297 		} else {
16298 			vm_object_deallocate(VME_OBJECT(prev_entry));
16299 		}
16300 		vm_map_entry_dispose(prev_entry);
16301 		SAVE_HINT_MAP_WRITE(map, this_entry);
16302 	}
16303 }
16304 
16305 void
16306 vm_map_simplify(
16307 	vm_map_t        map,
16308 	vm_map_offset_t start)
16309 {
16310 	vm_map_entry_t  this_entry;
16311 
16312 	vm_map_lock(map);
16313 	if (vm_map_lookup_entry(map, start, &this_entry)) {
16314 		vm_map_simplify_entry(map, this_entry);
16315 		vm_map_simplify_entry(map, this_entry->vme_next);
16316 	}
16317 	vm_map_unlock(map);
16318 }
16319 
16320 static void
16321 vm_map_simplify_range(
16322 	vm_map_t        map,
16323 	vm_map_offset_t start,
16324 	vm_map_offset_t end)
16325 {
16326 	vm_map_entry_t  entry;
16327 
16328 	/*
16329 	 * The map should be locked (for "write") by the caller.
16330 	 */
16331 
16332 	if (start >= end) {
16333 		/* invalid address range */
16334 		return;
16335 	}
16336 
16337 	start = vm_map_trunc_page(start,
16338 	    VM_MAP_PAGE_MASK(map));
16339 	end = vm_map_round_page(end,
16340 	    VM_MAP_PAGE_MASK(map));
16341 
16342 	if (!vm_map_lookup_entry(map, start, &entry)) {
16343 		/* "start" is not mapped and "entry" ends before "start" */
16344 		if (entry == vm_map_to_entry(map)) {
16345 			/* start with first entry in the map */
16346 			entry = vm_map_first_entry(map);
16347 		} else {
16348 			/* start with next entry */
16349 			entry = entry->vme_next;
16350 		}
16351 	}
16352 
16353 	while (entry != vm_map_to_entry(map) &&
16354 	    entry->vme_start <= end) {
16355 		/* try and coalesce "entry" with its previous entry */
16356 		vm_map_simplify_entry(map, entry);
16357 		entry = entry->vme_next;
16358 	}
16359 }
16360 
16361 static __attribute__((always_inline, warn_unused_result))
16362 kern_return_t
16363 vm_map_machine_attribute_sanitize(
16364 	vm_map_t                map,
16365 	vm_map_offset_ut        start_u,
16366 	vm_map_offset_ut        end_u,
16367 	mach_vm_offset_t       *start,
16368 	mach_vm_offset_t       *end,
16369 	vm_map_size_t          *size)
16370 {
16371 	return vm_sanitize_addr_end(start_u, end_u,
16372 	           VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16373 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
16374 	           size);
16375 }
16376 
16377 
16378 /*
16379  *	Routine:	vm_map_machine_attribute
16380  *	Purpose:
16381  *		Provide machine-specific attributes to mappings,
16382  *		such as cacheability etc. for machines that provide
16383  *		them.  NUMA architectures and machines with big/strange
16384  *		caches will use this.
16385  *	Note:
16386  *		Responsibilities for locking and checking are handled here,
16387  *		everything else in the pmap module. If any non-volatile
16388  *		information must be kept, the pmap module should handle
16389  *		it itself. [This assumes that attributes do not
16390  *		need to be inherited, which seems ok to me]
16391  */
16392 kern_return_t
16393 vm_map_machine_attribute(
16394 	vm_map_t                map,
16395 	vm_map_offset_ut        start_u,
16396 	vm_map_offset_ut        end_u,
16397 	vm_machine_attribute_t  attribute,
16398 	vm_machine_attribute_val_t *value) /* IN/OUT */
16399 {
16400 	mach_vm_offset_t start, end;
16401 	vm_map_size_t    sync_size;
16402 	kern_return_t    ret;
16403 	vm_map_entry_t   entry;
16404 
16405 	ret = vm_map_machine_attribute_sanitize(map,
16406 	    start_u,
16407 	    end_u,
16408 	    &start,
16409 	    &end,
16410 	    &sync_size);
16411 	if (__improbable(ret != KERN_SUCCESS)) {
16412 		return vm_sanitize_get_kr(ret);
16413 	}
16414 
16415 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
16416 		return KERN_INVALID_ADDRESS;
16417 	}
16418 
16419 	vm_map_lock(map);
16420 
16421 	if (attribute != MATTR_CACHE) {
16422 		/* If we don't have to find physical addresses, we */
16423 		/* don't have to do an explicit traversal here.    */
16424 		ret = pmap_attribute(map->pmap, start, end - start,
16425 		    attribute, value);
16426 		vm_map_unlock(map);
16427 		return ret;
16428 	}
16429 
16430 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
16431 
16432 	while (sync_size) {
16433 		if (vm_map_lookup_entry(map, start, &entry)) {
16434 			vm_map_size_t   sub_size;
16435 			if ((entry->vme_end - start) > sync_size) {
16436 				sub_size = sync_size;
16437 				sync_size = 0;
16438 			} else {
16439 				sub_size = entry->vme_end - start;
16440 				sync_size -= sub_size;
16441 			}
16442 			if (entry->is_sub_map) {
16443 				vm_map_offset_t sub_start;
16444 				vm_map_offset_t sub_end;
16445 
16446 				sub_start = (start - entry->vme_start)
16447 				    + VME_OFFSET(entry);
16448 				sub_end = sub_start + sub_size;
16449 				vm_map_machine_attribute(
16450 					VME_SUBMAP(entry),
16451 					sub_start,
16452 					sub_end,
16453 					attribute, value);
16454 			} else if (VME_OBJECT(entry)) {
16455 				vm_page_t               m;
16456 				vm_object_t             object;
16457 				vm_object_t             base_object;
16458 				vm_object_t             last_object;
16459 				vm_object_offset_t      offset;
16460 				vm_object_offset_t      base_offset;
16461 				vm_map_size_t           range;
16462 				range = sub_size;
16463 				offset = (start - entry->vme_start)
16464 				    + VME_OFFSET(entry);
16465 				offset = vm_object_trunc_page(offset);
16466 				base_offset = offset;
16467 				object = VME_OBJECT(entry);
16468 				base_object = object;
16469 				last_object = NULL;
16470 
16471 				vm_object_lock(object);
16472 
16473 				while (range) {
16474 					m = vm_page_lookup(
16475 						object, offset);
16476 
16477 					if (m && !vm_page_is_fictitious(m)) {
16478 						ret =
16479 						    pmap_attribute_cache_sync(
16480 							VM_PAGE_GET_PHYS_PAGE(m),
16481 							PAGE_SIZE,
16482 							attribute, value);
16483 					} else if (object->shadow) {
16484 						offset = offset + object->vo_shadow_offset;
16485 						last_object = object;
16486 						object = object->shadow;
16487 						vm_object_lock(last_object->shadow);
16488 						vm_object_unlock(last_object);
16489 						continue;
16490 					}
16491 					if (range < PAGE_SIZE) {
16492 						range = 0;
16493 					} else {
16494 						range -= PAGE_SIZE;
16495 					}
16496 
16497 					if (base_object != object) {
16498 						vm_object_unlock(object);
16499 						vm_object_lock(base_object);
16500 						object = base_object;
16501 					}
16502 					/* Bump to the next page */
16503 					base_offset += PAGE_SIZE;
16504 					offset = base_offset;
16505 				}
16506 				vm_object_unlock(object);
16507 			}
16508 			start += sub_size;
16509 		} else {
16510 			vm_map_unlock(map);
16511 			return KERN_FAILURE;
16512 		}
16513 	}
16514 
16515 	vm_map_unlock(map);
16516 
16517 	return ret;
16518 }
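/*
 * Hedged usage sketch (assumed, not part of this file): the common use
 * of this interface is a cache operation over a range, e.g. from user
 * space via vm_machine_attribute():
 *
 *	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;
 *	kern_return_t kr = vm_machine_attribute(mach_task_self(),
 *	    addr, size, MATTR_CACHE, &value);
 *
 * Only the MATTR_CACHE case is walked explicitly above; other
 * attributes are passed straight to pmap_attribute().
 */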
16519 
16520 /*
16521  *	vm_map_behavior_set:
16522  *
16523  *	Sets the paging reference behavior of the specified address
16524  *	range in the target map.  Paging reference behavior affects
16525  *	how pagein operations resulting from faults on the map will be
16526  *	clustered.
16527  */
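/*
 * Hedged usage sketch (assumed standard MIG entry point, not part of
 * this file): user space reaches this through mach_vm_behavior_set(),
 * e.g. to hint that a buffer's pages may be reclaimed:
 *
 *	kern_return_t kr = mach_vm_behavior_set(mach_task_self(),
 *	    addr, size, VM_BEHAVIOR_REUSABLE);
 *
 * These hints are analogous to madvise(); the exact mapping from
 * madvise flags to VM_BEHAVIOR_* values is not defined in this file.
 */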
16528 kern_return_t
16529 vm_map_behavior_set(
16530 	vm_map_t        map,
16531 	vm_map_offset_t start,
16532 	vm_map_offset_t end,
16533 	vm_behavior_t   new_behavior)
16534 {
16535 	vm_map_entry_t  entry;
16536 	vm_map_entry_t  temp_entry;
16537 
16538 	if (start > end ||
16539 	    start < vm_map_min(map) ||
16540 	    end > vm_map_max(map)) {
16541 		return KERN_NO_SPACE;
16542 	}
16543 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16544 		return KERN_INVALID_ADDRESS;
16545 	}
16546 
16547 	switch (new_behavior) {
16548 	/*
16549 	 * This first block of behaviors all set a persistent state on the specified
16550 	 * memory range.  All we have to do here is to record the desired behavior
16551 	 * in the vm_map_entry_t's.
16552 	 */
16553 
16554 	case VM_BEHAVIOR_DEFAULT:
16555 	case VM_BEHAVIOR_RANDOM:
16556 	case VM_BEHAVIOR_SEQUENTIAL:
16557 	case VM_BEHAVIOR_RSEQNTL:
16558 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16559 		vm_map_lock(map);
16560 
16561 		/*
16562 		 *	The entire address range must be valid for the map.
16563 		 *      Note that vm_map_range_check() does a
16564 		 *	vm_map_lookup_entry() internally and returns the
16565 		 *	entry containing the start of the address range if
16566 		 *	the entire range is valid.
16567 		 */
16568 		if (vm_map_range_check(map, start, end, &temp_entry)) {
16569 			entry = temp_entry;
16570 			vm_map_clip_start(map, entry, start);
16571 		} else {
16572 			vm_map_unlock(map);
16573 			return KERN_INVALID_ADDRESS;
16574 		}
16575 
16576 		if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16577 			/* zeroing requires write access */
16578 			temp_entry = entry;
16579 			for (;
16580 			    entry != vm_map_to_entry(map) && (entry->vme_start < end);
16581 			    entry = entry->vme_next) {
16582 				if (!(entry->protection & VM_PROT_WRITE) ||
16583 #if __arm64e__
16584 				    entry->used_for_tpro ||
16585 #endif /* __arm64e__ */
16586 				    entry->used_for_jit) {
16587 					vm_map_unlock(map);
16588 					return KERN_PROTECTION_FAILURE;
16589 				}
16590 			}
16591 			entry = temp_entry;
16592 		}
16593 
16594 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16595 			vm_map_clip_end(map, entry, end);
16596 			if (entry->is_sub_map) {
16597 				assert(!entry->use_pmap);
16598 			}
16599 
16600 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16601 				assert(entry->protection & VM_PROT_WRITE);
16602 #if __arm64e__
16603 				assert(!entry->used_for_tpro);
16604 #endif /* __arm64e__ */
16605 				assert(!entry->used_for_jit);
16606 				entry->zero_wired_pages = TRUE;
16607 			} else {
16608 				entry->behavior = new_behavior;
16609 			}
16610 			entry = entry->vme_next;
16611 		}
16612 
16613 		vm_map_unlock(map);
16614 		break;
16615 
16616 	/*
16617 	 * The rest of these are different from the above in that they cause
16618 	 * an immediate action to take place as opposed to setting a behavior that
16619 	 * affects future actions.
16620 	 */
16621 
16622 	case VM_BEHAVIOR_WILLNEED:
16623 		return vm_map_willneed(map, start, end);
16624 
16625 	case VM_BEHAVIOR_DONTNEED:
16626 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16627 
16628 	case VM_BEHAVIOR_FREE:
16629 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16630 
16631 	case VM_BEHAVIOR_REUSABLE:
16632 		return vm_map_reusable_pages(map, start, end);
16633 
16634 	case VM_BEHAVIOR_REUSE:
16635 		return vm_map_reuse_pages(map, start, end);
16636 
16637 	case VM_BEHAVIOR_CAN_REUSE:
16638 		return vm_map_can_reuse(map, start, end);
16639 
16640 #if MACH_ASSERT
16641 	case VM_BEHAVIOR_PAGEOUT:
16642 		return vm_map_pageout(map, start, end);
16643 #endif /* MACH_ASSERT */
16644 
16645 	case VM_BEHAVIOR_ZERO:
16646 		return vm_map_zero(map, start, end);
16647 
16648 	default:
16649 		return KERN_INVALID_ARGUMENT;
16650 	}
16651 
16652 	return KERN_SUCCESS;
16653 }
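/*
 * Illustrative sketch (not part of xnu): the persistent behaviors handled
 * above are normally requested from user space through madvise(2) or the
 * Mach mach_vm_behavior_set() call, both of which funnel into
 * vm_map_behavior_set().  A minimal userspace caller, assuming the
 * standard <mach/mach_vm.h> and <mach/vm_behavior.h> interfaces, might
 * look like the following.
 */
#if 0   /* userspace example only; never compiled as part of this file */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_behavior.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	mach_vm_address_t addr = 0;
	mach_vm_size_t size = 16 * 1024 * 1024;
	kern_return_t kr;

	kr = mach_vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE);
	if (kr != KERN_SUCCESS) {
		return EXIT_FAILURE;
	}

	/* persistent hint: the range will be scanned sequentially */
	kr = mach_vm_behavior_set(mach_task_self(), addr, size,
	    VM_BEHAVIOR_SEQUENTIAL);
	printf("VM_BEHAVIOR_SEQUENTIAL -> %d\n", kr);

	/* ... linear scan of [addr, addr + size) would go here ... */

	/* restore the default clustering policy */
	(void)mach_vm_behavior_set(mach_task_self(), addr, size,
	    VM_BEHAVIOR_DEFAULT);

	(void)mach_vm_deallocate(mach_task_self(), addr, size);
	return EXIT_SUCCESS;
}
#endif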
16654 
16655 
16656 /*
16657  * Internals for madvise(MADV_WILLNEED) system call.
16658  *
16659  * The implementation is to:
16660  * a) issue read-ahead if the mapping corresponds to a mapped regular file,
16661  * b) or fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping.
16662  */
16663 static kern_return_t
16664 vm_map_willneed(
16665 	vm_map_t        map,
16666 	vm_map_offset_t start,
16667 	vm_map_offset_t end
16668 	)
16669 {
16670 	vm_map_entry_t entry;
16671 	kern_return_t kr;
16672 	vm_object_size_t len;
16673 	vm_size_t region_size;
16674 
16675 	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16676 	    start, end);
16677 	struct vm_object_fault_info fault_info = {
16678 		.interruptible = THREAD_UNINT,
16679 		.behavior = VM_BEHAVIOR_SEQUENTIAL,
16680 		/* Do not activate pages after faulting */
16681 		.stealth = true,
16682 		/* Don't wait for busy pages */
16683 		.fi_no_sleep = true,
16684 	};
16685 
16686 	/*
16687 	 * The MADV_WILLNEED operation doesn't require any changes to the
16688 	 * vm_map_entry_t's, so the read lock is sufficient.
16689 	 */
16690 
16691 	vm_map_lock_read(map);
16692 
16693 	/*
16694 	 * The madvise semantics require that the address range be fully
16695 	 * allocated with no holes.  Otherwise, we're required to return
16696 	 * an error.
16697 	 */
16698 
16699 	if (!vm_map_range_check(map, start, end, &entry)) {
16700 		vm_map_unlock_read(map);
16701 		kr = KERN_INVALID_ADDRESS;
16702 		goto done;
16703 	}
16704 
16705 	/*
16706 	 * Examine each vm_map_entry_t in the range.
16707 	 */
16708 	while (start < end) {
16709 		/*
16710 		 * Set the length so we don't go beyond the end of the
16711 		 * map_entry or beyond the end of the range we were given.
16712 		 * This range could also span multiple map entries, all of which
16713 		 * map different files, so make sure we only do the right amount
16714 		 * of I/O for each object.  Note that it's possible for there
16715 		 * to be multiple map entries all referring to the same object
16716 		 * but with different page permissions, but it's not worth
16717 		 * trying to optimize that case.
16718 		 */
16719 		len = MIN(entry->vme_end - start, end - start);
16720 
16721 		vm_map_offset_t addr = start;
16722 
16723 		vm_size_t effective_page_mask = MIN(vm_map_page_mask(map), PAGE_MASK);
16724 		vm_map_offset_t effective_page_size = effective_page_mask + 1;
16725 
16726 		/*
16727 		 * Write-fault if the entry supports it to preclude subsequent soft-faults
16728 		 */
16729 		vm_prot_t fault_prot = entry->protection & VM_PROT_WRITE ?
16730 		    VM_PROT_WRITE : VM_PROT_READ;
16731 
16732 		vm_map_unlock_read(map);
16733 
16734 		region_size = len;
16735 		while (region_size) {
16736 			/*
16737 			 * Provide a hint for how much clustering we would like. Note that
16738 			 * each individual fault will limit the size of each request to
16739 			 * MAX_UPL_TRANSFER_BYTES.
16740 			 */
16741 			fault_info.cluster_size = region_size;
16742 			kr = vm_pre_fault_with_info(
16743 				map,
16744 				vm_map_trunc_page(addr, effective_page_mask),
16745 				fault_prot,
16746 				&fault_info);
16747 			if (kr == KERN_ALREADY_WAITING) {
16748 				/*
16749 				 * The page is busy being faulted/paged by another thread.
16750 				 */
16751 				KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_NONE,
16752 				    task_pid(current_task()), addr, kr);
16753 				kr = KERN_SUCCESS;
16754 			} else if (kr != KERN_SUCCESS) {
16755 				goto done;
16756 			}
16757 			region_size -= effective_page_size;
16758 			addr += effective_page_size;
16759 		}
16760 
16761 		start += len;
16762 		if (start >= end) {
16763 			kr = KERN_SUCCESS;
16764 			goto done;
16765 		}
16766 
16767 		if (thread_should_abort(current_thread())) {
16768 			kr = KERN_ABORTED;
16769 			goto done;
16770 		}
16771 
16772 		/* look up next entry */
16773 		vm_map_lock_read(map);
16774 		if (!vm_map_lookup_entry(map, start, &entry)) {
16775 			/*
16776 			 * There's a new hole in the address range.
16777 			 */
16778 			vm_map_unlock_read(map);
16779 			kr = KERN_INVALID_ADDRESS;
16780 			goto done;
16781 		}
16782 	}
16783 
16784 	vm_map_unlock_read(map);
16785 done:
16786 	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16787 	    start, kr);
16788 	return kr;
16789 }
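/*
 * Illustrative sketch (not part of xnu): madvise(MADV_WILLNEED) on a file
 * mapping reaches vm_map_willneed() above, which pre-faults the range and
 * issues read-ahead.  A minimal userspace example using only standard
 * POSIX interfaces:
 */
#if 0   /* userspace example only; never compiled as part of this file */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return EXIT_FAILURE;
	}

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	struct stat st;
	if (fstat(fd, &st) != 0 || st.st_size == 0) {
		close(fd);
		return EXIT_FAILURE;
	}

	void *p = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return EXIT_FAILURE;
	}

	/* hint: the whole file will be needed soon; kick off read-ahead */
	if (madvise(p, (size_t)st.st_size, MADV_WILLNEED) != 0) {
		perror("madvise(MADV_WILLNEED)");
	}

	/* ... later accesses to p[] should mostly hit resident pages ... */

	munmap(p, (size_t)st.st_size);
	close(fd);
	return EXIT_SUCCESS;
}
#endif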
16790 
16791 static boolean_t
16792 vm_map_entry_is_reusable(
16793 	vm_map_entry_t entry)
16794 {
16795 	/* Only user map entries */
16796 
16797 	vm_object_t object;
16798 
16799 	if (entry->is_sub_map) {
16800 		return FALSE;
16801 	}
16802 
16803 	switch (VME_ALIAS(entry)) {
16804 	case VM_MEMORY_MALLOC:
16805 	case VM_MEMORY_MALLOC_SMALL:
16806 	case VM_MEMORY_MALLOC_LARGE:
16807 	case VM_MEMORY_REALLOC:
16808 	case VM_MEMORY_MALLOC_TINY:
16809 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16810 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16811 		/*
16812 		 * This is a malloc() memory region: check if it's still
16813 		 * in its original state and can be re-used for more
16814 		 * malloc() allocations.
16815 		 */
16816 		break;
16817 	default:
16818 		/*
16819 		 * Not a malloc() memory region: let the caller decide if
16820 		 * it's re-usable.
16821 		 */
16822 		return TRUE;
16823 	}
16824 
16825 	if (/*entry->is_shared ||*/
16826 		entry->is_sub_map ||
16827 		entry->in_transition ||
16828 		entry->protection != VM_PROT_DEFAULT ||
16829 		entry->max_protection != VM_PROT_ALL ||
16830 		entry->inheritance != VM_INHERIT_DEFAULT ||
16831 		entry->no_cache ||
16832 		entry->vme_permanent ||
16833 		entry->superpage_size != FALSE ||
16834 		entry->zero_wired_pages ||
16835 		entry->wired_count != 0 ||
16836 		entry->user_wired_count != 0) {
16837 		return FALSE;
16838 	}
16839 
16840 	object = VME_OBJECT(entry);
16841 	if (object == VM_OBJECT_NULL) {
16842 		return TRUE;
16843 	}
16844 	if (
16845 #if 0
16846 		/*
16847 		 * Let's proceed even if the VM object is potentially
16848 		 * shared.
16849 		 * We check for this later when processing the actual
16850 		 * VM pages, so the contents will be safe if shared.
16851 		 *
16852 		 * But we can still mark this memory region as "reusable" to
16853 		 * acknowledge that the caller did let us know that the memory
16854 		 * could be re-used and should not be penalized for holding
16855 		 * on to it.  This allows its "resident size" to not include
16856 		 * the reusable range.
16857 		 */
16858 		object->ref_count == 1 &&
16859 #endif
16860 		object->vo_copy == VM_OBJECT_NULL &&
16861 		object->shadow == VM_OBJECT_NULL &&
16862 		object->internal &&
16863 		object->purgable == VM_PURGABLE_DENY &&
16864 		HAS_DEFAULT_CACHEABILITY(object->wimg_bits & VM_WIMG_MASK) &&
16865 		!object->code_signed) {
16866 		return TRUE;
16867 	}
16868 	return FALSE;
16869 }
16870 
16871 static kern_return_t
16872 vm_map_reuse_pages(
16873 	vm_map_t        map,
16874 	vm_map_offset_t start,
16875 	vm_map_offset_t end)
16876 {
16877 	vm_map_entry_t                  entry;
16878 	vm_object_t                     object;
16879 	vm_object_offset_t              start_offset, end_offset;
16880 
16881 	/*
16882 	 * The MADV_FREE_REUSE operation doesn't require any changes to the
16883 	 * vm_map_entry_t's, so the read lock is sufficient.
16884 	 */
16885 
16886 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16887 		/*
16888 		 * XXX TODO4K
16889 		 * need to figure out what reusable means for a
16890 		 * portion of a native page.
16891 		 */
16892 		return KERN_SUCCESS;
16893 	}
16894 
16895 	vm_map_lock_read(map);
16896 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16897 
16898 	/*
16899 	 * The madvise semantics require that the address range be fully
16900 	 * allocated with no holes.  Otherwise, we're required to return
16901 	 * an error.
16902 	 */
16903 
16904 	if (!vm_map_range_check(map, start, end, &entry)) {
16905 		vm_map_unlock_read(map);
16906 		vm_page_stats_reusable.reuse_pages_failure++;
16907 		return KERN_INVALID_ADDRESS;
16908 	}
16909 
16910 	/*
16911 	 * Examine each vm_map_entry_t in the range.
16912 	 */
16913 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16914 	    entry = entry->vme_next) {
16915 		/*
16916 		 * Sanity check on the VM map entry.
16917 		 */
16918 		if (!vm_map_entry_is_reusable(entry)) {
16919 			vm_map_unlock_read(map);
16920 			vm_page_stats_reusable.reuse_pages_failure++;
16921 			return KERN_INVALID_ADDRESS;
16922 		}
16923 
16924 		/*
16925 		 * The first time through, the start address could be anywhere
16926 		 * within the vm_map_entry we found.  So adjust the offset to
16927 		 * correspond.
16928 		 */
16929 		if (entry->vme_start < start) {
16930 			start_offset = start - entry->vme_start;
16931 		} else {
16932 			start_offset = 0;
16933 		}
16934 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16935 		start_offset += VME_OFFSET(entry);
16936 		end_offset += VME_OFFSET(entry);
16937 
16938 		object = VME_OBJECT(entry);
16939 		if (object != VM_OBJECT_NULL) {
16940 			vm_object_lock(object);
16941 			vm_object_reuse_pages(object, start_offset, end_offset,
16942 			    TRUE);
16943 			vm_object_unlock(object);
16944 		}
16945 
16946 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16947 			/*
16948 			 * XXX
16949 			 * We do not hold the VM map exclusively here.
16950 			 * The "alias" field is not that critical, so it's
16951 			 * safe to update it here, as long as it is the only
16952 			 * one that can be modified while holding the VM map
16953 			 * "shared".
16954 			 */
16955 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16956 		}
16957 	}
16958 
16959 	vm_map_unlock_read(map);
16960 	vm_page_stats_reusable.reuse_pages_success++;
16961 	return KERN_SUCCESS;
16962 }
16963 
16964 
16965 static kern_return_t
16966 vm_map_reusable_pages(
16967 	vm_map_t        map,
16968 	vm_map_offset_t start,
16969 	vm_map_offset_t end)
16970 {
16971 	vm_map_entry_t                  entry;
16972 	vm_object_t                     object;
16973 	vm_object_offset_t              start_offset, end_offset;
16974 	vm_map_offset_t                 pmap_offset;
16975 
16976 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16977 		/*
16978 		 * XXX TODO4K
16979 		 * need to figure out what reusable means for a portion
16980 		 * of a native page.
16981 		 */
16982 		return KERN_SUCCESS;
16983 	}
16984 
16985 	/*
16986 	 * The MADV_FREE_REUSABLE operation doesn't require any changes to the
16987 	 * vm_map_entry_t's, so the read lock is sufficient.
16988 	 */
16989 
16990 	vm_map_lock_read(map);
16991 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16992 
16993 	/*
16994 	 * The madvise semantics require that the address range be fully
16995 	 * allocated with no holes.  Otherwise, we're required to return
16996 	 * an error.
16997 	 */
16998 
16999 	if (!vm_map_range_check(map, start, end, &entry)) {
17000 		vm_map_unlock_read(map);
17001 		vm_page_stats_reusable.reusable_pages_failure++;
17002 		return KERN_INVALID_ADDRESS;
17003 	}
17004 
17005 	/*
17006 	 * Examine each vm_map_entry_t in the range.
17007 	 */
17008 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17009 	    entry = entry->vme_next) {
17010 		int kill_pages = 0;
17011 		boolean_t kill_no_write = FALSE;
17012 
17013 		/*
17014 		 * Sanity check on the VM map entry.
17015 		 */
17016 		if (!vm_map_entry_is_reusable(entry)) {
17017 			vm_map_unlock_read(map);
17018 			vm_page_stats_reusable.reusable_pages_failure++;
17019 			return KERN_INVALID_ADDRESS;
17020 		}
17021 
17022 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
17023 #if __arm64e__
17024 		    && !entry->used_for_tpro
17025 #endif
17026 		    ) {
17027 			/* not writable: can't discard contents */
17028 			vm_map_unlock_read(map);
17029 			vm_page_stats_reusable.reusable_nonwritable++;
17030 			vm_page_stats_reusable.reusable_pages_failure++;
17031 			return KERN_PROTECTION_FAILURE;
17032 		}
17033 
17034 		/*
17035 		 * The first time through, the start address could be anywhere
17036 		 * within the vm_map_entry we found.  So adjust the offset to
17037 		 * correspond.
17038 		 */
17039 		if (entry->vme_start < start) {
17040 			start_offset = start - entry->vme_start;
17041 			pmap_offset = start;
17042 		} else {
17043 			start_offset = 0;
17044 			pmap_offset = entry->vme_start;
17045 		}
17046 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17047 		start_offset += VME_OFFSET(entry);
17048 		end_offset += VME_OFFSET(entry);
17049 
17050 		object = VME_OBJECT(entry);
17051 		if (object == VM_OBJECT_NULL) {
17052 			continue;
17053 		}
17054 
17055 		if ((entry->protection & VM_PROT_EXECUTE) ||
17056 		    entry->vme_xnu_user_debug) {
17057 			/*
17058 			 * Executable or user debug pages might be write-protected by
17059 			 * hardware, so do not attempt to write to these pages.
17060 			 */
17061 			kill_no_write = TRUE;
17062 		}
17063 
17064 		vm_object_lock(object);
17065 		if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
17066 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
17067 		    object->vo_copy == VM_OBJECT_NULL)) &&
17068 		    object->shadow == VM_OBJECT_NULL &&
17069 		    /*
17070 		     * "iokit_acct" entries are billed for their virtual size
17071 		     * (rather than for their resident pages only), so they
17072 		     * wouldn't benefit from making pages reusable, and it
17073 		     * would be hard to keep track of pages that are both
17074 		     * "iokit_acct" and "reusable" in the pmap stats and
17075 		     * ledgers.
17076 		     */
17077 		    !(entry->iokit_acct ||
17078 		    (!entry->is_sub_map && !entry->use_pmap))) {
17079 			if (os_ref_get_count_raw(&object->ref_count) != 1) {
17080 				vm_page_stats_reusable.reusable_shared++;
17081 			}
17082 			kill_pages = 1;
17083 		} else {
17084 			kill_pages = -1;
17085 		}
17086 		if (kill_pages != -1) {
17087 			vm_object_deactivate_pages(object,
17088 			    start_offset,
17089 			    end_offset - start_offset,
17090 			    kill_pages,
17091 			    TRUE /*reusable_pages*/,
17092 			    kill_no_write,
17093 			    map->pmap,
17094 			    pmap_offset);
17095 		} else {
17096 			vm_page_stats_reusable.reusable_pages_shared++;
17097 			DTRACE_VM4(vm_map_reusable_pages_shared,
17098 			    unsigned int, VME_ALIAS(entry),
17099 			    vm_map_t, map,
17100 			    vm_map_entry_t, entry,
17101 			    vm_object_t, object);
17102 		}
17103 		vm_object_unlock(object);
17104 
17105 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
17106 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
17107 			/*
17108 			 * XXX
17109 			 * We do not hold the VM map exclusively here.
17110 			 * The "alias" field is not that critical, so it's
17111 			 * safe to update it here, as long as it is the only
17112 			 * one that can be modified while holding the VM map
17113 			 * "shared".
17114 			 */
17115 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
17116 		}
17117 	}
17118 
17119 	vm_map_unlock_read(map);
17120 	vm_page_stats_reusable.reusable_pages_success++;
17121 	return KERN_SUCCESS;
17122 }
17123 
17124 
17125 static kern_return_t
17126 vm_map_can_reuse(
17127 	vm_map_t        map,
17128 	vm_map_offset_t start,
17129 	vm_map_offset_t end)
17130 {
17131 	vm_map_entry_t                  entry;
17132 
17133 	/*
17134 	 * The MADV_CAN_REUSE operation doesn't require any changes to the
17135 	 * vm_map_entry_t's, so the read lock is sufficient.
17136 	 */
17137 
17138 	vm_map_lock_read(map);
17139 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17140 
17141 	/*
17142 	 * The madvise semantics require that the address range be fully
17143 	 * allocated with no holes.  Otherwise, we're required to return
17144 	 * an error.
17145 	 */
17146 
17147 	if (!vm_map_range_check(map, start, end, &entry)) {
17148 		vm_map_unlock_read(map);
17149 		vm_page_stats_reusable.can_reuse_failure++;
17150 		return KERN_INVALID_ADDRESS;
17151 	}
17152 
17153 	/*
17154 	 * Examine each vm_map_entry_t in the range.
17155 	 */
17156 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17157 	    entry = entry->vme_next) {
17158 		/*
17159 		 * Sanity check on the VM map entry.
17160 		 */
17161 		if (!vm_map_entry_is_reusable(entry)) {
17162 			vm_map_unlock_read(map);
17163 			vm_page_stats_reusable.can_reuse_failure++;
17164 			return KERN_INVALID_ADDRESS;
17165 		}
17166 	}
17167 
17168 	vm_map_unlock_read(map);
17169 	vm_page_stats_reusable.can_reuse_success++;
17170 	return KERN_SUCCESS;
17171 }
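/*
 * Illustrative sketch (not part of xnu): the three routines above back the
 * Darwin-specific madvise(2) advice values MADV_FREE_REUSABLE,
 * MADV_FREE_REUSE and MADV_CAN_REUSE (declared in <sys/mman.h>), which
 * allocators use to mark large free blocks as discardable without
 * unmapping them.  A minimal sketch of that protocol:
 */
#if 0   /* userspace example only; never compiled as part of this file */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	size_t sz = 2 * 1024 * 1024;
	char *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		return EXIT_FAILURE;
	}

	memset(p, 0xaa, sz);                    /* region is "in use" */

	/* probe whether this region qualifies for the reuse protocol */
	if (madvise(p, sz, MADV_CAN_REUSE) != 0) {
		perror("madvise(MADV_CAN_REUSE)");
	}

	/*
	 * Done with the block for now: contents may be discarded and the
	 * pages stop counting against the task's resident footprint
	 * (vm_map_reusable_pages() above).
	 */
	if (madvise(p, sz, MADV_FREE_REUSABLE) != 0) {
		perror("madvise(MADV_FREE_REUSABLE)");
	}

	/* about to reuse the block: re-arm normal accounting first */
	if (madvise(p, sz, MADV_FREE_REUSE) != 0) {
		perror("madvise(MADV_FREE_REUSE)");
	}
	memset(p, 0x55, sz);                    /* contents may have been discarded */

	munmap(p, sz);
	return EXIT_SUCCESS;
}
#endif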
17172 
17173 
17174 #if MACH_ASSERT
17175 static kern_return_t
17176 vm_map_pageout(
17177 	vm_map_t        map,
17178 	vm_map_offset_t start,
17179 	vm_map_offset_t end)
17180 {
17181 	vm_map_entry_t                  entry;
17182 
17183 	/*
17184 	 * The MADV_PAGEOUT operation doesn't require any changes to the
17185 	 * vm_map_entry_t's, so the read lock is sufficient.
17186 	 */
17187 
17188 	vm_map_lock_read(map);
17189 
17190 	/*
17191 	 * The madvise semantics require that the address range be fully
17192 	 * allocated with no holes.  Otherwise, we're required to return
17193 	 * an error.
17194 	 */
17195 
17196 	if (!vm_map_range_check(map, start, end, &entry)) {
17197 		vm_map_unlock_read(map);
17198 		return KERN_INVALID_ADDRESS;
17199 	}
17200 
17201 	/*
17202 	 * Examine each vm_map_entry_t in the range.
17203 	 */
17204 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17205 	    entry = entry->vme_next) {
17206 		vm_object_t     object;
17207 
17208 		/*
17209 		 * Sanity check on the VM map entry.
17210 		 */
17211 		if (entry->is_sub_map) {
17212 			vm_map_t submap;
17213 			vm_map_offset_t submap_start;
17214 			vm_map_offset_t submap_end;
17215 			vm_map_entry_t submap_entry;
17216 
17217 			submap = VME_SUBMAP(entry);
17218 			submap_start = VME_OFFSET(entry);
17219 			submap_end = submap_start + (entry->vme_end -
17220 			    entry->vme_start);
17221 
17222 			vm_map_lock_read(submap);
17223 
17224 			if (!vm_map_range_check(submap,
17225 			    submap_start,
17226 			    submap_end,
17227 			    &submap_entry)) {
17228 				vm_map_unlock_read(submap);
17229 				vm_map_unlock_read(map);
17230 				return KERN_INVALID_ADDRESS;
17231 			}
17232 
17233 			if (submap_entry->is_sub_map) {
17234 				vm_map_unlock_read(submap);
17235 				continue;
17236 			}
17237 
17238 			object = VME_OBJECT(submap_entry);
17239 			if (object == VM_OBJECT_NULL || !object->internal) {
17240 				vm_map_unlock_read(submap);
17241 				continue;
17242 			}
17243 
17244 			vm_object_pageout(object);
17245 
17246 			vm_map_unlock_read(submap);
17247 			submap = VM_MAP_NULL;
17248 			submap_entry = VM_MAP_ENTRY_NULL;
17249 			continue;
17250 		}
17251 
17252 		object = VME_OBJECT(entry);
17253 		if (object == VM_OBJECT_NULL || !object->internal) {
17254 			continue;
17255 		}
17256 
17257 		vm_object_pageout(object);
17258 	}
17259 
17260 	vm_map_unlock_read(map);
17261 	return KERN_SUCCESS;
17262 }
17263 #endif /* MACH_ASSERT */
17264 
17265 /*
17266  * This function determines if the zero operation can be run on the
17267  * respective entry. Additional checks on the object are in
17268  * vm_object_zero_preflight.
17269  */
17270 static kern_return_t
17271 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17272 {
17273 	/*
17274 	 * Zeroing is restricted to writable non-executable entries and non-JIT
17275 	 * regions.
17276 	 */
17277 	if (!(entry->protection & VM_PROT_WRITE) ||
17278 	    (entry->protection & VM_PROT_EXECUTE) ||
17279 	    entry->used_for_jit ||
17280 	    entry->vme_xnu_user_debug) {
17281 		return KERN_PROTECTION_FAILURE;
17282 	}
17283 
17284 	/*
17285 	 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17286 	 * allowed for submaps.
17287 	 */
17288 	if (entry->needs_copy || entry->is_sub_map) {
17289 		return KERN_NO_ACCESS;
17290 	}
17291 
17292 	return KERN_SUCCESS;
17293 }
17294 
17295 /*
17296  * This function translates entry's start and end to offsets in the object
17297  * This function translates the entry's start and end addresses to offsets in the backing object.
17298 static void
17299 vm_map_get_bounds_in_object(
17300 	vm_map_entry_t      entry,
17301 	vm_map_offset_t     start,
17302 	vm_map_offset_t     end,
17303 	vm_map_offset_t    *start_offset,
17304 	vm_map_offset_t    *end_offset)
17305 {
17306 	if (entry->vme_start < start) {
17307 		*start_offset = start - entry->vme_start;
17308 	} else {
17309 		*start_offset = 0;
17310 	}
17311 	*end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17312 	*start_offset += VME_OFFSET(entry);
17313 	*end_offset += VME_OFFSET(entry);
17314 }
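/*
 * Worked example of the arithmetic above (illustrative values only):
 * for an entry mapping [vme_start = 0x10000, vme_end = 0x20000) with
 * VME_OFFSET(entry) = 0x4000, and a requested range
 * [start = 0x14000, end = 0x30000):
 *
 *   *start_offset = (0x14000 - 0x10000) + 0x4000 = 0x8000
 *   *end_offset   = (0x20000 - 0x10000) + 0x4000 = 0x14000
 *
 * i.e. the portion of the request covered by this entry corresponds to
 * object offsets [0x8000, 0x14000).
 */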
17315 
17316 /*
17317  * This function iterates through the entries in the requested range
17318  * and zeroes any resident pages in the corresponding objects. Compressed
17319  * pages are dropped instead of being faulted in and zeroed.
17320  */
17321 static kern_return_t
17322 vm_map_zero(
17323 	vm_map_t        map,
17324 	vm_map_offset_t start,
17325 	vm_map_offset_t end)
17326 {
17327 	vm_map_entry_t                  entry;
17328 	vm_map_offset_t                 cur = start;
17329 	kern_return_t                   ret;
17330 
17331 	/*
17332 	 * This operation isn't supported where the map page size is less than
17333 	 * the hardware page size. The caller will need to handle the error and
17334 	 * explicitly zero memory if needed.
17335 	 */
17336 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17337 		return KERN_NO_ACCESS;
17338 	}
17339 
17340 	/*
17341 	 * The MADV_ZERO operation doesn't require any changes to the
17342 	 * vm_map_entry_t's, so the read lock is sufficient.
17343 	 */
17344 	vm_map_lock_read(map);
17345 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17346 
17347 	/*
17348 	 * The madvise semantics require that the address range be fully
17349 	 * allocated with no holes. Otherwise, we're required to return
17350 	 * an error. This check needs to be redone if the map has changed.
17351 	 */
17352 	if (!vm_map_range_check(map, cur, end, &entry)) {
17353 		vm_map_unlock_read(map);
17354 		return KERN_INVALID_ADDRESS;
17355 	}
17356 
17357 	/*
17358 	 * Examine each vm_map_entry_t in the range.
17359 	 */
17360 	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17361 		vm_map_offset_t cur_offset;
17362 		vm_map_offset_t end_offset;
17363 		unsigned int last_timestamp = map->timestamp;
17364 		vm_object_t object = VME_OBJECT(entry);
17365 
17366 		ret = vm_map_zero_entry_preflight(entry);
17367 		if (ret != KERN_SUCCESS) {
17368 			vm_map_unlock_read(map);
17369 			return ret;
17370 		}
17371 
17372 		if (object == VM_OBJECT_NULL) {
17373 			entry = entry->vme_next;
17374 			continue;
17375 		}
17376 
17377 		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17378 		vm_object_lock(object);
17379 		/*
17380 		 * Take a reference on the object as vm_object_zero will drop the object
17381 		 * lock when it encounters a busy page.
17382 		 */
17383 		vm_object_reference_locked(object);
17384 		vm_map_unlock_read(map);
17385 
17386 		ret = vm_object_zero(object, cur_offset, end_offset);
17387 		vm_object_unlock(object);
17388 		vm_object_deallocate(object);
17389 		if (ret != KERN_SUCCESS) {
17390 			return ret;
17391 		}
17392 		/*
17393 		 * Update cur as vm_object_zero has succeeded.
17394 		 */
17395 		cur += (end_offset - cur_offset);
17396 		if (cur == end) {
17397 			return KERN_SUCCESS;
17398 		}
17399 
17400 		/*
17401 		 * If the map timestamp has changed, restart by relooking up cur in the
17402 		 * map
17403 		 */
17404 		vm_map_lock_read(map);
17405 		if (last_timestamp != map->timestamp) {
17406 			/*
17407 			 * Relookup cur in the map
17408 			 */
17409 			if (!vm_map_range_check(map, cur, end, &entry)) {
17410 				vm_map_unlock_read(map);
17411 				return KERN_INVALID_ADDRESS;
17412 			}
17413 			continue;
17414 		}
17415 		/*
17416 		 * If the map hasn't changed proceed with the next entry
17417 		 */
17418 		entry = entry->vme_next;
17419 	}
17420 
17421 	vm_map_unlock_read(map);
17422 	return KERN_SUCCESS;
17423 }
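/*
 * Illustrative sketch (not part of xnu): VM_BEHAVIOR_ZERO is reached from
 * user space via mach_vm_behavior_set() (see vm_map_behavior_set() above).
 * Assuming the SDK's <mach/vm_behavior.h> exposes VM_BEHAVIOR_ZERO, a
 * caller might try this fast path and fall back to zeroing by hand when
 * the operation is not supported (e.g. KERN_NO_ACCESS for sub-page-size
 * maps), as the comment at the top of vm_map_zero() requires.
 */
#if 0   /* userspace example only; never compiled as part of this file */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_behavior.h>
#include <stdint.h>
#include <string.h>

static void
zero_range(void *ptr, size_t len)
{
	kern_return_t kr;

	kr = mach_vm_behavior_set(mach_task_self(),
	    (mach_vm_address_t)(uintptr_t)ptr,
	    (mach_vm_size_t)len,
	    VM_BEHAVIOR_ZERO);
	if (kr != KERN_SUCCESS) {
		/* fall back: touch and zero every page ourselves */
		memset(ptr, 0, len);
	}
}
#endif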
17424 
17425 
17426 /*
17427  *	Routine:	vm_map_entry_insert
17428  *
17429  *	Description:	This routine inserts a new vm_entry in a locked map.
17430  */
17431 static vm_map_entry_t
17432 vm_map_entry_insert(
17433 	vm_map_t                map,
17434 	vm_map_entry_t          insp_entry,
17435 	vm_map_offset_t         start,
17436 	vm_map_offset_t         end,
17437 	vm_object_t             object,
17438 	vm_object_offset_t      offset,
17439 	vm_map_kernel_flags_t   vmk_flags,
17440 	boolean_t               needs_copy,
17441 	vm_prot_t               cur_protection,
17442 	vm_prot_t               max_protection,
17443 	vm_inherit_t            inheritance,
17444 	boolean_t               clear_map_aligned)
17445 {
17446 	vm_map_entry_t  new_entry;
17447 	boolean_t map_aligned = FALSE;
17448 
17449 	assert(insp_entry != (vm_map_entry_t)0);
17450 	vm_map_lock_assert_exclusive(map);
17451 
17452 	__assert_only vm_object_offset_t      end_offset = 0;
17453 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17454 
17455 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17456 		map_aligned = TRUE;
17457 	}
17458 	if (clear_map_aligned &&
17459 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17460 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17461 		map_aligned = FALSE;
17462 	}
17463 	if (map_aligned) {
17464 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17465 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17466 	} else {
17467 		assert(page_aligned(start));
17468 		assert(page_aligned(end));
17469 	}
17470 	assert(start < end);
17471 
17472 	new_entry = vm_map_entry_create(map);
17473 
17474 	new_entry->vme_start = start;
17475 	new_entry->vme_end = end;
17476 
17477 	if (vmk_flags.vmkf_submap) {
17478 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17479 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17480 	} else {
17481 		VME_OBJECT_SET(new_entry, object, false, 0);
17482 	}
17483 	VME_OFFSET_SET(new_entry, offset);
17484 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17485 
17486 	new_entry->map_aligned = map_aligned;
17487 	new_entry->needs_copy = needs_copy;
17488 	new_entry->inheritance = inheritance;
17489 	new_entry->protection = cur_protection;
17490 	new_entry->max_protection = max_protection;
17491 	/*
17492 	 * submap: "use_pmap" means "nested".
17493 	 * default: false.
17494 	 *
17495 	 * object: "use_pmap" means "use pmap accounting" for footprint.
17496 	 * default: true.
17497 	 */
17498 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
17499 	new_entry->no_cache = vmk_flags.vmf_no_cache;
17500 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
17501 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17502 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17503 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17504 
17505 	if (vmk_flags.vmkf_map_jit) {
17506 		if (!(map->jit_entry_exists) ||
17507 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17508 			new_entry->used_for_jit = TRUE;
17509 			map->jit_entry_exists = TRUE;
17510 		}
17511 	}
17512 
17513 	/*
17514 	 *	Insert the new entry into the list.
17515 	 */
17516 
17517 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17518 	map->size += end - start;
17519 
17520 	/*
17521 	 *	Update the free space hint and the lookup hint.
17522 	 */
17523 
17524 	SAVE_HINT_MAP_WRITE(map, new_entry);
17525 	return new_entry;
17526 }
17527 
17528 /*
17529  *	Routine:	vm_map_remap_extract
17530  *
17531  *	Description:	This routine returns a vm_entry list from a map.
17532  */
17533 static kern_return_t
17534 vm_map_remap_extract(
17535 	vm_map_t                map,
17536 	vm_map_offset_t         addr,
17537 	vm_map_size_t           size,
17538 	boolean_t               copy,
17539 	vm_map_copy_t           map_copy,
17540 	vm_prot_t               *cur_protection,   /* IN/OUT */
17541 	vm_prot_t               *max_protection,   /* IN/OUT */
17542 	/* What, no behavior? */
17543 	vm_inherit_t            inheritance,
17544 	vm_map_kernel_flags_t   vmk_flags)
17545 {
17546 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17547 	kern_return_t           result;
17548 	vm_map_size_t           mapped_size;
17549 	vm_map_size_t           tmp_size;
17550 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17551 	vm_map_entry_t          new_entry;
17552 	vm_object_offset_t      offset;
17553 	vm_map_offset_t         map_address;
17554 	vm_map_offset_t         src_start;     /* start of entry to map */
17555 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17556 	vm_object_t             object;
17557 	vm_map_version_t        version;
17558 	boolean_t               src_needs_copy;
17559 	boolean_t               new_entry_needs_copy;
17560 	vm_map_entry_t          saved_src_entry;
17561 	boolean_t               src_entry_was_wired;
17562 	vm_prot_t               max_prot_for_prot_copy;
17563 	vm_map_offset_t         effective_page_mask;
17564 	bool                    pageable, same_map;
17565 	boolean_t               vm_remap_legacy;
17566 	vm_prot_t               required_cur_prot, required_max_prot;
17567 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17568 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17569 
17570 	pageable = vmk_flags.vmkf_copy_pageable;
17571 	same_map = vmk_flags.vmkf_copy_same_map;
17572 
17573 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17574 
17575 	assert(map != VM_MAP_NULL);
17576 	assert(size != 0);
17577 	assert(size == vm_map_round_page(size, effective_page_mask));
17578 	assert(inheritance == VM_INHERIT_NONE ||
17579 	    inheritance == VM_INHERIT_COPY ||
17580 	    inheritance == VM_INHERIT_SHARE);
17581 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17582 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17583 	assert((*cur_protection & *max_protection) == *cur_protection);
17584 
17585 	/*
17586 	 *	Compute start and end of region.
17587 	 */
17588 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17589 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17590 
17591 	/*
17592 	 *	Initialize map_header.
17593 	 */
17594 	map_header->nentries = 0;
17595 	map_header->entries_pageable = pageable;
17596 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17597 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17598 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17599 	vm_map_store_init(map_header);
17600 
17601 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17602 		/*
17603 		 * Special case for vm_map_protect(VM_PROT_COPY):
17604 		 * we want to set the new mappings' max protection to the
17605 		 * specified *max_protection...
17606 		 */
17607 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17608 		/* ... but we want to use the vm_remap() legacy mode */
17609 		vmk_flags.vmkf_remap_legacy_mode = true;
17610 		*max_protection = VM_PROT_NONE;
17611 		*cur_protection = VM_PROT_NONE;
17612 	} else {
17613 		max_prot_for_prot_copy = VM_PROT_NONE;
17614 	}
17615 
17616 	if (vmk_flags.vmkf_remap_legacy_mode) {
17617 		/*
17618 		 * vm_remap() legacy mode:
17619 		 * Extract all memory regions in the specified range and
17620 		 * collect the strictest set of protections allowed on the
17621 		 * entire range, so the caller knows what they can do with
17622 		 * the remapped range.
17623 		 * We start with VM_PROT_ALL and we'll remove the protections
17624 		 * missing from each memory region.
17625 		 */
17626 		vm_remap_legacy = TRUE;
17627 		*cur_protection = VM_PROT_ALL;
17628 		*max_protection = VM_PROT_ALL;
17629 		required_cur_prot = VM_PROT_NONE;
17630 		required_max_prot = VM_PROT_NONE;
17631 	} else {
17632 		/*
17633 		 * vm_remap_new() mode:
17634 		 * Extract all memory regions in the specified range and
17635 		 * ensure that they have at least the protections specified
17636 		 * by the caller via *cur_protection and *max_protection.
17637 		 * The resulting mapping should have these protections.
17638 		 */
17639 		vm_remap_legacy = FALSE;
17640 		if (copy) {
17641 			required_cur_prot = VM_PROT_NONE;
17642 			required_max_prot = VM_PROT_READ;
17643 		} else {
17644 			required_cur_prot = *cur_protection;
17645 			required_max_prot = *max_protection;
17646 		}
17647 	}
17648 
17649 	map_address = 0;
17650 	mapped_size = 0;
17651 	result = KERN_SUCCESS;
17652 
17653 	/*
17654 	 *	The specified source virtual space might correspond to
17655 	 *	multiple map entries, need to loop on them.
17656 	 */
17657 	vm_map_lock(map);
17658 
17659 	if (map->pmap == kernel_pmap) {
17660 		map_copy->is_kernel_range = true;
17661 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17662 #if CONFIG_MAP_RANGES
17663 	} else if (map->uses_user_ranges) {
17664 		map_copy->is_user_range = true;
17665 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17666 #endif /* CONFIG_MAP_RANGES */
17667 	}
17668 
17669 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17670 		/*
17671 		 * This address space uses sub-pages so the range might
17672 		 * not be re-mappable in an address space with larger
17673 		 * pages. Re-assemble any broken-up VM map entries to
17674 		 * improve our chances of making it work.
17675 		 */
17676 		vm_map_simplify_range(map, src_start, src_end);
17677 	}
17678 	while (mapped_size != size) {
17679 		vm_map_size_t   entry_size;
17680 
17681 		/*
17682 		 *	Find the beginning of the region.
17683 		 */
17684 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17685 			result = KERN_INVALID_ADDRESS;
17686 			break;
17687 		}
17688 
17689 		if (src_start < src_entry->vme_start ||
17690 		    (mapped_size && src_start != src_entry->vme_start)) {
17691 			result = KERN_INVALID_ADDRESS;
17692 			break;
17693 		}
17694 
17695 		tmp_size = size - mapped_size;
17696 		if (src_end > src_entry->vme_end) {
17697 			tmp_size -= (src_end - src_entry->vme_end);
17698 		}
17699 
17700 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17701 		    src_entry->vme_start);
17702 
17703 		if (src_entry->is_sub_map &&
17704 		    vmk_flags.vmkf_copy_single_object) {
17705 			vm_map_t submap;
17706 			vm_map_offset_t submap_start;
17707 			vm_map_size_t submap_size;
17708 			boolean_t submap_needs_copy;
17709 
17710 			/*
17711 			 * No check for "required protection" on "src_entry"
17712 			 * because the protections that matter are the ones
17713 			 * on the submap's VM map entry, which will be checked
17714 			 * during the call to vm_map_remap_extract() below.
17715 			 */
17716 			object = VM_OBJECT_NULL;
17717 
17718 			submap_size = src_entry->vme_end - src_start;
17719 			if (submap_size > size) {
17720 				submap_size = size;
17721 			}
17722 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17723 			submap = VME_SUBMAP(src_entry);
17724 			if (copy) {
17725 				/*
17726 				 * The caller wants a copy-on-write re-mapping,
17727 				 * so let's extract from the submap accordingly.
17728 				 */
17729 				submap_needs_copy = TRUE;
17730 			} else if (src_entry->needs_copy) {
17731 				/*
17732 				 * The caller wants a shared re-mapping but the
17733 				 * submap is mapped with "needs_copy", so its
17734 				 * contents can't be shared as is. Extract the
17735 				 * contents of the submap as "copy-on-write".
17736 				 * The re-mapping won't be shared with the
17737 				 * original mapping but this is equivalent to
17738 				 * what happened with the original "remap from
17739 				 * submap" code.
17740 				 * The shared region is mapped "needs_copy", for
17741 				 * example.
17742 				 */
17743 				submap_needs_copy = TRUE;
17744 			} else {
17745 				/*
17746 				 * The caller wants a shared re-mapping and
17747 				 * this mapping can be shared (no "needs_copy"),
17748 				 * so let's extract from the submap accordingly.
17749 				 * Kernel submaps are mapped without
17750 				 * "needs_copy", for example.
17751 				 */
17752 				submap_needs_copy = FALSE;
17753 			}
17754 			vm_map_reference(submap);
17755 			vm_map_unlock(map);
17756 			src_entry = NULL;
17757 			if (vm_remap_legacy) {
17758 				*cur_protection = VM_PROT_NONE;
17759 				*max_protection = VM_PROT_NONE;
17760 			}
17761 
17762 			DTRACE_VM7(remap_submap_recurse,
17763 			    vm_map_t, map,
17764 			    vm_map_offset_t, addr,
17765 			    vm_map_size_t, size,
17766 			    boolean_t, copy,
17767 			    vm_map_offset_t, submap_start,
17768 			    vm_map_size_t, submap_size,
17769 			    boolean_t, submap_needs_copy);
17770 
17771 			result = vm_map_remap_extract(submap,
17772 			    submap_start,
17773 			    submap_size,
17774 			    submap_needs_copy,
17775 			    map_copy,
17776 			    cur_protection,
17777 			    max_protection,
17778 			    inheritance,
17779 			    vmk_flags);
17780 			vm_map_deallocate(submap);
17781 
17782 			if (result == KERN_SUCCESS &&
17783 			    submap_needs_copy &&
17784 			    !copy) {
17785 				/*
17786 				 * We were asked for a "shared"
17787 				 * re-mapping but had to ask for a
17788 				 * "copy-on-write" remapping of the
17789 				 * submap's mapping to honor the
17790 				 * submap's "needs_copy".
17791 				 * We now need to resolve that
17792 				 * pending "copy-on-write" to
17793 				 * get something we can share.
17794 				 */
17795 				vm_map_entry_t copy_entry;
17796 				vm_object_offset_t copy_offset;
17797 				vm_map_size_t copy_size;
17798 				vm_object_t copy_object;
17799 				copy_entry = vm_map_copy_first_entry(map_copy);
17800 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17801 				copy_object = VME_OBJECT(copy_entry);
17802 				copy_offset = VME_OFFSET(copy_entry);
17803 				if (copy_object == VM_OBJECT_NULL) {
17804 					assert(copy_offset == 0);
17805 					assert(!copy_entry->needs_copy);
17806 					if (copy_entry->max_protection == VM_PROT_NONE) {
17807 						assert(copy_entry->protection == VM_PROT_NONE);
17808 						/* nothing to share */
17809 					} else {
17810 						assert(copy_offset == 0);
17811 						copy_object = vm_object_allocate(copy_size, submap->serial_id);
17812 						VME_OFFSET_SET(copy_entry, 0);
17813 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17814 						assert(copy_entry->use_pmap);
17815 					}
17816 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17817 					/* already shareable */
17818 					assert(!copy_entry->needs_copy);
17819 				} else if (copy_entry->needs_copy ||
17820 				    copy_object->shadowed ||
17821 				    (copy_object->internal &&
17822 				    !copy_object->true_share &&
17823 				    !copy_entry->is_shared &&
17824 				    copy_object->vo_size > copy_size)) {
17825 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17826 					assert(copy_entry->use_pmap);
17827 					if (copy_entry->needs_copy) {
17828 						/* already write-protected */
17829 					} else {
17830 						vm_prot_t prot;
17831 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17832 						vm_object_pmap_protect(copy_object,
17833 						    copy_offset,
17834 						    copy_size,
17835 						    PMAP_NULL,
17836 						    PAGE_SIZE,
17837 						    0,
17838 						    prot);
17839 					}
17840 					copy_entry->needs_copy = FALSE;
17841 				}
17842 				copy_object = VME_OBJECT(copy_entry);
17843 				copy_offset = VME_OFFSET(copy_entry);
17844 				if (copy_object &&
17845 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17846 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17847 					copy_object->true_share = TRUE;
17848 				}
17849 			}
17850 
17851 			return result;
17852 		}
17853 
17854 		if (src_entry->is_sub_map) {
17855 			/* protections for submap mapping are irrelevant here */
17856 		} else if (((src_entry->protection & required_cur_prot) !=
17857 		    required_cur_prot) ||
17858 		    ((src_entry->max_protection & required_max_prot) !=
17859 		    required_max_prot)) {
17860 			if (vmk_flags.vmkf_copy_single_object &&
17861 			    mapped_size != 0) {
17862 				/*
17863 				 * Single object extraction.
17864 				 * We can't extract more with the required
17865 				 * protection but we've extracted some, so
17866 				 * stop there and declare success.
17867 				 * The caller should check the size of
17868 				 * the copy entry we've extracted.
17869 				 */
17870 				result = KERN_SUCCESS;
17871 			} else {
17872 				/*
17873 				 * VM range extraction.
17874 				 * Required protection is not available
17875 				 * for this part of the range: fail.
17876 				 */
17877 				result = KERN_PROTECTION_FAILURE;
17878 			}
17879 			break;
17880 		}
17881 
17882 		if (src_entry->is_sub_map) {
17883 			vm_map_t submap;
17884 			vm_map_offset_t submap_start;
17885 			vm_map_size_t submap_size;
17886 			vm_map_copy_t submap_copy;
17887 			vm_prot_t submap_curprot, submap_maxprot;
17888 			boolean_t submap_needs_copy;
17889 
17890 			/*
17891 			 * No check for "required protection" on "src_entry"
17892 			 * because the protections that matter are the ones
17893 			 * on the submap's VM map entry, which will be checked
17894 			 * during the call to vm_map_copy_extract() below.
17895 			 */
17896 			object = VM_OBJECT_NULL;
17897 			submap_copy = VM_MAP_COPY_NULL;
17898 
17899 			/* find equivalent range in the submap */
17900 			submap = VME_SUBMAP(src_entry);
17901 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17902 			submap_size = tmp_size;
17903 			if (copy) {
17904 				/*
17905 				 * The caller wants a copy-on-write re-mapping,
17906 				 * so let's extract from the submap accordingly.
17907 				 */
17908 				submap_needs_copy = TRUE;
17909 			} else if (src_entry->needs_copy) {
17910 				/*
17911 				 * The caller wants a shared re-mapping but the
17912 				 * submap is mapped with "needs_copy", so its
17913 				 * contents can't be shared as is. Extract the
17914 				 * contents of the submap as "copy-on-write".
17915 				 * The re-mapping won't be shared with the
17916 				 * original mapping but this is equivalent to
17917 				 * what happened with the original "remap from
17918 				 * submap" code.
17919 				 * The shared region is mapped "needs_copy", for
17920 				 * example.
17921 				 */
17922 				submap_needs_copy = TRUE;
17923 			} else {
17924 				/*
17925 				 * The caller wants a shared re-mapping and
17926 				 * this mapping can be shared (no "needs_copy"),
17927 				 * so let's extract from the submap accordingly.
17928 				 * Kernel submaps are mapped without
17929 				 * "needs_copy", for example.
17930 				 */
17931 				submap_needs_copy = FALSE;
17932 			}
17933 			/* extra ref to keep submap alive */
17934 			vm_map_reference(submap);
17935 
17936 			DTRACE_VM7(remap_submap_recurse,
17937 			    vm_map_t, map,
17938 			    vm_map_offset_t, addr,
17939 			    vm_map_size_t, size,
17940 			    boolean_t, copy,
17941 			    vm_map_offset_t, submap_start,
17942 			    vm_map_size_t, submap_size,
17943 			    boolean_t, submap_needs_copy);
17944 
17945 			/*
17946 			 * The map can be safely unlocked since we
17947 			 * already hold a reference on the submap.
17948 			 *
17949 			 * No timestamp since we don't care if the map
17950 			 * gets modified while we're down in the submap.
17951 			 * We'll resume the extraction at src_start + tmp_size
17952 			 * anyway.
17953 			 */
17954 			vm_map_unlock(map);
17955 			src_entry = NULL; /* not valid once map is unlocked */
17956 
17957 			if (vm_remap_legacy) {
17958 				submap_curprot = VM_PROT_NONE;
17959 				submap_maxprot = VM_PROT_NONE;
17960 				if (max_prot_for_prot_copy) {
17961 					submap_maxprot = max_prot_for_prot_copy;
17962 				}
17963 			} else {
17964 				assert(!max_prot_for_prot_copy);
17965 				submap_curprot = *cur_protection;
17966 				submap_maxprot = *max_protection;
17967 			}
17968 			result = vm_map_copy_extract(submap,
17969 			    submap_start,
17970 			    submap_size,
17971 			    submap_needs_copy,
17972 			    &submap_copy,
17973 			    &submap_curprot,
17974 			    &submap_maxprot,
17975 			    inheritance,
17976 			    vmk_flags);
17977 
17978 			/* release extra ref on submap */
17979 			vm_map_deallocate(submap);
17980 			submap = VM_MAP_NULL;
17981 
17982 			if (result != KERN_SUCCESS) {
17983 				vm_map_lock(map);
17984 				break;
17985 			}
17986 
17987 			/* transfer submap_copy entries to map_header */
17988 			while (vm_map_copy_first_entry(submap_copy) !=
17989 			    vm_map_copy_to_entry(submap_copy)) {
17990 				vm_map_entry_t copy_entry;
17991 				vm_map_size_t copy_entry_size;
17992 
17993 				copy_entry = vm_map_copy_first_entry(submap_copy);
17994 
17995 				/*
17996 				 * Prevent kernel_object from being exposed to
17997 				 * user space.
17998 				 */
17999 				if (__improbable(copy_entry->vme_kernel_object)) {
18000 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
18001 					    proc_selfpid(),
18002 					    (get_bsdtask_info(current_task())
18003 					    ? proc_name_address(get_bsdtask_info(current_task()))
18004 					    : "?"));
18005 					DTRACE_VM(extract_kernel_only);
18006 					result = KERN_INVALID_RIGHT;
18007 					vm_map_copy_discard(submap_copy);
18008 					submap_copy = VM_MAP_COPY_NULL;
18009 					vm_map_lock(map);
18010 					break;
18011 				}
18012 
18013 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
18014 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
18015 				copy_entry->vme_start = map_address;
18016 				copy_entry->vme_end = map_address + copy_entry_size;
18017 				map_address += copy_entry_size;
18018 				mapped_size += copy_entry_size;
18019 				src_start += copy_entry_size;
18020 				assert(src_start <= src_end);
18021 				_vm_map_store_entry_link(map_header,
18022 				    map_header->links.prev,
18023 				    copy_entry);
18024 			}
18025 			/* done with submap_copy */
18026 			vm_map_copy_discard(submap_copy);
18027 
18028 			if (vm_remap_legacy) {
18029 				*cur_protection &= submap_curprot;
18030 				*max_protection &= submap_maxprot;
18031 			}
18032 
18033 			/* re-acquire the map lock and continue to next entry */
18034 			vm_map_lock(map);
18035 			continue;
18036 		} else {
18037 			object = VME_OBJECT(src_entry);
18038 
18039 			/*
18040 			 * Prevent kernel_object from being exposed to
18041 			 * user space.
18042 			 */
18043 			if (__improbable(is_kernel_object(object))) {
18044 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
18045 				    proc_selfpid(),
18046 				    (get_bsdtask_info(current_task())
18047 				    ? proc_name_address(get_bsdtask_info(current_task()))
18048 				    : "?"));
18049 				DTRACE_VM(extract_kernel_only);
18050 				result = KERN_INVALID_RIGHT;
18051 				break;
18052 			}
18053 
18054 			if (src_entry->iokit_acct) {
18055 				/*
18056 				 * This entry uses "IOKit accounting".
18057 				 */
18058 			} else if (object != VM_OBJECT_NULL &&
18059 			    object->internal &&
18060 			    (object->purgable != VM_PURGABLE_DENY ||
18061 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
18062 				/*
18063 				 * Purgeable objects have their own accounting:
18064 				 * no pmap accounting for them.
18065 				 */
18066 				assertf(!src_entry->use_pmap,
18067 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18068 				    map,
18069 				    src_entry,
18070 				    (uint64_t)src_entry->vme_start,
18071 				    (uint64_t)src_entry->vme_end,
18072 				    src_entry->protection,
18073 				    src_entry->max_protection,
18074 				    VME_ALIAS(src_entry));
18075 			} else {
18076 				/*
18077 				 * Not IOKit or purgeable:
18078 				 * must be accounted by pmap stats.
18079 				 */
18080 				assertf(src_entry->use_pmap,
18081 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
18082 				    map,
18083 				    src_entry,
18084 				    (uint64_t)src_entry->vme_start,
18085 				    (uint64_t)src_entry->vme_end,
18086 				    src_entry->protection,
18087 				    src_entry->max_protection,
18088 				    VME_ALIAS(src_entry));
18089 			}
18090 
18091 			if (object == VM_OBJECT_NULL) {
18092 				assert(!src_entry->needs_copy);
18093 				if (src_entry->max_protection == VM_PROT_NONE) {
18094 					assert(src_entry->protection == VM_PROT_NONE);
18095 					/*
18096 					 * No VM object and no permissions:
18097 					 * this must be a reserved range with
18098 					 * nothing to share or copy.
18099 					 * There could also be all sorts of
18100 					 * pmap shenanigans within that reserved
18101 					 * range, so let's just copy the map
18102 					 * entry as is to remap a similar
18103 					 * reserved range.
18104 					 */
18105 					offset = 0; /* no object => no offset */
18106 					goto copy_src_entry;
18107 				}
18108 				object = vm_object_allocate(entry_size, map->serial_id);
18109 				VME_OFFSET_SET(src_entry, 0);
18110 				VME_OBJECT_SET(src_entry, object, false, 0);
18111 				assert(src_entry->use_pmap);
18112 				assert(!map->mapped_in_other_pmaps);
18113 			} else if (src_entry->wired_count ||
18114 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
18115 				/*
18116 				 * A wired memory region should not have
18117 				 * any pending copy-on-write and needs to
18118 				 * keep pointing at the VM object that
18119 				 * contains the wired pages.
18120 				 * If we're sharing this memory (copy=false),
18121 				 * we'll share this VM object.
18122 				 * If we're copying this memory (copy=true),
18123 				 * we'll call vm_object_copy_slowly() below
18124 				 * and use the new VM object for the remapping.
18125 				 *
18126 				 * Or, we are already using an asymmetric
18127 				 * copy, and therefore we already have
18128 				 * the right object.
18129 				 */
18130 				assert(!src_entry->needs_copy);
18131 			} else if (src_entry->needs_copy || object->shadowed ||
18132 			    (object->internal && !object->true_share &&
18133 			    !src_entry->is_shared &&
18134 			    object->vo_size > entry_size)) {
18135 				bool is_writable;
18136 
18137 				VME_OBJECT_SHADOW(src_entry, entry_size,
18138 				    vm_map_always_shadow(map));
18139 				assert(src_entry->use_pmap);
18140 
18141 				is_writable = false;
18142 				if (src_entry->protection & VM_PROT_WRITE) {
18143 					is_writable = true;
18144 #if __arm64e__
18145 				} else if (src_entry->used_for_tpro) {
18146 					is_writable = true;
18147 #endif /* __arm64e__ */
18148 				}
18149 				if (!src_entry->needs_copy && is_writable) {
18150 					vm_prot_t prot;
18151 
18152 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18153 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18154 						    __FUNCTION__,
18155 						    map, map->pmap,
18156 						    src_entry,
18157 						    (uint64_t)src_entry->vme_start,
18158 						    (uint64_t)src_entry->vme_end,
18159 						    src_entry->protection);
18160 					}
18161 
18162 					prot = src_entry->protection & ~VM_PROT_WRITE;
18163 
18164 					if (override_nx(map,
18165 					    VME_ALIAS(src_entry))
18166 					    && prot) {
18167 						prot |= VM_PROT_EXECUTE;
18168 					}
18169 
18170 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18171 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18172 						    __FUNCTION__,
18173 						    map, map->pmap,
18174 						    src_entry,
18175 						    (uint64_t)src_entry->vme_start,
18176 						    (uint64_t)src_entry->vme_end,
18177 						    prot);
18178 					}
18179 
18180 					if (map->mapped_in_other_pmaps) {
18181 						vm_object_pmap_protect(
18182 							VME_OBJECT(src_entry),
18183 							VME_OFFSET(src_entry),
18184 							entry_size,
18185 							PMAP_NULL,
18186 							PAGE_SIZE,
18187 							src_entry->vme_start,
18188 							prot);
18189 #if MACH_ASSERT
18190 					} else if (__improbable(map->pmap == PMAP_NULL)) {
18191 						/*
18192 						 * Some VM tests (in vm_tests.c)
18193 						 * sometimes want to use a VM
18194 						 * map without a pmap.
18195 						 * Otherwise, this should never
18196 						 * happen.
18197 						 */
18198 						if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18199 							panic("null pmap");
18200 						}
18201 #endif /* MACH_ASSERT */
18202 					} else {
18203 						pmap_protect(vm_map_pmap(map),
18204 						    src_entry->vme_start,
18205 						    src_entry->vme_end,
18206 						    prot);
18207 					}
18208 				}
18209 
18210 				object = VME_OBJECT(src_entry);
18211 				src_entry->needs_copy = FALSE;
18212 			}
18213 
18214 
18215 			vm_object_lock(object);
18216 			vm_object_reference_locked(object); /* object ref. for new entry */
18217 			assert(!src_entry->needs_copy);
18218 			if (object->copy_strategy ==
18219 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
18220 				/*
18221 				 * If we want to share this object (copy==0),
18222 				 * it needs to be COPY_DELAY.
18223 				 * If we want to copy this object (copy==1),
18224 				 * we can't just set "needs_copy" on our side
18225 				 * and expect the other side to do the same
18226 				 * (symmetrically), so we can't let the object
18227 				 * stay COPY_SYMMETRIC.
18228 				 * So we always switch from COPY_SYMMETRIC to
18229 				 * COPY_DELAY.
18230 				 */
18231 				object->copy_strategy =
18232 				    MEMORY_OBJECT_COPY_DELAY;
18233 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18234 			}
18235 			vm_object_unlock(object);
18236 		}
18237 
18238 		offset = (VME_OFFSET(src_entry) +
18239 		    (src_start - src_entry->vme_start));
18240 
18241 copy_src_entry:
18242 
18243 
18244 		new_entry = _vm_map_entry_create(map_header);
18245 		vm_map_entry_copy(map, new_entry, src_entry);
18246 		if (new_entry->is_sub_map) {
18247 			/* clr address space specifics */
18248 			new_entry->use_pmap = FALSE;
18249 		} else if (copy) {
18250 			/*
18251 			 * We're dealing with a copy-on-write operation,
18252 			 * so the resulting mapping should not inherit the
18253 			 * original mapping's accounting settings.
18254 			 * "use_pmap" should be reset to its default (TRUE)
18255 			 * so that the new mapping gets accounted for in
18256 			 * the task's memory footprint.
18257 			 */
18258 			new_entry->use_pmap = TRUE;
18259 		}
18260 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
18261 		assert(!new_entry->iokit_acct);
18262 
18263 		new_entry->map_aligned = FALSE;
18264 
18265 		new_entry->vme_start = map_address;
18266 		new_entry->vme_end = map_address + tmp_size;
18267 		assert(new_entry->vme_start < new_entry->vme_end);
18268 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
18269 			/* security: keep "permanent" and "csm_associated" */
18270 			new_entry->vme_permanent = src_entry->vme_permanent;
18271 			new_entry->csm_associated = src_entry->csm_associated;
18272 			/*
18273 			 * Remapping for vm_map_protect(VM_PROT_COPY)
18274 			 * to convert a read-only mapping into a
18275 			 * copy-on-write version of itself but
18276 			 * with write access:
18277 			 * keep the original inheritance but let's not
18278 			 * add VM_PROT_WRITE to the max protection yet
18279 			 * since we want to do more security checks against
18280 			 * the target map.
18281 			 */
18282 			new_entry->inheritance = src_entry->inheritance;
18283 			new_entry->protection &= max_prot_for_prot_copy;
18284 
18285 #ifdef __arm64e__
18286 			/*
18287 			 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18288 			 * region to be explicitly writable without TPRO is only permitted
18289 			 * if TPRO enforcement has been overridden.
18290 			 *
18291 			 * In this case we ensure any entries reset the TPRO state
18292 			 * and we permit the region to be downgraded from permanent.
18293 			 */
18294 			if (new_entry->used_for_tpro) {
18295 				if (vmk_flags.vmkf_tpro_enforcement_override) {
18296 					new_entry->used_for_tpro = FALSE;
18297 					new_entry->vme_permanent = FALSE;
18298 				} else {
18299 					result = KERN_PROTECTION_FAILURE;
18300 					vm_object_deallocate(object);
18301 					vm_map_entry_dispose(new_entry);
18302 					new_entry = VM_MAP_ENTRY_NULL;
18303 					break;
18304 				}
18305 			}
18306 #endif
18307 		} else {
18308 			new_entry->inheritance = inheritance;
18309 			if (!vm_remap_legacy) {
18310 				new_entry->protection = *cur_protection;
18311 				new_entry->max_protection = *max_protection;
18312 			}
18313 		}
18314 
18315 		VME_OFFSET_SET(new_entry, offset);
18316 
18317 		/*
18318 		 * The new region has to be copied now if required.
18319 		 */
18320 RestartCopy:
18321 		if (!copy) {
18322 			if (src_entry->used_for_jit == TRUE) {
18323 				if (same_map) {
18324 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18325 					/*
18326 					 * Cannot allow an entry describing a JIT
18327 					 * region to be shared across address spaces.
18328 					 */
18329 					result = KERN_INVALID_ARGUMENT;
18330 					vm_object_deallocate(object);
18331 					vm_map_entry_dispose(new_entry);
18332 					new_entry = VM_MAP_ENTRY_NULL;
18333 					break;
18334 				}
18335 			}
18336 
18337 			if (!src_entry->is_sub_map &&
18338 			    VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18339 				/* no accessible memory; nothing to share */
18340 				assert(src_entry->protection == VM_PROT_NONE);
18341 				assert(src_entry->max_protection == VM_PROT_NONE);
18342 				src_entry->is_shared = FALSE;
18343 			} else {
18344 				src_entry->is_shared = TRUE;
18345 			}
18346 			if (!new_entry->is_sub_map &&
18347 			    VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18348 				/* no accessible memory; nothing to share */
18349 				assert(new_entry->protection == VM_PROT_NONE);
18350 				assert(new_entry->max_protection == VM_PROT_NONE);
18351 				new_entry->is_shared = FALSE;
18352 			} else {
18353 				new_entry->is_shared = TRUE;
18354 			}
18355 			if (!(new_entry->is_sub_map)) {
18356 				new_entry->needs_copy = FALSE;
18357 			}
18358 		} else if (src_entry->is_sub_map) {
18359 			/* make this a COW sub_map if not already */
18360 			assert(new_entry->wired_count == 0);
18361 			new_entry->needs_copy = TRUE;
18362 			object = VM_OBJECT_NULL;
18363 		} else if (src_entry->wired_count == 0 &&
18364 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18365 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
18366 		    VME_OFFSET(new_entry),
18367 		    (new_entry->vme_end -
18368 		    new_entry->vme_start),
18369 		    &src_needs_copy,
18370 		    &new_entry_needs_copy)) {
18371 			new_entry->needs_copy = new_entry_needs_copy;
18372 			new_entry->is_shared = FALSE;
18373 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18374 
18375 			/*
18376 			 * Handle copy_on_write semantics.
18377 			 */
18378 			if (src_needs_copy && !src_entry->needs_copy) {
18379 				vm_prot_t prot;
18380 
18381 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18382 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18383 					    __FUNCTION__,
18384 					    map, map->pmap, src_entry,
18385 					    (uint64_t)src_entry->vme_start,
18386 					    (uint64_t)src_entry->vme_end,
18387 					    src_entry->protection);
18388 				}
18389 
18390 				prot = src_entry->protection & ~VM_PROT_WRITE;
18391 
18392 				if (override_nx(map,
18393 				    VME_ALIAS(src_entry))
18394 				    && prot) {
18395 					prot |= VM_PROT_EXECUTE;
18396 				}
18397 
18398 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18399 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18400 					    __FUNCTION__,
18401 					    map, map->pmap, src_entry,
18402 					    (uint64_t)src_entry->vme_start,
18403 					    (uint64_t)src_entry->vme_end,
18404 					    prot);
18405 				}
18406 
18407 				vm_object_pmap_protect(object,
18408 				    offset,
18409 				    entry_size,
18410 				    ((src_entry->is_shared
18411 				    || map->mapped_in_other_pmaps) ?
18412 				    PMAP_NULL : map->pmap),
18413 				    VM_MAP_PAGE_SIZE(map),
18414 				    src_entry->vme_start,
18415 				    prot);
18416 
18417 				assert(src_entry->wired_count == 0);
18418 				src_entry->needs_copy = TRUE;
18419 			}
18420 			/*
18421 			 * Throw away the old object reference of the new entry.
18422 			 */
18423 			vm_object_deallocate(object);
18424 		} else {
18425 			new_entry->is_shared = FALSE;
18426 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18427 
18428 			src_entry_was_wired = (src_entry->wired_count > 0);
18429 			saved_src_entry = src_entry;
18430 			src_entry = VM_MAP_ENTRY_NULL;
18431 
18432 			/*
18433 			 * The map can be safely unlocked since we
18434 			 * already hold a reference on the object.
18435 			 *
18436 			 * Record the timestamp of the map for later
18437 			 * verification, and unlock the map.
18438 			 */
18439 			version.main_timestamp = map->timestamp;
18440 			vm_map_unlock(map);     /* Increments timestamp once! */
18441 
18442 			/*
18443 			 * Perform the copy.
18444 			 */
18445 			if (src_entry_was_wired > 0 ||
18446 			    (debug4k_no_cow_copyin &&
18447 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18448 				vm_object_lock(object);
18449 				result = vm_object_copy_slowly(
18450 					object,
18451 					offset,
18452 					(new_entry->vme_end -
18453 					new_entry->vme_start),
18454 					THREAD_UNINT,
18455 					&new_copy_object);
18456 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18457 				saved_used_for_jit = new_entry->used_for_jit;
18458 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18459 				new_entry->used_for_jit = saved_used_for_jit;
18460 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18461 				new_entry->needs_copy = FALSE;
18462 			} else {
18463 				vm_object_offset_t new_offset;
18464 
18465 				new_offset = VME_OFFSET(new_entry);
18466 				result = vm_object_copy_strategically(
18467 					object,
18468 					offset,
18469 					(new_entry->vme_end -
18470 					new_entry->vme_start),
18471 					false, /* forking */
18472 					&new_copy_object,
18473 					&new_offset,
18474 					&new_entry_needs_copy);
18475 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18476 				saved_used_for_jit = new_entry->used_for_jit;
18477 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18478 				new_entry->used_for_jit = saved_used_for_jit;
18479 				if (new_offset != VME_OFFSET(new_entry)) {
18480 					VME_OFFSET_SET(new_entry, new_offset);
18481 				}
18482 
18483 				new_entry->needs_copy = new_entry_needs_copy;
18484 			}
18485 
18486 			/*
18487 			 * Throw away the old object reference of the new entry.
18488 			 */
18489 			vm_object_deallocate(object);
18490 
18491 			if (result != KERN_SUCCESS &&
18492 			    result != KERN_MEMORY_RESTART_COPY) {
18493 				vm_map_entry_dispose(new_entry);
18494 				vm_map_lock(map);
18495 				break;
18496 			}
18497 
18498 			/*
18499 			 * Verify that the map has not substantially
18500 			 * changed while the copy was being made.
18501 			 */
18502 
18503 			vm_map_lock(map);
18504 			if (version.main_timestamp + 1 != map->timestamp) {
18505 				/*
18506 				 * Simple version comparison failed.
18507 				 *
18508 				 * Retry the lookup and verify that the
18509 				 * same object/offset are still present.
18510 				 */
18511 				saved_src_entry = VM_MAP_ENTRY_NULL;
18512 				vm_object_deallocate(VME_OBJECT(new_entry));
18513 				vm_map_entry_dispose(new_entry);
18514 				if (result == KERN_MEMORY_RESTART_COPY) {
18515 					result = KERN_SUCCESS;
18516 				}
18517 				continue;
18518 			}
18519 			/* map hasn't changed: src_entry is still valid */
18520 			src_entry = saved_src_entry;
18521 			saved_src_entry = VM_MAP_ENTRY_NULL;
18522 
18523 			if (result == KERN_MEMORY_RESTART_COPY) {
18524 				vm_object_reference(object);
18525 				goto RestartCopy;
18526 			}
18527 		}
18528 
18529 		_vm_map_store_entry_link(map_header,
18530 		    map_header->links.prev, new_entry);
18531 
18532 		/* protections for submap mapping are irrelevant here */
18533 		if (vm_remap_legacy && !src_entry->is_sub_map) {
18534 			*cur_protection &= src_entry->protection;
18535 			*max_protection &= src_entry->max_protection;
18536 		}
18537 
18538 		map_address += tmp_size;
18539 		mapped_size += tmp_size;
18540 		src_start += tmp_size;
18541 
18542 		if (vmk_flags.vmkf_copy_single_object) {
18543 			if (mapped_size != size) {
18544 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18545 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18546 				if (src_entry->vme_next != vm_map_to_entry(map) &&
18547 				    src_entry->vme_next->vme_object_value ==
18548 				    src_entry->vme_object_value) {
18549 					/* XXX TODO4K */
18550 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
18551 				}
18552 			}
18553 			break;
18554 		}
18555 	} /* end while */
18556 
18557 	vm_map_unlock(map);
18558 	if (result != KERN_SUCCESS) {
18559 		/*
18560 		 * Free all allocated elements.
18561 		 */
18562 		for (src_entry = map_header->links.next;
18563 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18564 		    src_entry = new_entry) {
18565 			new_entry = src_entry->vme_next;
18566 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18567 			if (src_entry->is_sub_map) {
18568 				vm_map_deallocate(VME_SUBMAP(src_entry));
18569 			} else {
18570 				vm_object_deallocate(VME_OBJECT(src_entry));
18571 			}
18572 			vm_map_entry_dispose(src_entry);
18573 		}
18574 	}
18575 	return result;
18576 }
18577 
18578 bool
18579 vm_map_is_exotic(
18580 	vm_map_t map)
18581 {
18582 	return VM_MAP_IS_EXOTIC(map);
18583 }
18584 
18585 bool
18586 vm_map_is_alien(
18587 	vm_map_t map)
18588 {
18589 	return VM_MAP_IS_ALIEN(map);
18590 }
18591 
18592 #if XNU_TARGET_OS_OSX
18593 void
18594 vm_map_mark_alien(
18595 	vm_map_t map)
18596 {
18597 	vm_map_lock(map);
18598 	map->is_alien = true;
18599 	vm_map_unlock(map);
18600 }
18601 
18602 void
18603 vm_map_single_jit(
18604 	vm_map_t map)
18605 {
18606 	vm_map_lock(map);
18607 	map->single_jit = true;
18608 	vm_map_unlock(map);
18609 }
18610 #endif /* XNU_TARGET_OS_OSX */
18611 
18612 
18613 /*
18614  * Callers of this function must call vm_map_copy_require on
18615  * previously created vm_map_copy_t or pass a newly created
18616  * one to ensure that it hasn't been forged.
18617  */
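/*
 * Minimal caller sketch (illustrative only, not part of the build): a caller
 * holding a previously created copy object is expected to do something like
 *
 *	vm_map_copy_require(copy_map);
 *	kr = vm_map_copy_to_physcopy(copy_map, target_map);
 *
 * vm_map_copy_adjust_to_target() below follows this pattern: it calls
 * vm_map_copy_require() on its input before it may hand the (possibly
 * duplicated) copy object to this routine.
 */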
18618 static kern_return_t
18619 vm_map_copy_to_physcopy(
18620 	vm_map_copy_t   copy_map,
18621 	vm_map_t        target_map)
18622 {
18623 	vm_map_size_t           size;
18624 	vm_map_entry_t          entry;
18625 	vm_map_entry_t          new_entry;
18626 	vm_object_t             new_object;
18627 	unsigned int            pmap_flags;
18628 	pmap_t                  new_pmap;
18629 	vm_map_t                new_map;
18630 	vm_map_address_t        src_start, src_end, src_cur;
18631 	vm_map_address_t        dst_start, dst_end, dst_cur;
18632 	kern_return_t           kr;
18633 	void                    *kbuf;
18634 
18635 	/*
18636 	 * Perform the equivalent of vm_allocate() and memcpy().
18637 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18638 	 */
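	/*
	 * In outline (a readability sketch of the steps performed below):
	 *  1. create a throwaway pmap and pageable VM map matching
	 *     "copy_map"'s (4K) page size;
	 *  2. vm_map_copyout_internal() "copy_map" into that map;
	 *  3. vm_map_enter() a freshly allocated VM object into the same map;
	 *  4. copy the contents page by page with copyinmap()/copyoutmap()
	 *     through a temporary kernel buffer;
	 *  5. destroy the throwaway map and replace "copy_map"'s entries with
	 *     a single entry backed by the new object, using "target_map"'s
	 *     page size.
	 */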
18639 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18640 
18641 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18642 
18643 	/* create a new pmap to map "copy_map" */
18644 	pmap_flags = 0;
18645 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18646 #if PMAP_CREATE_FORCE_4K_PAGES
18647 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18648 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18649 	pmap_flags |= PMAP_CREATE_64BIT;
18650 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18651 	if (new_pmap == NULL) {
18652 		return KERN_RESOURCE_SHORTAGE;
18653 	}
18654 
18655 	/* allocate new VM object */
18656 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18657 	new_object = vm_object_allocate(size, VM_MAP_SERIAL_NONE);
18658 	assert(new_object);
18659 
18660 	/* allocate new VM map entry */
18661 	new_entry = vm_map_copy_entry_create(copy_map);
18662 	assert(new_entry);
18663 
18664 	/* finish initializing new VM map entry */
18665 	new_entry->protection = VM_PROT_DEFAULT;
18666 	new_entry->max_protection = VM_PROT_DEFAULT;
18667 	new_entry->use_pmap = TRUE;
18668 
18669 	/* make new VM map entry point to new VM object */
18670 	new_entry->vme_start = 0;
18671 	new_entry->vme_end = size;
18672 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18673 	VME_OFFSET_SET(new_entry, 0);
18674 
18675 	/* create a new pageable VM map to map "copy_map" */
18676 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18677 	    VM_MAP_CREATE_PAGEABLE);
18678 	assert(new_map);
18679 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18680 
18681 	/* map "copy_map" in the new VM map */
18682 	src_start = 0;
18683 	kr = vm_map_copyout_internal(
18684 		new_map,
18685 		&src_start,
18686 		copy_map,
18687 		copy_map->size,
18688 		FALSE, /* consume_on_success */
18689 		VM_PROT_DEFAULT,
18690 		VM_PROT_DEFAULT,
18691 		VM_INHERIT_DEFAULT);
18692 	assert(kr == KERN_SUCCESS);
18693 	src_end = src_start + copy_map->size;
18694 
18695 	/* map "new_object" in the new VM map */
18696 	vm_object_reference(new_object);
18697 	dst_start = 0;
18698 	kr = vm_map_enter(new_map,
18699 	    &dst_start,
18700 	    size,
18701 	    0,               /* mask */
18702 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18703 	    new_object,
18704 	    0,               /* offset */
18705 	    FALSE,               /* needs copy */
18706 	    VM_PROT_DEFAULT,
18707 	    VM_PROT_DEFAULT,
18708 	    VM_INHERIT_DEFAULT);
18709 	assert(kr == KERN_SUCCESS);
18710 	dst_end = dst_start + size;
18711 
18712 	/* get a kernel buffer */
18713 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18714 
18715 	/* physically copy "copy_map" mappings to new VM object */
18716 	for (src_cur = src_start, dst_cur = dst_start;
18717 	    src_cur < src_end;
18718 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18719 		vm_size_t bytes;
18720 
18721 		bytes = PAGE_SIZE;
18722 		if (src_cur + PAGE_SIZE > src_end) {
18723 			/* partial copy for last page */
18724 			bytes = src_end - src_cur;
18725 			assert(bytes > 0 && bytes < PAGE_SIZE);
18726 			/* rest of dst page should be zero-filled */
18727 		}
18728 		/* get bytes from src mapping */
18729 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18730 		if (kr != KERN_SUCCESS) {
18731 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18732 		}
18733 		/* put bytes in dst mapping */
18734 		assert(dst_cur < dst_end);
18735 		assert(dst_cur + bytes <= dst_end);
18736 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18737 		if (kr != KERN_SUCCESS) {
18738 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18739 		}
18740 	}
18741 
18742 	/* free kernel buffer */
18743 	kfree_data(kbuf, PAGE_SIZE);
18744 
18745 	/* destroy new map */
18746 	vm_map_destroy(new_map);
18747 	new_map = VM_MAP_NULL;
18748 
18749 	/* dispose of the old map entries in "copy_map" */
18750 	while (vm_map_copy_first_entry(copy_map) !=
18751 	    vm_map_copy_to_entry(copy_map)) {
18752 		entry = vm_map_copy_first_entry(copy_map);
18753 		vm_map_copy_entry_unlink(copy_map, entry);
18754 		if (entry->is_sub_map) {
18755 			vm_map_deallocate(VME_SUBMAP(entry));
18756 		} else {
18757 			vm_object_deallocate(VME_OBJECT(entry));
18758 		}
18759 		vm_map_copy_entry_dispose(entry);
18760 	}
18761 
18762 	/* change "copy_map"'s page_size to match "target_map" */
18763 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18764 	copy_map->offset = 0;
18765 	copy_map->size = size;
18766 
18767 	/* insert new map entry in "copy_map" */
18768 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18769 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18770 
18771 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18772 	return KERN_SUCCESS;
18773 }
18774 
18775 void
18776 vm_map_copy_adjust_get_target_copy_map(
18777 	vm_map_copy_t   copy_map,
18778 	vm_map_copy_t   *target_copy_map_p);
18779 void
18780 vm_map_copy_adjust_get_target_copy_map(
18781 	vm_map_copy_t   copy_map,
18782 	vm_map_copy_t   *target_copy_map_p)
18783 {
18784 	vm_map_copy_t   target_copy_map;
18785 	vm_map_entry_t  entry, target_entry;
18786 
18787 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18788 		/* the caller already has a "target_copy_map": use it */
18789 		return;
18790 	}
18791 
18792 	/* the caller wants us to create a new copy of "copy_map" */
18793 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18794 	target_copy_map = vm_map_copy_allocate(copy_map->type);
18795 	target_copy_map->offset = copy_map->offset;
18796 	target_copy_map->size = copy_map->size;
18797 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18798 	for (entry = vm_map_copy_first_entry(copy_map);
18799 	    entry != vm_map_copy_to_entry(copy_map);
18800 	    entry = entry->vme_next) {
18801 		target_entry = vm_map_copy_entry_create(target_copy_map);
18802 		vm_map_entry_copy_full(target_entry, entry);
18803 		if (target_entry->is_sub_map) {
18804 			vm_map_reference(VME_SUBMAP(target_entry));
18805 		} else {
18806 			vm_object_reference(VME_OBJECT(target_entry));
18807 		}
18808 		vm_map_copy_entry_link(
18809 			target_copy_map,
18810 			vm_map_copy_last_entry(target_copy_map),
18811 			target_entry);
18812 	}
18813 	entry = VM_MAP_ENTRY_NULL;
18814 	*target_copy_map_p = target_copy_map;
18815 }
18816 
18817 /*
18818  * Callers of this function must call vm_map_copy_require on
18819  * previously created vm_map_copy_t or pass a newly created
18820  * one to ensure that it hasn't been forged.
18821  */
18822 static void
18823 vm_map_copy_trim(
18824 	vm_map_copy_t   copy_map,
18825 	uint16_t        new_page_shift,
18826 	vm_map_offset_t trim_start,
18827 	vm_map_offset_t trim_end)
18828 {
18829 	uint16_t        copy_page_shift;
18830 	vm_map_entry_t  entry, next_entry;
18831 
18832 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18833 	assert(copy_map->cpy_hdr.nentries > 0);
18834 
18835 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18836 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18837 
18838 	/* use the new page_shift to do the clipping */
18839 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18840 	copy_map->cpy_hdr.page_shift = new_page_shift;
18841 
18842 	for (entry = vm_map_copy_first_entry(copy_map);
18843 	    entry != vm_map_copy_to_entry(copy_map);
18844 	    entry = next_entry) {
18845 		next_entry = entry->vme_next;
18846 		if (entry->vme_end <= trim_start) {
18847 			/* entry fully before trim range: skip */
18848 			continue;
18849 		}
18850 		if (entry->vme_start >= trim_end) {
18851 			/* entry fully after trim range: done */
18852 			break;
18853 		}
18854 		/* clip entry if needed */
18855 		vm_map_copy_clip_start(copy_map, entry, trim_start);
18856 		vm_map_copy_clip_end(copy_map, entry, trim_end);
18857 		/* dispose of entry */
18858 		copy_map->size -= entry->vme_end - entry->vme_start;
18859 		vm_map_copy_entry_unlink(copy_map, entry);
18860 		if (entry->is_sub_map) {
18861 			vm_map_deallocate(VME_SUBMAP(entry));
18862 		} else {
18863 			vm_object_deallocate(VME_OBJECT(entry));
18864 		}
18865 		vm_map_copy_entry_dispose(entry);
18866 		entry = VM_MAP_ENTRY_NULL;
18867 	}
18868 
18869 	/* restore copy_map's original page_shift */
18870 	copy_map->cpy_hdr.page_shift = copy_page_shift;
18871 }
18872 
18873 /*
18874  * Make any necessary adjustments to "copy_map" to allow it to be
18875  * mapped into "target_map".
18876  * If no changes were necessary, "target_copy_map" points to the
18877  * untouched "copy_map".
18878  * If changes are necessary, changes will be made to "target_copy_map".
18879  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18880  * copy the original "copy_map" to it before applying the changes.
18881  * The caller should discard "target_copy_map" if it's not the same as
18882  * the original "copy_map".
18883  */
18884 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
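/*
 * Minimal usage sketch (modeled on the call in vm_map_range_physical_size()
 * below; variable names other than the parameters are illustrative):
 *
 *	target_copy_map = copy_map;	// or VM_MAP_COPY_NULL for a fresh copy
 *	kr = vm_map_copy_adjust_to_target(copy_map, offset, size, target_map,
 *		copy, &target_copy_map, &overmap_start, &overmap_end,
 *		&trimmed_start);
 *
 * On success, "target_copy_map" is what should actually be mapped into
 * "target_map"; if it differs from the original "copy_map", the caller owns
 * it and is responsible for discarding it.
 */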
18885 kern_return_t
18886 vm_map_copy_adjust_to_target(
18887 	vm_map_copy_t           src_copy_map,
18888 	vm_map_offset_ut        offset_u,
18889 	vm_map_size_ut          size_u,
18890 	vm_map_t                target_map,
18891 	boolean_t               copy,
18892 	vm_map_copy_t           *target_copy_map_p,
18893 	vm_map_offset_t         *overmap_start_p,
18894 	vm_map_offset_t         *overmap_end_p,
18895 	vm_map_offset_t         *trimmed_start_p)
18896 {
18897 	vm_map_copy_t           copy_map, target_copy_map;
18898 	vm_map_size_t           target_size;
18899 	vm_map_size_t           src_copy_map_size;
18900 	vm_map_size_t           overmap_start, overmap_end;
18901 	int                     misalignments;
18902 	vm_map_entry_t          entry, target_entry;
18903 	vm_map_offset_t         addr_adjustment;
18904 	vm_map_offset_t         new_start, new_end;
18905 	int                     copy_page_mask, target_page_mask;
18906 	uint16_t                copy_page_shift, target_page_shift;
18907 	vm_map_offset_t         trimmed_end;
18908 	vm_map_size_t           map_size;
18909 	kern_return_t           kr;
18910 
18911 	/*
18912 	 * Sanitize any input parameters that are addr/size/prot/inherit
18913 	 */
18914 	kr = vm_map_copy_addr_size_sanitize(
18915 		target_map,
18916 		offset_u,
18917 		size_u,
18918 		VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18919 		&new_start,
18920 		&new_end,
18921 		&map_size);
18922 	if (__improbable(kr != KERN_SUCCESS)) {
18923 		return vm_sanitize_get_kr(kr);
18924 	}
18925 
18926 	/*
18927 	 * Assert that the vm_map_copy is coming from the right
18928 	 * zone and hasn't been forged
18929 	 */
18930 	vm_map_copy_require(src_copy_map);
18931 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18932 
18933 	/*
18934 	 * Start working with "src_copy_map" but we'll switch
18935 	 * to "target_copy_map" as soon as we start making adjustments.
18936 	 */
18937 	copy_map = src_copy_map;
18938 	src_copy_map_size = src_copy_map->size;
18939 
18940 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18941 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18942 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18943 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18944 
18945 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18946 
18947 	target_copy_map = *target_copy_map_p;
18948 	if (target_copy_map != VM_MAP_COPY_NULL) {
18949 		vm_map_copy_require(target_copy_map);
18950 	}
18951 
18952 	if (new_end > copy_map->size) {
18953 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18954 		return KERN_INVALID_ARGUMENT;
18955 	}
18956 
18957 	/* trim the end */
18958 	trimmed_end = 0;
18959 	new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18960 	if (new_end < copy_map->size) {
18961 		trimmed_end = src_copy_map_size - new_end;
18962 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18963 		/* get "target_copy_map" if needed and adjust it */
18964 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18965 		    &target_copy_map);
18966 		copy_map = target_copy_map;
18967 		vm_map_copy_trim(target_copy_map, target_page_shift,
18968 		    new_end, copy_map->size);
18969 	}
18970 
18971 	/* trim the start */
18972 	new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18973 	if (new_start != 0) {
18974 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18975 		/* get "target_copy_map" if needed and adjust it */
18976 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18977 		    &target_copy_map);
18978 		copy_map = target_copy_map;
18979 		vm_map_copy_trim(target_copy_map, target_page_shift,
18980 		    0, new_start);
18981 	}
18982 	*trimmed_start_p = new_start;
18983 
18984 	/* target_size starts with what's left after trimming */
18985 	target_size = copy_map->size;
18986 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18987 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18988 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
18989 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18990 
18991 	/* check for misalignments but don't adjust yet */
18992 	misalignments = 0;
18993 	overmap_start = 0;
18994 	overmap_end = 0;
18995 	if (copy_page_shift < target_page_shift) {
18996 		/*
18997 		 * Remapping from 4K to 16K: check the VM object alignments
18998 		 * throughout the range.
18999 		 * If the start and end of the range are mis-aligned, we can
19000 		 * over-map to re-align, and adjust the "overmap" start/end
19001 		 * and "target_size" of the range accordingly.
19002 		 * If there is any mis-alignment within the range:
19003 		 *     if "copy":
19004 		 *         we can do immediate-copy instead of copy-on-write,
19005 		 *     else:
19006 		 *         no way to remap and share; fail.
19007 		 */
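		/*
		 * Worked example (illustrative numbers): remapping a 4K copy
		 * into a 16K map, i.e. target_page_mask == 0x3fff.  A first
		 * entry with VME_OFFSET() == 0x7000 is misaligned at its
		 * start; when sharing (!copy) we can over-map the preceding
		 * 0x3000 bytes so the mapping starts on a 16K boundary.  The
		 * same misalignment on a middle entry cannot be fixed by
		 * over-mapping and is only tolerable when copying.
		 */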
19008 		for (entry = vm_map_copy_first_entry(copy_map);
19009 		    entry != vm_map_copy_to_entry(copy_map);
19010 		    entry = entry->vme_next) {
19011 			vm_object_offset_t object_offset_start, object_offset_end;
19012 
19013 			object_offset_start = VME_OFFSET(entry);
19014 			object_offset_end = object_offset_start;
19015 			object_offset_end += entry->vme_end - entry->vme_start;
19016 			if (object_offset_start & target_page_mask) {
19017 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
19018 					overmap_start++;
19019 				} else {
19020 					misalignments++;
19021 				}
19022 			}
19023 			if (object_offset_end & target_page_mask) {
19024 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
19025 					overmap_end++;
19026 				} else {
19027 					misalignments++;
19028 				}
19029 			}
19030 		}
19031 	}
19032 	entry = VM_MAP_ENTRY_NULL;
19033 
19034 	/* decide how to deal with misalignments */
19035 	assert(overmap_start <= 1);
19036 	assert(overmap_end <= 1);
19037 	if (!overmap_start && !overmap_end && !misalignments) {
19038 		/* copy_map is properly aligned for target_map ... */
19039 		if (*trimmed_start_p) {
19040 			/* ... but we trimmed it, so still need to adjust */
19041 		} else {
19042 			/* ... and we didn't trim anything: we're done */
19043 			if (target_copy_map == VM_MAP_COPY_NULL) {
19044 				target_copy_map = copy_map;
19045 			}
19046 			*target_copy_map_p = target_copy_map;
19047 			*overmap_start_p = 0;
19048 			*overmap_end_p = 0;
19049 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19050 			return KERN_SUCCESS;
19051 		}
19052 	} else if (misalignments && !copy) {
19053 		/* can't "share" if misaligned */
19054 		DEBUG4K_ADJUST("unsupported sharing\n");
19055 #if MACH_ASSERT
19056 		if (debug4k_panic_on_misaligned_sharing) {
19057 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
19058 		}
19059 #endif /* MACH_ASSERT */
19060 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
19061 		return KERN_NOT_SUPPORTED;
19062 	} else {
19063 		/* can't virtual-copy if misaligned (but can physical-copy) */
19064 		DEBUG4K_ADJUST("mis-aligned copying\n");
19065 	}
19066 
19067 	/* get a "target_copy_map" if needed and switch to it */
19068 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
19069 	copy_map = target_copy_map;
19070 
19071 	if (misalignments && copy) {
19072 		vm_map_size_t target_copy_map_size;
19073 
19074 		/*
19075 		 * Can't do copy-on-write with misaligned mappings.
19076 		 * Replace the mappings with a physical copy of the original
19077 		 * mappings' contents.
19078 		 */
19079 		target_copy_map_size = target_copy_map->size;
19080 		kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
19081 		if (kr != KERN_SUCCESS) {
19082 			return kr;
19083 		}
19084 		*target_copy_map_p = target_copy_map;
19085 		*overmap_start_p = 0;
19086 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
19087 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19088 		return KERN_SUCCESS;
19089 	}
19090 
19091 	/* apply the adjustments */
19092 	misalignments = 0;
19093 	overmap_start = 0;
19094 	overmap_end = 0;
19095 	/* remove copy_map->offset, so that everything starts at offset 0 */
19096 	addr_adjustment = copy_map->offset;
19097 	/* also remove whatever we trimmed from the start */
19098 	addr_adjustment += *trimmed_start_p;
19099 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
19100 	    target_entry != vm_map_copy_to_entry(target_copy_map);
19101 	    target_entry = target_entry->vme_next) {
19102 		vm_object_offset_t object_offset_start, object_offset_end;
19103 
19104 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19105 		object_offset_start = VME_OFFSET(target_entry);
19106 		if (object_offset_start & target_page_mask) {
19107 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19108 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19109 				/*
19110 				 * start of 1st entry is mis-aligned:
19111 				 * re-adjust by over-mapping.
19112 				 */
19113 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
19114 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
19115 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
19116 			} else {
19117 				misalignments++;
19118 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19119 				assert(copy);
19120 			}
19121 		}
19122 
19123 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
19124 			target_size += overmap_start;
19125 		} else {
19126 			target_entry->vme_start += overmap_start;
19127 		}
19128 		target_entry->vme_end += overmap_start;
19129 
19130 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
19131 		if (object_offset_end & target_page_mask) {
19132 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19133 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
19134 				/*
19135 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
19136 				 */
19137 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
19138 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
19139 				target_entry->vme_end += overmap_end;
19140 				target_size += overmap_end;
19141 			} else {
19142 				misalignments++;
19143 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19144 				assert(copy);
19145 			}
19146 		}
19147 		target_entry->vme_start -= addr_adjustment;
19148 		target_entry->vme_end -= addr_adjustment;
19149 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19150 	}
19151 
19152 	target_copy_map->size = target_size;
19153 	target_copy_map->offset += overmap_start;
19154 	target_copy_map->offset -= addr_adjustment;
19155 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
19156 
19157 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19158 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19159 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19160 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19161 
19162 	*target_copy_map_p = target_copy_map;
19163 	*overmap_start_p = overmap_start;
19164 	*overmap_end_p = overmap_end;
19165 
19166 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19167 	return KERN_SUCCESS;
19168 }
19169 
19170 kern_return_t
19171 vm_map_range_physical_size(
19172 	vm_map_t         map,
19173 	vm_map_address_t start,
19174 	mach_vm_size_t   size,
19175 	mach_vm_size_t * phys_size)
19176 {
19177 	kern_return_t   kr;
19178 	vm_map_copy_t   copy_map, target_copy_map;
19179 	vm_map_offset_t adjusted_start, adjusted_end;
19180 	vm_map_size_t   adjusted_size;
19181 	vm_prot_t       cur_prot, max_prot;
19182 	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19183 	vm_map_kernel_flags_t vmk_flags;
19184 
19185 	if (size == 0) {
19186 		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19187 		*phys_size = 0;
19188 		return KERN_SUCCESS;
19189 	}
19190 
19191 	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19192 	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19193 	if (__improbable(os_add_overflow(start, size, &end) ||
19194 	    adjusted_end <= adjusted_start)) {
19195 		/* wraparound */
19196 		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19197 		*phys_size = 0;
19198 		return KERN_INVALID_ARGUMENT;
19199 	}
19200 	if (__improbable(vm_map_range_overflows(map, start, size))) {
19201 		*phys_size = 0;
19202 		return KERN_INVALID_ADDRESS;
19203 	}
19204 	assert(adjusted_end > adjusted_start);
19205 	adjusted_size = adjusted_end - adjusted_start;
19206 	*phys_size = adjusted_size;
19207 	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19208 		return KERN_SUCCESS;
19209 	}
19210 	if (start == 0) {
19211 		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19212 		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19213 		if (__improbable(adjusted_end <= adjusted_start)) {
19214 			/* wraparound */
19215 			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19216 			*phys_size = 0;
19217 			return KERN_INVALID_ARGUMENT;
19218 		}
19219 		assert(adjusted_end > adjusted_start);
19220 		adjusted_size = adjusted_end - adjusted_start;
19221 		*phys_size = adjusted_size;
19222 		return KERN_SUCCESS;
19223 	}
19224 
19225 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19226 	vmk_flags.vmkf_copy_pageable = TRUE;
19227 	vmk_flags.vmkf_copy_same_map = TRUE;
19228 	assert(adjusted_size != 0);
19229 	cur_prot = VM_PROT_NONE; /* legacy mode */
19230 	max_prot = VM_PROT_NONE; /* legacy mode */
19231 	vmk_flags.vmkf_remap_legacy_mode = true;
19232 	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19233 	    FALSE /* copy */,
19234 	    &copy_map,
19235 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19236 	    vmk_flags);
19237 	if (kr != KERN_SUCCESS) {
19238 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19239 		//assert(0);
19240 		*phys_size = 0;
19241 		return kr;
19242 	}
19243 	assert(copy_map != VM_MAP_COPY_NULL);
19244 	target_copy_map = copy_map;
19245 	DEBUG4K_ADJUST("adjusting...\n");
19246 	kr = vm_map_copy_adjust_to_target(
19247 		copy_map,
19248 		start - adjusted_start, /* offset */
19249 		size, /* size */
19250 		kernel_map,
19251 		FALSE,                          /* copy */
19252 		&target_copy_map,
19253 		&overmap_start,
19254 		&overmap_end,
19255 		&trimmed_start);
19256 	if (kr == KERN_SUCCESS) {
19257 		if (target_copy_map->size != *phys_size) {
19258 			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19259 		}
19260 		*phys_size = target_copy_map->size;
19261 	} else {
19262 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19263 		//assert(0);
19264 		*phys_size = 0;
19265 	}
19266 	vm_map_copy_discard(copy_map);
19267 	copy_map = VM_MAP_COPY_NULL;
19268 
19269 	return kr;
19270 }
19271 
19272 static __attribute__((always_inline, warn_unused_result))
19273 kern_return_t
19274 vm_map_remap_sanitize(
19275 	vm_map_t                src_map,
19276 	vm_map_t                target_map,
19277 	vm_map_address_ut       address_u,
19278 	vm_map_size_ut          size_u,
19279 	vm_map_offset_ut        mask_u,
19280 	vm_map_offset_ut        memory_address_u,
19281 	vm_prot_ut              cur_protection_u,
19282 	vm_prot_ut              max_protection_u,
19283 	vm_inherit_ut           inheritance_u,
19284 	vm_map_kernel_flags_t   vmk_flags,
19285 	vm_map_address_t       *target_addr,
19286 	vm_map_address_t       *mask,
19287 	vm_map_offset_t        *memory_address,
19288 	vm_map_offset_t        *memory_end,
19289 	vm_map_size_t          *memory_size,
19290 	vm_prot_t              *cur_protection,
19291 	vm_prot_t              *max_protection,
19292 	vm_inherit_t           *inheritance)
19293 {
19294 	kern_return_t           result;
19295 	vm_sanitize_flags_t     vm_sanitize_flags;
19296 
19297 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19298 	    inheritance);
19299 	if (__improbable(result != KERN_SUCCESS)) {
19300 		return result;
19301 	}
19302 
19303 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19304 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19305 	    cur_protection, max_protection);
19306 	if (__improbable(result != KERN_SUCCESS)) {
19307 		return result;
19308 	}
19309 
19310 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19311 	if (__improbable(result != KERN_SUCCESS)) {
19312 		return result;
19313 	}
19314 
19315 	/*
19316 	 * If the user is requesting that we return the address of the
19317 	 * first byte of the data (rather than the base of the page),
19318 	 * then we use different rounding semantics: specifically,
19319 	 * we assume that (memory_address, size) describes a region
19320 	 * all of whose pages we must cover, rather than a base to be truncated
19321 	 * down and a size to be added to that base.  So we figure out
19322 	 * the highest page that the requested region includes and make
19323 	 * sure that the size will cover it.
19324 	 *
19325 	 * The key example we're worried about is of the form:
19326 	 *
19327 	 *              memory_address = 0x1ff0, size = 0x20
19328 	 *
19329 	 * With the old semantics, we round down the memory_address to 0x1000
19330 	 * and round up the size to 0x1000, resulting in our covering *only*
19331 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
19332 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
19333 	 * 0x1000 and page 0x2000 in the region we remap.
19334 	 *
19335 	 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19336 	 */
19337 	vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19338 	if (!vmk_flags.vmf_return_data_addr) {
19339 		vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19340 	}
19341 
19342 	result = vm_sanitize_addr_size(memory_address_u, size_u,
19343 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19344 	    vm_sanitize_flags, memory_address, memory_end,
19345 	    memory_size);
19346 	if (__improbable(result != KERN_SUCCESS)) {
19347 		return result;
19348 	}
19349 
19350 	*target_addr = vm_sanitize_addr(target_map, address_u);
19351 	return KERN_SUCCESS;
19352 }
19353 
19354 /*
19355  *	Routine:	vm_remap
19356  *
19357  *			Maps a portion of a task's address space.
19358  *			The mapped region must not overlap more than
19359  *			one VM memory object. Protection and
19360  *			inheritance attributes remain the same as in
19361  *			the original task and are out parameters.
19362  *			The source and target tasks can be identical.
19363  *			Other attributes are the same as for vm_map().
19364  */
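/*
 * Implementation outline (a sketch of the flow below): sanitize the
 * caller-supplied address/size/protection/inheritance, extract the source
 * range into a vm_map_copy_t with vm_map_copy_extract(), adjust it to the
 * target map's page size with vm_map_copy_adjust_to_target() when the page
 * sizes differ, locate (or validate) the target address range, then move the
 * copied entries into "target_map", applying per-entry fixups such as the
 * VM_PROT_COPY and resilient-mapping adjustments.
 */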
19365 kern_return_t
19366 vm_map_remap(
19367 	vm_map_t                target_map,
19368 	vm_map_address_ut      *address_u,
19369 	vm_map_size_ut          size_u,
19370 	vm_map_offset_ut        mask_u,
19371 	vm_map_kernel_flags_t   vmk_flags,
19372 	vm_map_t                src_map,
19373 	vm_map_offset_ut        memory_address_u,
19374 	boolean_t               copy,
19375 	vm_prot_ut             *cur_protection_u, /* IN/OUT */
19376 	vm_prot_ut             *max_protection_u, /* IN/OUT */
19377 	vm_inherit_ut           inheritance_u)
19378 {
19379 	vm_map_address_t        target_addr, mask;
19380 	vm_map_size_t           target_size;
19381 	vm_map_offset_t         memory_address, memory_end;
19382 	vm_map_size_t           memory_size;
19383 	vm_prot_t               cur_protection, max_protection;
19384 	vm_inherit_t            inheritance;
19385 	kern_return_t           result;
19386 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
19387 	vm_map_copy_t           copy_map;
19388 	vm_map_offset_t         offset_in_mapping;
19389 	vm_map_size_t           src_page_mask, target_page_mask;
19390 	vm_map_size_t           initial_size;
19391 	VM_MAP_ZAP_DECLARE(zap_list);
19392 
19393 	if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19394 		return KERN_INVALID_ARGUMENT;
19395 	}
19396 	src_page_mask    = VM_MAP_PAGE_MASK(src_map);
19397 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
19398 
19399 	if (src_page_mask != target_page_mask) {
19400 		if (copy) {
19401 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19402 		} else {
19403 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19404 		}
19405 	}
19406 
19407 	/*
19408 	 * Sanitize any input parameters that are addr/size/prot/inherit
19409 	 */
19410 	result = vm_map_remap_sanitize(src_map,
19411 	    target_map,
19412 	    *address_u,
19413 	    size_u,
19414 	    mask_u,
19415 	    memory_address_u,
19416 	    *cur_protection_u,
19417 	    *max_protection_u,
19418 	    inheritance_u,
19419 	    vmk_flags,
19420 	    &target_addr,
19421 	    &mask,
19422 	    &memory_address,
19423 	    &memory_end,
19424 	    &memory_size,
19425 	    &cur_protection,
19426 	    &max_protection,
19427 	    &inheritance);
19428 	if (__improbable(result != KERN_SUCCESS)) {
19429 		return vm_sanitize_get_kr(result);
19430 	}
19431 
19432 	if (vmk_flags.vmf_return_data_addr) {
19433 		/*
19434 		 * This is safe to unwrap now that the quantities
19435 		 * have been validated and rounded up normally.
19436 		 */
19437 		offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19438 		    memory_address_u);
19439 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19440 	} else {
19441 		/*
19442 		 * IMPORTANT:
19443 		 * This legacy code path is broken: for the range mentioned
19444 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19445 		 * two 4k pages, it yields [ memory_address = 0x1000,
19446 		 * size = 0x1000 ], which covers only the first 4k page.
19447 		 * BUT some code unfortunately depends on this bug, so we
19448 		 * can't fix it without breaking something.
19449 		 * New code should get automatically opted in the new
19450 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
19451 		 */
19452 		offset_in_mapping = 0;
19453 		initial_size = memory_size;
19454 	}
19455 
19456 	if (vmk_flags.vmf_resilient_media) {
19457 		/* must be copy-on-write to be "media resilient" */
19458 		if (!copy) {
19459 			return KERN_INVALID_ARGUMENT;
19460 		}
19461 	}
19462 
19463 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19464 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19465 
19466 	assert(memory_size != 0);
19467 	result = vm_map_copy_extract(src_map,
19468 	    memory_address,
19469 	    memory_size,
19470 	    copy, &copy_map,
19471 	    &cur_protection, /* IN/OUT */
19472 	    &max_protection, /* IN/OUT */
19473 	    inheritance,
19474 	    vmk_flags);
19475 	if (result != KERN_SUCCESS) {
19476 		return result;
19477 	}
19478 	assert(copy_map != VM_MAP_COPY_NULL);
19479 
19480 	/*
19481 	 * Handle the policy for vm map ranges
19482 	 *
19483 	 * If the maps differ, the target_map policy applies like for vm_map()
19484 	 * For same mapping remaps, we preserve the range.
19485 	 */
19486 	if (vmk_flags.vmkf_copy_same_map) {
19487 		vmk_flags.vmkf_range_id = copy_map->orig_range;
19488 	} else {
19489 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19490 	}
19491 
19492 	target_size = memory_size;
19493 	if (src_page_mask != target_page_mask) {
19494 		vm_map_copy_t   target_copy_map;
19495 		vm_map_offset_t overmap_start = 0;
19496 		vm_map_offset_t overmap_end   = 0;
19497 		vm_map_offset_t trimmed_start = 0;
19498 
19499 		target_copy_map = copy_map; /* can modify "copy_map" itself */
19500 		DEBUG4K_ADJUST("adjusting...\n");
19501 		result = vm_map_copy_adjust_to_target(
19502 			copy_map,
19503 			offset_in_mapping, /* offset */
19504 			initial_size,
19505 			target_map,
19506 			copy,
19507 			&target_copy_map,
19508 			&overmap_start,
19509 			&overmap_end,
19510 			&trimmed_start);
19511 		if (result != KERN_SUCCESS) {
19512 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19513 			vm_map_copy_discard(copy_map);
19514 			return result;
19515 		}
19516 		if (trimmed_start == 0) {
19517 			/* nothing trimmed: no adjustment needed */
19518 		} else if (trimmed_start >= offset_in_mapping) {
19519 			/* trimmed more than offset_in_mapping: nothing left */
19520 			assert(overmap_start == 0);
19521 			assert(overmap_end == 0);
19522 			offset_in_mapping = 0;
19523 		} else {
19524 			/* trimmed some of offset_in_mapping: adjust */
19525 			assert(overmap_start == 0);
19526 			assert(overmap_end == 0);
19527 			offset_in_mapping -= trimmed_start;
19528 		}
19529 		offset_in_mapping += overmap_start;
19530 		target_size = target_copy_map->size;
19531 	}
19532 
19533 	/*
19534 	 * Allocate/check a range of free virtual address
19535 	 * space for the target
19536 	 */
19537 	target_size = vm_map_round_page(target_size, target_page_mask);
19538 
19539 	if (target_size == 0) {
19540 		vm_map_copy_discard(copy_map);
19541 		return KERN_INVALID_ARGUMENT;
19542 	}
19543 
19544 	if (__improbable(!vm_map_is_map_size_valid(
19545 		    target_map, target_size, vmk_flags.vmkf_no_soft_limit))) {
19546 		vm_map_copy_discard(copy_map);
19547 		return KERN_NO_SPACE;
19548 	}
19549 
19550 	vm_map_lock(target_map);
19551 
19552 	if (!vmk_flags.vmf_fixed) {
19553 		result = vm_map_locate_space_anywhere(target_map, target_size,
19554 		    mask, vmk_flags, &target_addr, &insp_entry);
19555 	} else {
19556 		/*
19557 		 * vm_map_locate_space_fixed will reject overflowing
19558 		 * target_addr + target_size values
19559 		 */
19560 		result = vm_map_locate_space_fixed(target_map, target_addr,
19561 		    target_size, mask, vmk_flags, &insp_entry, &zap_list);
19562 
19563 		if (result == KERN_MEMORY_PRESENT) {
19564 			assert(!vmk_flags.vmkf_already);
19565 			insp_entry = VM_MAP_ENTRY_NULL;
19566 			result = KERN_NO_SPACE;
19567 		}
19568 	}
19569 
19570 	if (result == KERN_SUCCESS) {
19571 		while (vm_map_copy_first_entry(copy_map) !=
19572 		    vm_map_copy_to_entry(copy_map)) {
19573 			vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19574 
19575 			vm_map_copy_entry_unlink(copy_map, entry);
19576 
19577 			if (vmk_flags.vmkf_remap_prot_copy) {
19578 				/*
19579 				 * This vm_map_remap() is for a
19580 				 * vm_protect(VM_PROT_COPY), so the caller
19581 				 * expects to be allowed to add write access
19582 				 * to this new mapping.  This is done by
19583 				 * adding VM_PROT_WRITE to each entry's
19584 				 * max_protection... unless some security
19585 				 * settings disallow it.
19586 				 */
19587 				bool allow_write = false;
19588 				if (entry->vme_permanent) {
19589 					/* immutable mapping... */
19590 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
19591 					    developer_mode_state()) {
19592 						/*
19593 						 * ... but executable and
19594 						 * possibly being debugged,
19595 						 * so let's allow it to become
19596 						 * writable, for breakpoints
19597 						 * and dtrace probes, for
19598 						 * example.
19599 						 */
19600 						allow_write = true;
19601 					} else {
19602 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19603 						    proc_selfpid(),
19604 						    (get_bsdtask_info(current_task())
19605 						    ? proc_name_address(get_bsdtask_info(current_task()))
19606 						    : "?"),
19607 						    (uint64_t)memory_address,
19608 						    (uint64_t)memory_size,
19609 						    entry->protection,
19610 						    entry->max_protection,
19611 						    developer_mode_state());
19612 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19613 						    vm_map_entry_t, entry,
19614 						    vm_map_offset_t, entry->vme_start,
19615 						    vm_map_offset_t, entry->vme_end,
19616 						    vm_prot_t, entry->protection,
19617 						    vm_prot_t, entry->max_protection,
19618 						    int, VME_ALIAS(entry));
19619 					}
19620 				} else {
19621 					allow_write = true;
19622 				}
19623 
19624 				/*
19625 				 * VM_PROT_COPY: allow this mapping to become
19626 				 * writable, unless it was "permanent".
19627 				 */
19628 				if (allow_write) {
19629 					entry->max_protection |= VM_PROT_WRITE;
19630 				}
19631 			}
19632 			if (vmk_flags.vmf_resilient_codesign) {
19633 				/* no codesigning -> read-only access */
19634 				entry->max_protection = VM_PROT_READ;
19635 				entry->protection = VM_PROT_READ;
19636 				entry->vme_resilient_codesign = TRUE;
19637 			}
19638 			entry->vme_start += target_addr;
19639 			entry->vme_end += target_addr;
19640 			assert(!entry->map_aligned);
19641 			if (vmk_flags.vmf_resilient_media &&
19642 			    !entry->is_sub_map &&
19643 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19644 			    VME_OBJECT(entry)->internal)) {
19645 				entry->vme_resilient_media = TRUE;
19646 			}
19647 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19648 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19649 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19650 			vm_map_store_entry_link(target_map, insp_entry, entry,
19651 			    vmk_flags);
19652 			insp_entry = entry;
19653 		}
19654 	}
19655 
19656 	if (vmk_flags.vmf_resilient_codesign) {
19657 		cur_protection = VM_PROT_READ;
19658 		max_protection = VM_PROT_READ;
19659 	}
19660 
19661 	if (result == KERN_SUCCESS) {
19662 		target_map->size += target_size;
19663 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19664 	}
19665 	vm_map_unlock(target_map);
19666 
19667 	vm_map_zap_dispose(&zap_list);
19668 
19669 	if (result == KERN_SUCCESS && target_map->wiring_required) {
19670 		result = vm_map_wire_nested(target_map, target_addr,
19671 		    target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19672 		    TRUE, PMAP_NULL, 0, NULL);
19673 	}
19674 
19675 	if (result == KERN_SUCCESS) {
19676 #if KASAN
19677 		if (target_map->pmap == kernel_pmap) {
19678 			kasan_notify_address(target_addr, target_size);
19679 		}
19680 #endif
19681 		/*
19682 		 * If requested, return the address of the data pointed to by the
19683 		 * request, rather than the base of the resulting page.
19684 		 */
19685 		if (vmk_flags.vmf_return_data_addr) {
19686 			target_addr += offset_in_mapping;
19687 		}
19688 
19689 		/*
19690 		 * Update OUT parameters.
19691 		 */
19692 		*address_u = vm_sanitize_wrap_addr(target_addr);
19693 
19694 		*cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19695 		*max_protection_u = vm_sanitize_wrap_prot(max_protection);
19696 	}
19697 
19698 	if (src_page_mask != target_page_mask) {
19699 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19700 	}
19701 	vm_map_copy_discard(copy_map);
19702 	copy_map = VM_MAP_COPY_NULL;
19703 
19704 	return result;
19705 }
19706 
19707 /*
19708  *	vm_map_switch_to:
19709  *
19710  *	Set the address map for the current thread to the specified map.
19711  *  Returns a struct containing info about the previous map, which should be
19712  *  Returns a struct containing info about the previous map, which
19713  *  should be restored with `vm_map_switch_back()`.
19714 
19715 vm_map_switch_context_t
19716 vm_map_switch_to(vm_map_t map)
19717 {
19718 	thread_t thread = current_thread();
19719 	vm_map_t oldmap = thread->map;
19720 
19721 	/*
19722 	 * Deactivate the current map and activate the requested map
19723 	 */
19724 	mp_disable_preemption();
19725 	PMAP_SWITCH_USER(thread, map, cpu_number());
19726 	mp_enable_preemption();
19727 
19728 	vm_map_lock(map);
19729 	task_t task = map->owning_task;
19730 	if (task) {
19731 		task_reference(task);
19732 	}
19733 	vm_map_unlock(map);
19734 
19735 	return (vm_map_switch_context_t) { oldmap, task };
19736 }
19737 
19738 void
19739 vm_map_switch_back(vm_map_switch_context_t ctx)
19740 {
19741 	thread_t thread = current_thread();
19742 	task_t task = ctx.task;
19743 	vm_map_t map = ctx.map;
19744 
19745 	if (task) {
19746 		task_deallocate(task);
19747 	} else {
19748 		/*
19749 		 * We want to make sure that vm_map_setup was not called while the
19750 		 * map was switched. This allows us to guarantee the property that
19751 		 * we always have a reference on current_map()->owning_task if it is
19752 		 * not NULL.
19753 		 */
19754 		assert(!thread->map->owning_task);
19755 	}
19756 
19757 	/*
19758 	 * Restore the original map from prior to vm_map_switch_to
19759 	 */
19760 	mp_disable_preemption();
19761 	PMAP_SWITCH_USER(thread, map, cpu_number());
19762 	mp_enable_preemption();
19763 }
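/*
 * Illustrative sketch (editorial addition, not part of xnu): the intended
 * pairing of vm_map_switch_to() / vm_map_switch_back().  A caller that
 * needs to temporarily take on the identity of another map -- as
 * vm_map_write_user() and vm_map_read_user() do below -- holds a map
 * reference across the switch and always restores the saved context:
 *
 *	vm_map_switch_context_t ctx;
 *
 *	vm_map_reference(map);
 *	ctx = vm_map_switch_to(map);
 *	... copyin()/copyout() against "map" ...
 *	vm_map_switch_back(ctx);
 *	vm_map_deallocate(map);
 */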
19764 
19765 static __attribute__((always_inline, warn_unused_result))
19766 kern_return_t
19767 vm_map_rw_user_sanitize(
19768 	vm_map_t                map,
19769 	vm_map_address_ut       addr_u,
19770 	vm_size_ut              size_u,
19771 	vm_sanitize_caller_t    vm_sanitize_caller,
19772 	vm_map_address_t       *addr,
19773 	vm_map_address_t       *end,
19774 	vm_map_size_t          *size)
19775 {
19776 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19777 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
19778 	    VM_SANITIZE_FLAGS_CHECK_ADDR_RANGE;
19779 
19780 	return vm_sanitize_addr_size(addr_u, size_u,
19781 	           vm_sanitize_caller, map,
19782 	           flags,
19783 	           addr, end, size);
19784 }
19785 
19786 /*
19787  *	Routine:	vm_map_write_user
19788  *
19789  *	Description:
19790  *		Copy out data from a kernel space into space in the
19791  *		destination map. The space must already exist in the
19792  *		destination map.
19793  *		NOTE:  This routine should only be called by threads
19794  *		which can block on a page fault, i.e. kernel-mode user
19795  *		threads.
19796  *
19797  */
19798 kern_return_t
19799 vm_map_write_user(
19800 	vm_map_t                map,
19801 	void                   *src_p,
19802 	vm_map_address_ut       dst_addr_u,
19803 	vm_size_ut              size_u)
19804 {
19805 	kern_return_t    kr;
19806 	vm_map_address_t dst_addr, dst_end;
19807 	vm_map_size_t    size;
19808 
19809 	/*
19810 	 * src_p isn't validated: [src_p, src_p + size_u)
19811 	 * is trusted kernel input.
19812 	 *
19813 	 * dst_addr_u and size_u are untrusted and need to be sanitized.
19814 	 */
19815 	kr = vm_map_rw_user_sanitize(map,
19816 	    dst_addr_u,
19817 	    size_u,
19818 	    VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19819 	    &dst_addr,
19820 	    &dst_end,
19821 	    &size);
19822 	if (__improbable(kr != KERN_SUCCESS)) {
19823 		return vm_sanitize_get_kr(kr);
19824 	}
19825 
19826 	if (current_map() == map) {
19827 		if (copyout(src_p, dst_addr, size)) {
19828 			kr = KERN_INVALID_ADDRESS;
19829 		}
19830 	} else {
19831 		vm_map_switch_context_t switch_ctx;
19832 
19833 		/* take on the identity of the target map while doing */
19834 		/* the transfer */
19835 
19836 		vm_map_reference(map);
19837 		switch_ctx = vm_map_switch_to(map);
19838 		if (copyout(src_p, dst_addr, size)) {
19839 			kr = KERN_INVALID_ADDRESS;
19840 		}
19841 		vm_map_switch_back(switch_ctx);
19842 		vm_map_deallocate(map);
19843 	}
19844 	return kr;
19845 }
19846 
19847 /*
19848  *	Routine:	vm_map_read_user
19849  *
19850  *	Description:
19851  *		Copy in data from a user space source map into the
19852  *		kernel map. The space must already exist in the
19853  *		kernel map.
19854  *		NOTE:  This routine should only be called by threads
19855  *		which can block on a page fault, i.e. kernel-mode user
19856  *		threads.
19857  *
19858  */
19859 kern_return_t
19860 vm_map_read_user(
19861 	vm_map_t                map,
19862 	vm_map_address_ut       src_addr_u,
19863 	void                   *dst_p,
19864 	vm_size_ut              size_u)
19865 {
19866 	kern_return_t    kr;
19867 	vm_map_address_t src_addr, src_end;
19868 	vm_map_size_t    size;
19869 
19870 	/*
19871 	 * dst_p isn't validated: [dst_p, dst_p + size_u)
19872 	 * is trusted kernel input.
19873 	 *
19874 	 * src_addr_u and size_u are untrusted and need to be sanitized.
19875 	 */
19876 	kr = vm_map_rw_user_sanitize(map,
19877 	    src_addr_u,
19878 	    size_u,
19879 	    VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19880 	    &src_addr,
19881 	    &src_end,
19882 	    &size);
19883 	if (__improbable(kr != KERN_SUCCESS)) {
19884 		return vm_sanitize_get_kr(kr);
19885 	}
19886 
19887 	if (current_map() == map) {
19888 		if (copyin(src_addr, dst_p, size)) {
19889 			kr = KERN_INVALID_ADDRESS;
19890 		}
19891 	} else {
19892 		vm_map_switch_context_t switch_ctx;
19893 
19894 		/* take on the identity of the target map while doing */
19895 		/* the transfer */
19896 
19897 		vm_map_reference(map);
19898 		switch_ctx = vm_map_switch_to(map);
19899 		if (copyin(src_addr, dst_p, size)) {
19900 			kr = KERN_INVALID_ADDRESS;
19901 		}
19902 		vm_map_switch_back(switch_ctx);
19903 		vm_map_deallocate(map);
19904 	}
19905 	return kr;
19906 }
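/*
 * Illustrative sketch (editorial addition, not part of xnu): a hypothetical
 * kernel caller moving a small structure in and out of a user map with the
 * two helpers above.  "user_map", "user_addr_u", "size_u" and "struct foo"
 * are assumptions for the example; the address and size arrive as unsafe
 * ("_ut") values (size_u is assumed to hold sizeof(f)) and are sanitized
 * inside the helpers.
 *
 *	struct foo f;
 *	kern_return_t kr;
 *
 *	kr = vm_map_read_user(user_map, user_addr_u, &f, size_u);
 *	if (kr == KERN_SUCCESS) {
 *		f.flags |= 1;
 *		kr = vm_map_write_user(user_map, &f, user_addr_u, size_u);
 *	}
 */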
19907 
19908 
19909 static __attribute__((always_inline, warn_unused_result))
19910 kern_return_t
19911 vm_map_check_protection_sanitize(
19912 	vm_map_t                map,
19913 	vm_map_offset_ut        start_u,
19914 	vm_map_offset_ut        end_u,
19915 	vm_prot_ut              protection_u,
19916 	vm_sanitize_caller_t    vm_sanitize_caller,
19917 	vm_map_offset_t        *start,
19918 	vm_map_offset_t        *end,
19919 	vm_prot_t              *protection)
19920 {
19921 	kern_return_t           kr;
19922 	vm_map_size_t           size;
19923 
19924 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19925 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19926 	    &size);
19927 	if (__improbable(kr != KERN_SUCCESS)) {
19928 		return kr;
19929 	}
19930 
19931 	/*
19932 	 * Given that the protection is used only for comparisons below,
19933 	 * no sanitization is applied to it.
19934 	 */
19935 	*protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19936 
19937 	return KERN_SUCCESS;
19938 }
19939 
19940 /*
19941  *	vm_map_check_protection:
19942  *
19943  *	Assert that the target map allows the specified
19944  *	privilege on the entire address region given.
19945  *	The entire region must be allocated.
19946  */
19947 boolean_t
19948 vm_map_check_protection(
19949 	vm_map_t                map,
19950 	vm_map_offset_ut        start_u,
19951 	vm_map_offset_ut        end_u,
19952 	vm_prot_ut              protection_u,
19953 	vm_sanitize_caller_t    vm_sanitize_caller)
19954 {
19955 	vm_map_entry_t entry;
19956 	vm_map_entry_t tmp_entry;
19957 	vm_map_offset_t start;
19958 	vm_map_offset_t end;
19959 	vm_prot_t protection;
19960 	kern_return_t kr;
19961 
19962 	kr = vm_map_check_protection_sanitize(map,
19963 	    start_u,
19964 	    end_u,
19965 	    protection_u,
19966 	    vm_sanitize_caller,
19967 	    &start,
19968 	    &end,
19969 	    &protection);
19970 	if (__improbable(kr != KERN_SUCCESS)) {
19971 		kr = vm_sanitize_get_kr(kr);
19972 		if (kr == KERN_SUCCESS) {
19973 			return true;
19974 		}
19975 		return false;
19976 	}
19977 
19978 	vm_map_lock(map);
19979 
19980 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
19981 		vm_map_unlock(map);
19982 		return false;
19983 	}
19984 
19985 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19986 		vm_map_unlock(map);
19987 		return false;
19988 	}
19989 
19990 	entry = tmp_entry;
19991 
19992 	while (start < end) {
19993 		if (entry == vm_map_to_entry(map)) {
19994 			vm_map_unlock(map);
19995 			return false;
19996 		}
19997 
19998 		/*
19999 		 *	No holes allowed!
20000 		 */
20001 
20002 		if (start < entry->vme_start) {
20003 			vm_map_unlock(map);
20004 			return false;
20005 		}
20006 
20007 		/*
20008 		 * Check protection associated with entry.
20009 		 */
20010 
20011 		if ((entry->protection & protection) != protection) {
20012 			vm_map_unlock(map);
20013 			return false;
20014 		}
20015 
20016 		/* go to next entry */
20017 
20018 		start = entry->vme_end;
20019 		entry = entry->vme_next;
20020 	}
20021 	vm_map_unlock(map);
20022 	return true;
20023 }
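/*
 * Illustrative sketch (editorial addition, not part of xnu):
 * vm_map_check_protection() is a simple predicate, so a hypothetical
 * caller validating that a range is fully mapped and readable before
 * operating on it could look like this.  "start_u", "end_u" and
 * "caller_id" (the caller's own vm_sanitize_caller_t) are assumptions
 * for the example.
 *
 *	if (!vm_map_check_protection(map, start_u, end_u,
 *	    vm_sanitize_wrap_prot(VM_PROT_READ), caller_id)) {
 *		return KERN_PROTECTION_FAILURE;
 *	}
 */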
20024 
20025 kern_return_t
20026 vm_map_purgable_control(
20027 	vm_map_t                map,
20028 	vm_map_offset_ut        address_u,
20029 	vm_purgable_t           control,
20030 	int                    *state)
20031 {
20032 	vm_map_offset_t         address;
20033 	vm_map_entry_t          entry;
20034 	vm_object_t             object;
20035 	kern_return_t           kr;
20036 	boolean_t               was_nonvolatile;
20037 
20038 	/*
20039 	 * Vet all the input parameters and current type and state of the
20040 	 * underlaying object.  Return with an error if anything is amiss.
20041 	 * underlying object.  Return with an error if anything is amiss.
20042 	if (map == VM_MAP_NULL) {
20043 		return KERN_INVALID_ARGUMENT;
20044 	}
20045 
20046 	if (control != VM_PURGABLE_SET_STATE &&
20047 	    control != VM_PURGABLE_GET_STATE &&
20048 	    control != VM_PURGABLE_PURGE_ALL &&
20049 	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
20050 		return KERN_INVALID_ARGUMENT;
20051 	}
20052 
20053 	if (control == VM_PURGABLE_PURGE_ALL) {
20054 		vm_purgeable_object_purge_all();
20055 		return KERN_SUCCESS;
20056 	}
20057 
20058 	if ((control == VM_PURGABLE_SET_STATE ||
20059 	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
20060 	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
20061 	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
20062 		return KERN_INVALID_ARGUMENT;
20063 	}
20064 
20065 	address = vm_sanitize_addr(map, address_u);
20066 
20067 	vm_map_lock_read(map);
20068 
20069 	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
20070 		/*
20071 		 * Must pass a valid non-submap address.
20072 		 */
20073 		vm_map_unlock_read(map);
20074 		return KERN_INVALID_ADDRESS;
20075 	}
20076 
20077 	if ((entry->protection & VM_PROT_WRITE) == 0 &&
20078 	    control != VM_PURGABLE_GET_STATE) {
20079 		/*
20080 		 * Can't apply purgable controls to something you can't write.
20081 		 */
20082 		vm_map_unlock_read(map);
20083 		return KERN_PROTECTION_FAILURE;
20084 	}
20085 
20086 	object = VME_OBJECT(entry);
20087 	if (object == VM_OBJECT_NULL ||
20088 	    object->purgable == VM_PURGABLE_DENY) {
20089 		/*
20090 		 * Object must already be present and be purgeable.
20091 		 */
20092 		vm_map_unlock_read(map);
20093 		return KERN_INVALID_ARGUMENT;
20094 	}
20095 
20096 	vm_object_lock(object);
20097 
20098 #if 00
20099 	if (VME_OFFSET(entry) != 0 ||
20100 	    entry->vme_end - entry->vme_start != object->vo_size) {
20101 		/*
20102 		 * Can only apply purgable controls to the whole (existing)
20103 		 * object at once.
20104 		 */
20105 		vm_map_unlock_read(map);
20106 		vm_object_unlock(object);
20107 		return KERN_INVALID_ARGUMENT;
20108 	}
20109 #endif
20110 
20111 	assert(!entry->is_sub_map);
20112 	assert(!entry->use_pmap); /* purgeable has its own accounting */
20113 
20114 	vm_map_unlock_read(map);
20115 
20116 	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
20117 
20118 	kr = vm_object_purgable_control(object, control, state);
20119 
20120 	if (was_nonvolatile &&
20121 	    object->purgable != VM_PURGABLE_NONVOLATILE &&
20122 	    map->pmap == kernel_pmap) {
20123 #if DEBUG
20124 		object->vo_purgeable_volatilizer = kernel_task;
20125 #endif /* DEBUG */
20126 	}
20127 
20128 	vm_object_unlock(object);
20129 
20130 	return kr;
20131 }
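/*
 * Illustrative sketch (editorial addition, not part of xnu): marking a
 * purgeable mapping volatile and later querying it.  "addr_u" is an
 * assumption for the example; "state" is an IN parameter for SET_STATE
 * and an OUT parameter for GET_STATE, as vetted above.
 *
 *	int state = VM_PURGABLE_VOLATILE;
 *	kern_return_t kr;
 *
 *	kr = vm_map_purgable_control(map, addr_u,
 *	    VM_PURGABLE_SET_STATE, &state);
 *	...
 *	kr = vm_map_purgable_control(map, addr_u,
 *	    VM_PURGABLE_GET_STATE, &state);
 */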
20132 
20133 void
20134 vm_map_footprint_query_page_info(
20135 	vm_map_t        map,
20136 	vm_map_entry_t  map_entry,
20137 	vm_map_offset_t curr_s_offset,
20138 	int             *disposition_p)
20139 {
20140 	int             pmap_disp;
20141 	vm_object_t     object = VM_OBJECT_NULL;
20142 	int             disposition;
20143 	int             effective_page_size;
20144 
20145 	vm_map_lock_assert_held(map);
20146 	assert(!map->has_corpse_footprint);
20147 	assert(curr_s_offset >= map_entry->vme_start);
20148 	assert(curr_s_offset < map_entry->vme_end);
20149 
20150 	if (map_entry->is_sub_map) {
20151 		if (!map_entry->use_pmap) {
20152 			/* nested pmap: no footprint */
20153 			*disposition_p = 0;
20154 			return;
20155 		}
20156 	} else {
20157 		object = VME_OBJECT(map_entry);
20158 		if (object == VM_OBJECT_NULL) {
20159 			/* nothing mapped here: no need to ask */
20160 			*disposition_p = 0;
20161 			return;
20162 		}
20163 	}
20164 
20165 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
20166 
20167 	pmap_disp = 0;
20168 
20169 	/*
20170 	 * Query the pmap.
20171 	 */
20172 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
20173 
20174 	/*
20175 	 * Compute this page's disposition.
20176 	 */
20177 	disposition = 0;
20178 
20179 	/* deal with "alternate accounting" first */
20180 	if (!map_entry->is_sub_map &&
20181 	    object->vo_no_footprint) {
20182 		/* does not count in footprint */
20183 //		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20184 	} else if (!map_entry->is_sub_map &&
20185 	    !object->internal &&
20186 	    object->vo_ledger_tag &&
20187 	    VM_OBJECT_OWNER(object) != NULL &&
20188 	    VM_OBJECT_OWNER(object)->map == map) {
20189 		/* owned external object: wired pages count in footprint */
20190 		assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20191 		if ((((curr_s_offset
20192 		    - map_entry->vme_start
20193 		    + VME_OFFSET(map_entry))
20194 		    / effective_page_size) <
20195 		    object->wired_page_count)) {
20196 			/*
20197 			 * External object owned by this task: report the first
20198 			 * "#wired" pages as "resident" (to show that they
20199 			 * contribute to the footprint) but not "dirty"
20200 			 * (to avoid double-counting with the fake "owned"
20201 			 * region we'll report at the end of the address space
20202 			 * to account for all (mapped or not) memory owned
20203 			 * by this task).
20204 			 */
20205 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20206 		}
20207 	} else if (!map_entry->is_sub_map &&
20208 	    object->internal &&
20209 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
20210 	    (object->purgable == VM_PURGABLE_DENY &&
20211 	    object->vo_ledger_tag)) &&
20212 	    VM_OBJECT_OWNER(object) != NULL &&
20213 	    VM_OBJECT_OWNER(object)->map == map) {
20214 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20215 		if ((((curr_s_offset
20216 		    - map_entry->vme_start
20217 		    + VME_OFFSET(map_entry))
20218 		    / effective_page_size) <
20219 		    (object->resident_page_count +
20220 		    vm_compressor_pager_get_count(object->pager)))) {
20221 			/*
20222 			 * Non-volatile purgeable object owned
20223 			 * by this task: report the first
20224 			 * "#resident + #compressed" pages as
20225 			 * "resident" (to show that they
20226 			 * contribute to the footprint) but not
20227 			 * "dirty" (to avoid double-counting
20228 			 * with the fake "non-volatile" region
20229 			 * we'll report at the end of the
20230 			 * address space to account for all
20231 			 * (mapped or not) non-volatile memory
20232 			 * owned by this task.
20233 			 * owned by this task).
20234 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20235 		}
20236 	} else if (!map_entry->is_sub_map &&
20237 	    object->internal &&
20238 	    (object->purgable == VM_PURGABLE_VOLATILE ||
20239 	    object->purgable == VM_PURGABLE_EMPTY) &&
20240 	    VM_OBJECT_OWNER(object) != NULL &&
20241 	    VM_OBJECT_OWNER(object)->map == map) {
20242 		if (object->internal) {
20243 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20244 		}
20245 		if ((((curr_s_offset
20246 		    - map_entry->vme_start
20247 		    + VME_OFFSET(map_entry))
20248 		    / effective_page_size) <
20249 		    object->wired_page_count)) {
20250 			/*
20251 			 * Volatile|empty purgeable object owned
20252 			 * by this task: report the first
20253 			 * "#wired" pages as "resident" (to
20254 			 * show that they contribute to the
20255 			 * footprint) but not "dirty" (to avoid
20256 			 * double-counting with the fake
20257 			 * "non-volatile" region we'll report
20258 			 * at the end of the address space to
20259 			 * account for all (mapped or not)
20260 			 * non-volatile memory owned by this
20261 			 * task.
20262 			 * task).
20263 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20264 		}
20265 	} else if (!map_entry->is_sub_map &&
20266 	    map_entry->iokit_acct &&
20267 	    object->internal &&
20268 	    object->purgable == VM_PURGABLE_DENY) {
20269 		/*
20270 		 * Non-purgeable IOKit memory: phys_footprint
20271 		 * includes the entire virtual mapping.
20272 		 */
20273 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20274 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20275 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20276 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20277 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20278 		/* alternate accounting */
20279 #if __arm64__ && (DEVELOPMENT || DEBUG)
20280 		if (map->pmap->footprint_was_suspended) {
20281 			/*
20282 			 * The assertion below can fail if dyld
20283 			 * suspended footprint accounting
20284 			 * while doing some adjustments to
20285 			 * this page;  the mapping would say
20286 			 * "use pmap accounting" but the page
20287 			 * would be marked "alternate
20288 			 * accounting".
20289 			 */
20290 		} else
20291 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20292 		{
20293 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20294 		}
20295 		disposition = 0;
20296 	} else {
20297 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20298 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20299 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20300 			disposition |= VM_PAGE_QUERY_PAGE_REF;
20301 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20302 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20303 			} else {
20304 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20305 			}
20306 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20307 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20308 			}
20309 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20310 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20311 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20312 		}
20313 	}
20314 
20315 	*disposition_p = disposition;
20316 }
20317 
20318 kern_return_t
20319 vm_map_page_info(
20320 	vm_map_t                map,
20321 	vm_map_offset_ut        offset_u,
20322 	vm_page_info_flavor_t   flavor,
20323 	vm_page_info_t          info,
20324 	mach_msg_type_number_t  *count)
20325 {
20326 	return vm_map_page_range_info_internal(map,
20327 	           offset_u, /* start of range */
20328 	           vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */
20329 	           vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded to the page boundary in the call */
20330 	           flavor,
20331 	           info,
20332 	           count);
20333 }
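/*
 * Illustrative sketch (editorial addition, not part of xnu): querying the
 * disposition of a single page through vm_map_page_info().  "offset_u" is
 * an assumption for the example; the count must be initialized to the
 * flavor's size.
 *
 *	struct vm_page_info_basic basic;
 *	mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
 *	kern_return_t kr;
 *
 *	kr = vm_map_page_info(map, offset_u, VM_PAGE_INFO_BASIC,
 *	    (vm_page_info_t)&basic, &count);
 *	if (kr == KERN_SUCCESS &&
 *	    (basic.disposition & VM_PAGE_QUERY_PAGE_PRESENT)) {
 *		... page is resident ...
 *	}
 */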
20334 
20335 static __attribute__((always_inline, warn_unused_result))
20336 kern_return_t
20337 vm_map_page_range_info_sanitize(
20338 	vm_map_t                map,
20339 	vm_map_offset_ut        start_offset_u,
20340 	vm_map_offset_ut        end_offset_u,
20341 	vm_map_offset_t         effective_page_mask,
20342 	vm_map_offset_t        *start,
20343 	vm_map_offset_t        *end,
20344 	vm_map_offset_t        *offset_in_page)
20345 {
20346 	kern_return_t           retval;
20347 	vm_map_size_t           size;
20348 
20349 	/*
20350 	 * Perform validation against map's mask but don't align start/end,
20351 	 * as we need those to be aligned with respect to effective_page_mask.
20352 	 */
20353 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20354 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20355 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20356 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20357 	    end, &size);
20358 	if (retval != KERN_SUCCESS) {
20359 		return retval;
20360 	}
20361 
20362 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20363 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20364 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20365 	    end, &size);
20366 	if (retval != KERN_SUCCESS) {
20367 		return retval;
20368 	}
20369 
20370 	*offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20371 	    start_offset_u);
20372 
20373 	return KERN_SUCCESS;
20374 }
20375 
20376 kern_return_t
20377 vm_map_page_range_info_internal(
20378 	vm_map_t                map,
20379 	vm_map_offset_ut        start_offset_u,
20380 	vm_map_offset_ut        end_offset_u,
20381 	int                     effective_page_shift,
20382 	vm_page_info_flavor_t   flavor,
20383 	vm_page_info_t          info,
20384 	mach_msg_type_number_t  *count)
20385 {
20386 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
20387 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20388 	vm_page_t               m = VM_PAGE_NULL;
20389 	kern_return_t           retval = KERN_SUCCESS;
20390 	int                     disposition = 0;
20391 	int                     ref_count = 0;
20392 	int                     depth = 0, info_idx = 0;
20393 	vm_page_info_basic_t    basic_info = 0;
20394 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20395 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20396 	boolean_t               do_region_footprint;
20397 	ledger_amount_t         ledger_resident, ledger_compressed;
20398 	int                     effective_page_size;
20399 	vm_map_offset_t         effective_page_mask;
20400 
20401 	switch (flavor) {
20402 	case VM_PAGE_INFO_BASIC:
20403 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20404 			/*
20405 			 * The "vm_page_info_basic_data" structure was not
20406 			 * properly padded, so allow the size to be off by
20407 			 * one to maintain backwards binary compatibility...
20408 			 */
20409 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20410 				return KERN_INVALID_ARGUMENT;
20411 			}
20412 		}
20413 		break;
20414 	default:
20415 		return KERN_INVALID_ARGUMENT;
20416 	}
20417 
20418 	if (effective_page_shift == -1) {
20419 		effective_page_shift = vm_self_region_page_shift_safely(map);
20420 		if (effective_page_shift == -1) {
20421 			return KERN_INVALID_ARGUMENT;
20422 		}
20423 	}
20424 	effective_page_size = (1 << effective_page_shift);
20425 	effective_page_mask = effective_page_size - 1;
20426 
20427 
20428 	retval = vm_map_page_range_info_sanitize(map,
20429 	    start_offset_u,
20430 	    end_offset_u,
20431 	    effective_page_mask,
20432 	    &start,
20433 	    &end,
20434 	    &offset_in_page);
20435 	if (retval != KERN_SUCCESS) {
20436 		return vm_sanitize_get_kr(retval);
20437 	}
20438 
20439 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20440 
20441 	do_region_footprint = task_self_region_footprint();
20442 	disposition = 0;
20443 	ref_count = 0;
20444 	depth = 0;
20445 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20446 
20447 	vm_map_lock_read(map);
20448 
20449 
20450 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20451 
20452 	for (curr_s_offset = start; curr_s_offset < end;) {
20453 		/*
20454 		 * New lookup needs reset of these variables.
20455 		 */
20456 		curr_object = object = VM_OBJECT_NULL;
20457 		offset_in_object = 0;
20458 		ref_count = 0;
20459 		depth = 0;
20460 
20461 		if (do_region_footprint &&
20462 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20463 			/*
20464 			 * Request for "footprint" info about a page beyond
20465 			 * the end of address space: this must be for
20466 			 * the fake region vm_map_region_recurse_64()
20467 			 * reported to account for non-volatile purgeable
20468 			 * memory owned by this task.
20469 			 */
20470 			disposition = 0;
20471 
20472 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20473 			    (unsigned) ledger_compressed) {
20474 				/*
20475 				 * We haven't reported all the "non-volatile
20476 				 * compressed" pages yet, so report this fake
20477 				 * page as "compressed".
20478 				 */
20479 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20480 			} else {
20481 				/*
20482 				 * We've reported all the non-volatile
20483 				 * compressed pages but not all the non-volatile
20484 				 * pages, so report this fake page as
20485 				 * "resident dirty".
20486 				 */
20487 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20488 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20489 				disposition |= VM_PAGE_QUERY_PAGE_REF;
20490 			}
20491 			switch (flavor) {
20492 			case VM_PAGE_INFO_BASIC:
20493 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20494 				basic_info->disposition = disposition;
20495 				basic_info->ref_count = 1;
20496 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20497 				basic_info->offset = 0;
20498 				basic_info->depth = 0;
20499 
20500 				info_idx++;
20501 				break;
20502 			}
20503 			curr_s_offset += effective_page_size;
20504 			continue;
20505 		}
20506 
20507 		/*
20508 		 * First, find the map entry covering "curr_s_offset", going down
20509 		 * submaps if necessary.
20510 		 */
20511 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20512 			/* no entry -> no object -> no page */
20513 
20514 			if (curr_s_offset < vm_map_min(map)) {
20515 				/*
20516 				 * Illegal address that falls below map min.
20517 				 */
20518 				curr_e_offset = MIN(end, vm_map_min(map));
20519 			} else if (curr_s_offset >= vm_map_max(map)) {
20520 				/*
20521 				 * Illegal address that falls on/after map max.
20522 				 */
20523 				curr_e_offset = end;
20524 			} else if (map_entry == vm_map_to_entry(map)) {
20525 				/*
20526 				 * Hit a hole.
20527 				 */
20528 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20529 					/*
20530 					 * Empty map.
20531 					 */
20532 					curr_e_offset = MIN(map->max_offset, end);
20533 				} else {
20534 					/*
20535 					 * Hole at start of the map.
20536 					 */
20537 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20538 				}
20539 			} else {
20540 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20541 					/*
20542 					 * Hole at the end of the map.
20543 					 */
20544 					curr_e_offset = MIN(map->max_offset, end);
20545 				} else {
20546 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20547 				}
20548 			}
20549 
20550 			assert(curr_e_offset >= curr_s_offset);
20551 
20552 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20553 
20554 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20555 
20556 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20557 
20558 			curr_s_offset = curr_e_offset;
20559 
20560 			info_idx += num_pages;
20561 
20562 			continue;
20563 		}
20564 
20565 		/* compute offset from this map entry's start */
20566 		offset_in_object = curr_s_offset - map_entry->vme_start;
20567 
20568 		/* compute offset into this map entry's object (or submap) */
20569 		offset_in_object += VME_OFFSET(map_entry);
20570 
20571 		if (map_entry->is_sub_map) {
20572 			vm_map_t sub_map = VM_MAP_NULL;
20573 			vm_page_info_t submap_info = 0;
20574 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20575 
20576 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20577 
20578 			submap_s_offset = offset_in_object;
20579 			submap_e_offset = submap_s_offset + range_len;
20580 
20581 			sub_map = VME_SUBMAP(map_entry);
20582 
20583 			vm_map_reference(sub_map);
20584 			vm_map_unlock_read(map);
20585 
20586 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20587 
20588 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20589 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20590 
20591 			retval = vm_map_page_range_info_internal(sub_map,
20592 			    submap_s_offset,
20593 			    submap_e_offset,
20594 			    effective_page_shift,
20595 			    VM_PAGE_INFO_BASIC,
20596 			    (vm_page_info_t) submap_info,
20597 			    count);
20598 
20599 			assert(retval == KERN_SUCCESS);
20600 
20601 			vm_map_deallocate(sub_map);
20602 			sub_map = VM_MAP_NULL;
20603 			vm_map_lock_read(map);
20604 
20605 			/* Move the "info" index by the number of pages we inspected.*/
20606 			info_idx += range_len >> effective_page_shift;
20607 
20608 			/* Move our current offset by the size of the range we inspected.*/
20609 			curr_s_offset += range_len;
20610 
20611 			continue;
20612 		}
20613 
20614 		object = VME_OBJECT(map_entry);
20615 
20616 		if (object == VM_OBJECT_NULL) {
20617 			/*
20618 			 * We don't have an object here and, hence,
20619 			 * no pages to inspect. We'll fill up the
20620 			 * info structure appropriately.
20621 			 */
20622 
20623 			curr_e_offset = MIN(map_entry->vme_end, end);
20624 
20625 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20626 
20627 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20628 
20629 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20630 
20631 			curr_s_offset = curr_e_offset;
20632 
20633 			info_idx += num_pages;
20634 
20635 			continue;
20636 		}
20637 
20638 		if (do_region_footprint) {
20639 			disposition = 0;
20640 			if (map->has_corpse_footprint) {
20641 				/*
20642 				 * Query the page info data we saved
20643 				 * while forking the corpse.
20644 				 */
20645 				vm_map_corpse_footprint_query_page_info(
20646 					map,
20647 					curr_s_offset,
20648 					&disposition);
20649 			} else {
20650 				/*
20651 				 * Query the live pmap for footprint info
20652 				 * about this page.
20653 				 */
20654 				vm_map_footprint_query_page_info(
20655 					map,
20656 					map_entry,
20657 					curr_s_offset,
20658 					&disposition);
20659 			}
20660 			switch (flavor) {
20661 			case VM_PAGE_INFO_BASIC:
20662 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20663 				basic_info->disposition = disposition;
20664 				basic_info->ref_count = 1;
20665 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20666 				basic_info->offset = 0;
20667 				basic_info->depth = 0;
20668 
20669 				info_idx++;
20670 				break;
20671 			}
20672 			curr_s_offset += effective_page_size;
20673 			continue;
20674 		}
20675 
20676 		vm_object_reference(object);
20677 		/*
20678 		 * Shared mode -- so we can allow other readers
20679 		 * to grab the lock too.
20680 		 */
20681 		vm_object_lock_shared(object);
20682 
20683 		curr_e_offset = MIN(map_entry->vme_end, end);
20684 
20685 		vm_map_unlock_read(map);
20686 
20687 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20688 
20689 		curr_object = object;
20690 
20691 		for (; curr_s_offset < curr_e_offset;) {
20692 			if (object == curr_object) {
20693 				/* account for our object reference above. */
20694 				ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20695 			} else {
20696 				ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20697 			}
20698 
20699 			curr_offset_in_object = offset_in_object;
20700 
20701 			for (;;) {
20702 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20703 
20704 				if (m != VM_PAGE_NULL) {
20705 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20706 					break;
20707 				} else {
20708 					if (curr_object->internal &&
20709 					    curr_object->alive &&
20710 					    !curr_object->terminating &&
20711 					    curr_object->pager_ready) {
20712 						if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20713 						    == VM_EXTERNAL_STATE_EXISTS) {
20714 							/* the pager has that page */
20715 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20716 							break;
20717 						}
20718 					}
20719 
20720 					/*
20721 					 * Go down the VM object shadow chain until we find the page
20722 					 * we're looking for.
20723 					 */
20724 
20725 					if (curr_object->shadow != VM_OBJECT_NULL) {
20726 						vm_object_t shadow = VM_OBJECT_NULL;
20727 
20728 						curr_offset_in_object += curr_object->vo_shadow_offset;
20729 						shadow = curr_object->shadow;
20730 
20731 						vm_object_lock_shared(shadow);
20732 						vm_object_unlock(curr_object);
20733 
20734 						curr_object = shadow;
20735 						depth++;
20736 						continue;
20737 					} else {
20738 						break;
20739 					}
20740 				}
20741 			}
20742 
20743 			/* The ref_count is not strictly accurate: it measures the number   */
20744 			/* of entities holding a ref on the object; they may not be mapping */
20745 			/* the object, or may not be mapping the section holding the        */
20746 			/* target page, but it's still a ballpark number and, though an     */
20747 			/* over-count, it picks up the copy-on-write cases.                 */
20748 
20749 			/* We could also get a picture of page sharing from pmap_attributes */
20750 			/* but this would undercount, as only faulted-in mappings would     */
20751 			/* show up.							    */
20752 
20753 			if ((curr_object == object) && curr_object->shadow) {
20754 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20755 			}
20756 
20757 			if (!curr_object->internal) {
20758 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20759 			}
20760 
20761 			if (m != VM_PAGE_NULL) {
20762 				if (vm_page_is_fictitious(m)) {
20763 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20764 				} else {
20765 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20766 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20767 					}
20768 
20769 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20770 						disposition |= VM_PAGE_QUERY_PAGE_REF;
20771 					}
20772 
20773 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20774 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20775 					}
20776 
20777 					/*
20778 					 * XXX TODO4K:
20779 					 * when this routine deals with 4k
20780 					 * pages, check the appropriate CS bit
20781 					 * here.
20782 					 */
20783 					if (m->vmp_cs_validated) {
20784 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20785 					}
20786 					if (m->vmp_cs_tainted) {
20787 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20788 					}
20789 					if (m->vmp_cs_nx) {
20790 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20791 					}
20792 					if (m->vmp_reusable || curr_object->all_reusable) {
20793 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20794 					}
20795 				}
20796 			}
20797 
20798 			switch (flavor) {
20799 			case VM_PAGE_INFO_BASIC:
20800 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20801 				basic_info->disposition = disposition;
20802 				basic_info->ref_count = ref_count;
20803 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
20804 				    VM_KERNEL_ADDRHASH(curr_object);
20805 				basic_info->offset =
20806 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20807 				basic_info->depth = depth;
20808 
20809 				info_idx++;
20810 				break;
20811 			}
20812 
20813 			disposition = 0;
20814 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20815 
20816 			/*
20817 			 * Move to next offset in the range and in our object.
20818 			 */
20819 			curr_s_offset += effective_page_size;
20820 			offset_in_object += effective_page_size;
20821 			curr_offset_in_object = offset_in_object;
20822 
20823 			if (curr_object != object) {
20824 				vm_object_unlock(curr_object);
20825 
20826 				curr_object = object;
20827 
20828 				vm_object_lock_shared(curr_object);
20829 			} else {
20830 				vm_object_lock_yield_shared(curr_object);
20831 			}
20832 		}
20833 
20834 		vm_object_unlock(curr_object);
20835 		vm_object_deallocate(curr_object);
20836 
20837 		vm_map_lock_read(map);
20838 	}
20839 
20840 	vm_map_unlock_read(map);
20841 	return retval;
20842 }
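/*
 * Illustrative sketch (editorial addition, not part of xnu): the range
 * variant above fills one vm_page_info_basic record per effective page.
 * A hypothetical caller inspecting four consecutive pages (the "start_u"
 * and "end_u" values are assumptions; the range must stay within
 * MAX_PAGE_RANGE_QUERY, and -1 leaves the effective page shift
 * unspecified):
 *
 *	struct vm_page_info_basic infos[4];
 *	mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
 *	kern_return_t kr;
 *
 *	kr = vm_map_page_range_info_internal(map, start_u, end_u,
 *	    -1, VM_PAGE_INFO_BASIC, (vm_page_info_t)infos, &count);
 */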
20843 
20844 static __attribute__((always_inline, warn_unused_result))
20845 kern_return_t
20846 vm_map_msync_sanitize(
20847 	vm_map_t                map,
20848 	vm_map_address_ut       address_u,
20849 	vm_map_size_ut          size_u,
20850 	vm_object_offset_t     *address,
20851 	vm_map_size_t          *size)
20852 {
20853 	vm_object_offset_t      end;
20854 
20855 	return vm_sanitize_addr_size(address_u, size_u,
20856 	           VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20857 	           map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
20858 	           address, &end, size);
20859 }
20860 
20861 /*
20862  *	vm_map_msync
20863  *
20864  *	Synchronises the specified memory range with its backing store
20865  *	image by either flushing or cleaning the contents to the appropriate
20866  *	memory manager, engaging in a memory object synchronize dialog with
20867  *	the manager.  The client doesn't return until the manager issues
20868  *	the m_o_s_completed message.  MIG magically converts the user task
20869  *	parameter to the task's address map.
20870  *
20871  *	interpretation of sync_flags
20872  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20873  *				  pages to manager.
20874  *
20875  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20876  *				- discard pages, write dirty or precious
20877  *				  pages back to memory manager.
20878  *
20879  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20880  *				- write dirty or precious pages back to
20881  *				  the memory manager.
20882  *
20883  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20884  *				  is a hole in the region, and we would
20885  *				  have returned KERN_SUCCESS, return
20886  *				  KERN_INVALID_ADDRESS instead.
20887  *
20888  *	NOTE
20889  *	The memory object attributes have not yet been implemented, this
20890  *	The memory object attributes have not yet been implemented; this
20891  *
20892  *	RETURNS
20893  *	KERN_INVALID_TASK		Bad task parameter
20894  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20895  *	KERN_SUCCESS			The usual.
20896  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20897  */
20898 
20899 kern_return_t
20900 vm_map_msync(
20901 	vm_map_t                map,
20902 	vm_map_address_ut       address_u,
20903 	vm_map_size_ut          size_u,
20904 	vm_sync_t               sync_flags)
20905 {
20906 	vm_map_entry_t          entry;
20907 	vm_map_size_t           size, amount_left;
20908 	vm_object_offset_t      address, offset;
20909 	vm_object_offset_t      start_offset, end_offset;
20910 	boolean_t               do_sync_req;
20911 	boolean_t               had_hole = FALSE;
20912 	vm_map_offset_t         pmap_offset;
20913 	kern_return_t           kr;
20914 
20915 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20916 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20917 		return KERN_INVALID_ARGUMENT;
20918 	}
20919 
20920 	if (map == VM_MAP_NULL) {
20921 		return KERN_INVALID_TASK;
20922 	}
20923 
20924 	kr = vm_map_msync_sanitize(map,
20925 	    address_u,
20926 	    size_u,
20927 	    &address,
20928 	    &size);
20929 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20930 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20931 	}
20932 	if (__improbable(kr != KERN_SUCCESS)) {
20933 		return vm_sanitize_get_kr(kr);
20934 	}
20935 
20936 	amount_left = size;
20937 
20938 	while (amount_left > 0) {
20939 		vm_object_size_t        flush_size;
20940 		vm_object_t             object;
20941 
20942 		vm_map_lock(map);
20943 		if (!vm_map_lookup_entry(map,
20944 		    address,
20945 		    &entry)) {
20946 			vm_map_size_t   skip;
20947 
20948 			/*
20949 			 * hole in the address map.
20950 			 */
20951 			had_hole = TRUE;
20952 
20953 			if (sync_flags & VM_SYNC_KILLPAGES) {
20954 				/*
20955 				 * For VM_SYNC_KILLPAGES, there should be
20956 				 * no holes in the range, since we couldn't
20957 				 * prevent someone else from allocating in
20958 				 * that hole and we wouldn't want to "kill"
20959 				 * their pages.
20960 				 */
20961 				vm_map_unlock(map);
20962 				break;
20963 			}
20964 
20965 			/*
20966 			 * Check for empty map.
20967 			 */
20968 			if (entry == vm_map_to_entry(map) &&
20969 			    entry->vme_next == entry) {
20970 				vm_map_unlock(map);
20971 				break;
20972 			}
20973 			/*
20974 			 * Check that we don't wrap and that
20975 			 * we have at least one real map entry.
20976 			 */
20977 			if ((map->hdr.nentries == 0) ||
20978 			    (entry->vme_next->vme_start < address)) {
20979 				vm_map_unlock(map);
20980 				break;
20981 			}
20982 			/*
20983 			 * Move up to the next entry if needed
20984 			 */
20985 			skip = (entry->vme_next->vme_start - address);
20986 			if (skip >= amount_left) {
20987 				amount_left = 0;
20988 			} else {
20989 				amount_left -= skip;
20990 			}
20991 			address = entry->vme_next->vme_start;
20992 			vm_map_unlock(map);
20993 			continue;
20994 		}
20995 
20996 		offset = address - entry->vme_start;
20997 		pmap_offset = address;
20998 
20999 		/*
21000 		 * do we have more to flush than is contained in this
21001 		 * entry ?
21002 		 */
21003 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
21004 			flush_size = entry->vme_end -
21005 			    (entry->vme_start + offset);
21006 		} else {
21007 			flush_size = amount_left;
21008 		}
21009 		amount_left -= flush_size;
21010 		address += flush_size;
21011 
21012 		if (entry->is_sub_map == TRUE) {
21013 			vm_map_t        local_map;
21014 			vm_map_offset_t local_offset;
21015 
21016 			local_map = VME_SUBMAP(entry);
21017 			local_offset = VME_OFFSET(entry);
21018 			vm_map_reference(local_map);
21019 			vm_map_unlock(map);
21020 			if (vm_map_msync(
21021 				    local_map,
21022 				    local_offset,
21023 				    flush_size,
21024 				    sync_flags) == KERN_INVALID_ADDRESS) {
21025 				had_hole = TRUE;
21026 			}
21027 			vm_map_deallocate(local_map);
21028 			local_map = VM_MAP_NULL;
21029 			continue;
21030 		}
21031 		object = VME_OBJECT(entry);
21032 
21033 		/*
21034 		 * We can't sync this object if the object has not been
21035 		 * created yet
21036 		 */
21037 		if (object == VM_OBJECT_NULL) {
21038 			vm_map_unlock(map);
21039 			continue;
21040 		}
21041 		offset += VME_OFFSET(entry);
21042 
21043 		vm_object_lock(object);
21044 
21045 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
21046 			int kill_pages = 0;
21047 
21048 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21049 				/*
21050 				 * This is a destructive operation and so we
21051 				 * err on the side of limiting the range of
21052 				 * the operation.
21053 				 */
21054 				start_offset = vm_object_round_page(offset);
21055 				end_offset = vm_object_trunc_page(offset + flush_size);
21056 
21057 				if (end_offset <= start_offset) {
21058 					vm_object_unlock(object);
21059 					vm_map_unlock(map);
21060 					continue;
21061 				}
21062 
21063 				pmap_offset += start_offset - offset;
21064 			} else {
21065 				start_offset = offset;
21066 				end_offset = offset + flush_size;
21067 			}
21068 
21069 			if (sync_flags & VM_SYNC_KILLPAGES) {
21070 				if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
21071 				    ((object->copy_strategy !=
21072 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
21073 				    (object->vo_copy == VM_OBJECT_NULL))) &&
21074 				    (object->shadow == VM_OBJECT_NULL)) {
21075 					if (os_ref_get_count_raw(&object->ref_count) != 1) {
21076 						vm_page_stats_reusable.free_shared++;
21077 					}
21078 					kill_pages = 1;
21079 				} else {
21080 					kill_pages = -1;
21081 				}
21082 			}
21083 			if (kill_pages != -1) {
21084 				boolean_t kill_no_write = FALSE;
21085 
21086 				if ((entry->protection & VM_PROT_EXECUTE) ||
21087 				    entry->vme_xnu_user_debug) {
21088 					/*
21089 					 * Executable or user debug pages might be write-protected by
21090 					 * hardware, so do not attempt to write to these pages.
21091 					 */
21092 					kill_no_write = TRUE;
21093 				}
21094 				vm_object_deactivate_pages(
21095 					object,
21096 					start_offset,
21097 					(vm_object_size_t) (end_offset - start_offset),
21098 					kill_pages,
21099 					FALSE, /* reusable_pages */
21100 					kill_no_write,
21101 					map->pmap,
21102 					pmap_offset);
21103 			}
21104 			vm_object_unlock(object);
21105 			vm_map_unlock(map);
21106 			continue;
21107 		}
21108 		/*
21109 		 * We can't sync this object if there isn't a pager.
21110 		 * Don't bother to sync internal objects, since there can't
21111 		 * be any "permanent" storage for these objects anyway.
21112 		 */
21113 		if ((object->pager == MEMORY_OBJECT_NULL) ||
21114 		    (object->internal) || (object->private)) {
21115 			vm_object_unlock(object);
21116 			vm_map_unlock(map);
21117 			continue;
21118 		}
21119 		/*
21120 		 * keep reference on the object until syncing is done
21121 		 */
21122 		vm_object_reference_locked(object);
21123 		vm_object_unlock(object);
21124 
21125 		vm_map_unlock(map);
21126 
21127 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
21128 			start_offset = vm_object_trunc_page(offset);
21129 			end_offset = vm_object_round_page(offset + flush_size);
21130 		} else {
21131 			start_offset = offset;
21132 			end_offset = offset + flush_size;
21133 		}
21134 
21135 		do_sync_req = vm_object_sync(object,
21136 		    start_offset,
21137 		    (end_offset - start_offset),
21138 		    sync_flags & VM_SYNC_INVALIDATE,
21139 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
21140 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
21141 		    sync_flags & VM_SYNC_SYNCHRONOUS);
21142 
21143 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
21144 			/*
21145 			 * clear out the clustering and read-ahead hints
21146 			 */
21147 			vm_object_lock(object);
21148 
21149 			object->pages_created = 0;
21150 			object->pages_used = 0;
21151 			object->sequential = 0;
21152 			object->last_alloc = 0;
21153 
21154 			vm_object_unlock(object);
21155 		}
21156 		vm_object_deallocate(object);
21157 	} /* while */
21158 
21159 	/* for proper msync() behaviour */
21160 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
21161 		return KERN_INVALID_ADDRESS;
21162 	}
21163 
21164 	return KERN_SUCCESS;
21165 }/* vm_msync */
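/*
 * Illustrative sketch (editorial addition, not part of xnu): a synchronous
 * flush of a dirty range back to its pager, per the flag interpretation
 * documented above.  The "addr_u" and "size_u" values are assumptions for
 * the example.
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_msync(map, addr_u, size_u,
 *	    VM_SYNC_SYNCHRONOUS | VM_SYNC_CONTIGUOUS);
 *	if (kr == KERN_INVALID_ADDRESS) {
 *		... the range contained a hole ...
 *	}
 */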
21166 
21167 void
21168 vm_named_entry_associate_vm_object(
21169 	vm_named_entry_t        named_entry,
21170 	vm_object_t             object,
21171 	vm_object_offset_t      offset,
21172 	vm_object_size_t        size,
21173 	vm_prot_t               prot)
21174 {
21175 	vm_map_copy_t copy;
21176 	vm_map_entry_t copy_entry;
21177 
21178 	assert(!named_entry->is_sub_map);
21179 	assert(!named_entry->is_copy);
21180 	assert(!named_entry->is_object);
21181 	assert(!named_entry->internal);
21182 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
21183 
21184 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
21185 	copy->offset = offset;
21186 	copy->size = size;
21187 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
21188 
21189 	copy_entry = vm_map_copy_entry_create(copy);
21190 	copy_entry->protection = prot;
21191 	copy_entry->max_protection = prot;
21192 	copy_entry->use_pmap = TRUE;
21193 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
21194 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
21195 	VME_OBJECT_SET(copy_entry, object, false, 0);
21196 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
21197 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
21198 
21199 	named_entry->backing.copy = copy;
21200 	named_entry->is_object = TRUE;
21201 	if (object->internal) {
21202 		named_entry->internal = TRUE;
21203 	}
21204 
21205 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21206 	    named_entry, copy, object, offset, size, prot);
21207 }
21208 
21209 vm_object_t
21210 vm_named_entry_to_vm_object(
21211 	vm_named_entry_t named_entry)
21212 {
21213 	vm_map_copy_t   copy;
21214 	vm_map_entry_t  copy_entry;
21215 	vm_object_t     object;
21216 
21217 	assert(!named_entry->is_sub_map);
21218 	assert(!named_entry->is_copy);
21219 	assert(named_entry->is_object);
21220 	copy = named_entry->backing.copy;
21221 	assert(copy != VM_MAP_COPY_NULL);
21222 	/*
21223 	 * Assert that the vm_map_copy is coming from the right
21224 	 * zone and hasn't been forged
21225 	 */
21226 	vm_map_copy_require(copy);
21227 	assert(copy->cpy_hdr.nentries == 1);
21228 	copy_entry = vm_map_copy_first_entry(copy);
21229 	object = VME_OBJECT(copy_entry);
21230 
21231 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21232 
21233 	return object;
21234 }
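
/*
 * A minimal usage sketch of the two helpers above, assuming a caller that
 * already holds a reference on "object".  The wrapper function and its
 * protection value are hypothetical; only the two helpers are real.
 */
#if 0 /* illustrative only, not compiled */
static vm_object_t
example_named_entry_roundtrip(
	vm_named_entry_t        named_entry,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size)
{
	/* bind the single backing VM object to the named entry */
	vm_named_entry_associate_vm_object(named_entry, object, offset, size,
	    VM_PROT_READ | VM_PROT_WRITE);

	/* later, recover that same object from the named entry */
	return vm_named_entry_to_vm_object(named_entry);
}
#endif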
21235 
21236 /*
21237  *	Routine:	convert_port_entry_to_map
21238  *	Purpose:
21239  *		Convert from a port specifying an entry or a task
21240  *		to a map. Doesn't consume the port ref; produces a map ref,
21241  *		which may be null.  Unlike convert_port_to_map, the
21242  *		port may be backed by either a task or a named entry.
21243  *	Conditions:
21244  *		Nothing locked.
21245  */
21246 
21247 vm_map_t
21248 convert_port_entry_to_map(
21249 	ipc_port_t      port)
21250 {
21251 	vm_map_t map = VM_MAP_NULL;
21252 	vm_named_entry_t named_entry;
21253 
21254 	if (!IP_VALID(port)) {
21255 		return VM_MAP_NULL;
21256 	}
21257 
21258 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21259 		return convert_port_to_map(port);
21260 	}
21261 
21262 	named_entry = mach_memory_entry_from_port(port);
21263 
21264 	if ((named_entry->is_sub_map) &&
21265 	    (named_entry->protection & VM_PROT_WRITE)) {
21266 		map = named_entry->backing.map;
21267 		if (map->pmap != PMAP_NULL) {
21268 			if (map->pmap == kernel_pmap) {
21269 				panic("userspace has access "
21270 				    "to a kernel map %p", map);
21271 			}
21272 			pmap_require(map->pmap);
21273 		}
21274 		vm_map_reference(map);
21275 	}
21276 
21277 	return map;
21278 }
21279 
21280 /*
21281  * Export routines to other components for the things we access locally through
21282  * macros.
21283  */
21284 #undef current_map
21285 vm_map_t
21286 current_map(void)
21287 {
21288 	return current_map_fast();
21289 }
21290 
21291 /*
21292  *	vm_map_reference:
21293  *
21294  *	Takes a reference on the specified map.
21295  */
21296 void
21297 vm_map_reference(
21298 	vm_map_t        map)
21299 {
21300 	if (__probable(map != VM_MAP_NULL)) {
21301 		vm_map_require(map);
21302 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21303 	}
21304 }
21305 
21306 /*
21307  *	vm_map_deallocate:
21308  *
21309  *	Removes a reference from the specified map,
21310  *	destroying it if no references remain.
21311  *	The map should not be locked.
21312  */
21313 void
21314 vm_map_deallocate(
21315 	vm_map_t        map)
21316 {
21317 	if (__probable(map != VM_MAP_NULL)) {
21318 		vm_map_require(map);
21319 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21320 			vm_map_destroy(map);
21321 		}
21322 	}
21323 }
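
/*
 * Usage sketch: vm_map_reference() and vm_map_deallocate() form a
 * retain/release pair.  The wrapper below is hypothetical; it simply keeps
 * the map alive across a window where the last other reference might drop.
 */
#if 0 /* illustrative only, not compiled */
static void
example_map_retain_release(vm_map_t map)
{
	vm_map_reference(map);          /* +1 ref; tolerates VM_MAP_NULL */
	/* ... safely inspect or operate on "map" here ... */
	vm_map_deallocate(map);         /* -1 ref; destroys the map at zero */
}
#endif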
21324 
21325 void
21326 vm_map_inspect_deallocate(
21327 	vm_map_inspect_t      map)
21328 {
21329 	vm_map_deallocate((vm_map_t)map);
21330 }
21331 
21332 void
21333 vm_map_read_deallocate(
21334 	vm_map_read_t      map)
21335 {
21336 	vm_map_deallocate((vm_map_t)map);
21337 }
21338 
21339 
21340 void
21341 vm_map_disable_NX(vm_map_t map)
21342 {
21343 	if (map == NULL) {
21344 		return;
21345 	}
21346 	if (map->pmap == NULL) {
21347 		return;
21348 	}
21349 
21350 	pmap_disable_NX(map->pmap);
21351 }
21352 
21353 void
21354 vm_map_disallow_data_exec(vm_map_t map)
21355 {
21356 	if (map == NULL) {
21357 		return;
21358 	}
21359 
21360 	map->map_disallow_data_exec = TRUE;
21361 }
21362 
21363 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21364  * more descriptive.
21365  */
21366 void
21367 vm_map_set_32bit(vm_map_t map)
21368 {
21369 #if defined(__arm64__)
21370 	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21371 #else
21372 	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21373 #endif
21374 }
21375 
21376 
21377 void
21378 vm_map_set_64bit(vm_map_t map)
21379 {
21380 #if defined(__arm64__)
21381 	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21382 #else
21383 	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21384 #endif
21385 }
21386 
21387 /*
21388  * Expand the maximum size of an existing map to 64GB.
21389  */
21390 void
21391 vm_map_set_jumbo(vm_map_t map)
21392 {
21393 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21394 	vm_map_set_max_addr(map, ~0, false);
21395 #else /* arm64 */
21396 	(void) map;
21397 #endif
21398 }
21399 
21400 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21401 /*
21402  * Expand the maximum size of an existing map to the maximum supported.
21403  */
21404 void
21405 vm_map_set_extra_jumbo(vm_map_t map)
21406 {
21407 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21408 	vm_map_set_max_addr(map, ~0, true);
21409 #else /* arm64 */
21410 	(void) map;
21411 #endif
21412 }
21413 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21414 
21415 /*
21416  * This map has a JIT entitlement
21417  */
21418 void
21419 vm_map_set_jit_entitled(vm_map_t map)
21420 {
21421 #if defined (__arm64__)
21422 	pmap_set_jit_entitled(map->pmap);
21423 #else /* arm64 */
21424 	(void) map;
21425 #endif
21426 }
21427 
21428 /*
21429  * Get status of this map's TPRO flag
21430  */
21431 boolean_t
21432 vm_map_tpro(vm_map_t map)
21433 {
21434 #if defined (__arm64e__)
21435 	return pmap_get_tpro(map->pmap);
21436 #else /* arm64e */
21437 	(void) map;
21438 	return FALSE;
21439 #endif
21440 }
21441 
21442 /*
21443  * This map has TPRO enabled
21444  */
21445 void
21446 vm_map_set_tpro(vm_map_t map)
21447 {
21448 #if defined (__arm64e__)
21449 	pmap_set_tpro(map->pmap);
21450 #else /* arm64e */
21451 	(void) map;
21452 #endif
21453 }
21454 
21455 
21456 
21457 /*
21458  * Does this map have TPRO enforcement enabled
21459  */
21460 boolean_t
21461 vm_map_tpro_enforcement(vm_map_t map)
21462 {
21463 	return map->tpro_enforcement;
21464 }
21465 
21466 /*
21467  * Set TPRO enforcement for this map
21468  */
21469 void
21470 vm_map_set_tpro_enforcement(vm_map_t map)
21471 {
21472 	if (vm_map_tpro(map)) {
21473 		vm_map_lock(map);
21474 		map->tpro_enforcement = TRUE;
21475 		vm_map_unlock(map);
21476 	}
21477 }
21478 
21479 /*
21480  * Enable TPRO on the requested region
21481  *
21482  * Note:
21483  *     This routine is primarily intended to be called during/soon after map
21484  *     creation before the associated task has been released to run. It is only
21485  *     currently safe when we have no resident pages.
21486  */
21487 boolean_t
21488 vm_map_set_tpro_range(
21489 	__unused vm_map_t map,
21490 	__unused vm_map_address_t start,
21491 	__unused vm_map_address_t end)
21492 {
21493 	return TRUE;
21494 }
21495 
21496 /*
21497  * Expand the maximum size of an existing map.
21498  */
21499 void
21500 vm_map_set_max_addr(
21501 	vm_map_t map,
21502 	vm_map_offset_t new_max_offset,
21503 	__unused bool extra_jumbo)
21504 {
21505 #if defined(__arm64__)
21506 	vm_map_offset_t max_supported_offset;
21507 	vm_map_offset_t old_max_offset;
21508 	unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21509 
21510 	vm_map_lock(map);
21511 
21512 	old_max_offset = map->max_offset;
21513 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21514 	if (extra_jumbo) {
21515 		option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21516 	}
21517 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21518 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21519 
21520 	new_max_offset = trunc_page(new_max_offset);
21521 
21522 	/* The address space cannot be shrunk using this routine. */
21523 	if (old_max_offset >= new_max_offset) {
21524 		vm_map_unlock(map);
21525 		return;
21526 	}
21527 
21528 	if (max_supported_offset < new_max_offset) {
21529 		new_max_offset = max_supported_offset;
21530 	}
21531 
21532 	map->max_offset = new_max_offset;
21533 
21534 	/*
21535 	 * Disable the following chunk of code that extends the "holes" list
21536 	 * to accommodate a larger VM map.
21537 	 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21538 	 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21539 	 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21540 	 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21541 	 * The "holes" list does not need to be adjusted.
21542 	 */
21543 #if 0
21544 	if (map->holelistenabled) {
21545 		if (map->holes_list->prev->vme_end == old_max_offset) {
21546 			/*
21547 			 * There is already a hole at the end of the map; simply make it bigger.
21548 			 */
21549 			map->holes_list->prev->vme_end = map->max_offset;
21550 		} else {
21551 			/*
21552 			 * There is no hole at the end, so we need to create a new hole
21553 			 * for the new empty space we're creating.
21554 			 */
21555 			struct vm_map_links *new_hole;
21556 
21557 			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21558 			new_hole->start = old_max_offset;
21559 			new_hole->end = map->max_offset;
21560 			new_hole->prev = map->holes_list->prev;
21561 			new_hole->next = (struct vm_map_entry *)map->holes_list;
21562 			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21563 			map->holes_list->prev = (struct vm_map_entry *)new_hole;
21564 		}
21565 	}
21566 #endif
21567 
21568 	vm_map_unlock(map);
21569 #else
21570 	(void)map;
21571 	(void)new_max_offset;
21572 #endif
21573 }
21574 
21575 vm_map_offset_t
21576 vm_compute_max_offset(boolean_t is64)
21577 {
21578 #if defined(__arm64__)
21579 	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21580 #else
21581 	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21582 #endif
21583 }
21584 
21585 void
21586 vm_map_get_max_aslr_slide_section(
21587 	vm_map_t                map __unused,
21588 	int64_t                 *max_sections,
21589 	int64_t                 *section_size)
21590 {
21591 #if defined(__arm64__)
21592 	*max_sections = 3;
21593 	*section_size = ARM_TT_TWIG_SIZE;
21594 #else
21595 	*max_sections = 1;
21596 	*section_size = 0;
21597 #endif
21598 }
21599 
21600 uint64_t
21601 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21602 {
21603 #if defined(__arm64__)
21604 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21605 	 * limited embedded address space; this is also meant to minimize pmap
21606 	 * memory usage on 16KB page systems.
21607 	 */
21608 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21609 #else
21610 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21611 #endif
21612 }
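
/*
 * Worked example for the arm64 limit above (page sizes assumed for
 * illustration): with 16KB pages, VM_MAP_PAGE_SHIFT(map) == 14, so the slide
 * is capped at 1 << (24 - 14) == 1024 pages, i.e. 1024 * 16KB == 16MB.
 * With 4KB pages (shift 12), 1 << 12 == 4096 pages == 16MB as well.
 */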
21613 
21614 uint64_t
21615 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21616 {
21617 #if defined(__arm64__)
21618 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21619 	 * of independent entropy on 16KB page systems.
21620 	 */
21621 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21622 #else
21623 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21624 #endif
21625 }
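
/*
 * Worked example for the loader limit above (illustrative): with 16KB pages
 * (shift 14), 1 << (22 - 14) == 256 possible page offsets, i.e. the "at least
 * 8 bits of independent entropy" mentioned above, covering 256 * 16KB == 4MB.
 */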
21626 
21627 boolean_t
21628 vm_map_is_64bit(
21629 	vm_map_t map)
21630 {
21631 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21632 }
21633 
21634 boolean_t
21635 vm_map_has_hard_pagezero(
21636 	vm_map_t        map,
21637 	vm_map_offset_t pagezero_size)
21638 {
21639 	/*
21640 	 * XXX FBDP
21641 	 * We should lock the VM map (for read) here but we can get away
21642 	 * with it for now because there can't really be any race condition:
21643 	 * the VM map's min_offset is changed only when the VM map is created
21644 	 * and when the zero page is established (when the binary gets loaded),
21645 	 * and this routine gets called only when the task terminates and the
21646 	 * VM map is being torn down, and when a new map is created via
21647 	 * load_machfile()/execve().
21648 	 */
21649 	return map->min_offset >= pagezero_size;
21650 }
21651 
21652 /*
21653  * Raise a VM map's maximum offset.
21654  */
21655 kern_return_t
21656 vm_map_raise_max_offset(
21657 	vm_map_t        map,
21658 	vm_map_offset_t new_max_offset)
21659 {
21660 	kern_return_t   ret;
21661 
21662 	vm_map_lock(map);
21663 	ret = KERN_INVALID_ADDRESS;
21664 
21665 	if (new_max_offset >= map->max_offset) {
21666 		if (!vm_map_is_64bit(map)) {
21667 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21668 				map->max_offset = new_max_offset;
21669 				ret = KERN_SUCCESS;
21670 			}
21671 		} else {
21672 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21673 				map->max_offset = new_max_offset;
21674 				ret = KERN_SUCCESS;
21675 			}
21676 		}
21677 	}
21678 
21679 	vm_map_unlock(map);
21680 	return ret;
21681 }
21682 
21683 
21684 /*
21685  * Raise a VM map's minimum offset.
21686  * To strictly enforce "page zero" reservation.
21687  */
21688 kern_return_t
21689 vm_map_raise_min_offset(
21690 	vm_map_t        map,
21691 	vm_map_offset_t new_min_offset)
21692 {
21693 	vm_map_entry_t  first_entry;
21694 
21695 	new_min_offset = vm_map_round_page(new_min_offset,
21696 	    VM_MAP_PAGE_MASK(map));
21697 
21698 	vm_map_lock(map);
21699 
21700 	if (new_min_offset < map->min_offset) {
21701 		/*
21702 		 * Can't move min_offset backwards, as that would expose
21703 		 * a part of the address space that was previously, and for
21704 		 * possibly good reasons, inaccessible.
21705 		 */
21706 		vm_map_unlock(map);
21707 		return KERN_INVALID_ADDRESS;
21708 	}
21709 	if (new_min_offset >= map->max_offset) {
21710 		/* can't go beyond the end of the address space */
21711 		vm_map_unlock(map);
21712 		return KERN_INVALID_ADDRESS;
21713 	}
21714 
21715 	first_entry = vm_map_first_entry(map);
21716 	if (first_entry != vm_map_to_entry(map) &&
21717 	    first_entry->vme_start < new_min_offset) {
21718 		/*
21719 		 * Some memory was already allocated below the new
21720 		 * minimum offset.  It's too late to change it now...
21721 		 */
21722 		vm_map_unlock(map);
21723 		return KERN_NO_SPACE;
21724 	}
21725 
21726 	map->min_offset = new_min_offset;
21727 
21728 	if (map->holelistenabled) {
21729 		assert(map->holes_list);
21730 		map->holes_list->start = new_min_offset;
21731 		assert(new_min_offset < map->holes_list->end);
21732 	}
21733 
21734 	vm_map_unlock(map);
21735 
21736 	return KERN_SUCCESS;
21737 }
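
/*
 * Sketch of how the two page-zero routines above fit together; the wrapper
 * and the 4GB reservation size are hypothetical.
 */
#if 0 /* illustrative only, not compiled */
static void
example_reserve_pagezero(vm_map_t map)
{
	vm_map_offset_t pagezero_size = (vm_map_offset_t)(4ULL << 30); /* 4GB */

	if (vm_map_raise_min_offset(map, pagezero_size) == KERN_SUCCESS) {
		/* the reservation is now reflected in the map's min_offset */
		assert(vm_map_has_hard_pagezero(map, pagezero_size));
	}
}
#endif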
21738 
21739 /*
21740  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21741  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21742  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21743  * have to reach over to the BSD data structures.
21744  */
21745 
21746 uint64_t vm_map_set_size_limit_count = 0;
21747 kern_return_t
21748 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21749 {
21750 	kern_return_t kr;
21751 
21752 	vm_map_lock(map);
21753 	if (new_size_limit < map->size) {
21754 		/* new limit should not be lower than its current size */
21755 		DTRACE_VM2(vm_map_set_size_limit_fail,
21756 		    vm_map_size_t, map->size,
21757 		    uint64_t, new_size_limit);
21758 		kr = KERN_FAILURE;
21759 	} else if (new_size_limit == map->size_limit) {
21760 		/* no change */
21761 		kr = KERN_SUCCESS;
21762 	} else {
21763 		/* set new limit */
21764 		DTRACE_VM2(vm_map_set_size_limit,
21765 		    vm_map_size_t, map->size,
21766 		    uint64_t, new_size_limit);
21767 		if (new_size_limit != RLIM_INFINITY) {
21768 			vm_map_set_size_limit_count++;
21769 		}
21770 		map->size_limit = new_size_limit;
21771 		kr = KERN_SUCCESS;
21772 	}
21773 	vm_map_unlock(map);
21774 	return kr;
21775 }
21776 
21777 uint64_t vm_map_set_data_limit_count = 0;
21778 kern_return_t
21779 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21780 {
21781 	kern_return_t kr;
21782 
21783 	vm_map_lock(map);
21784 	if (new_data_limit < map->size) {
21785 		/* new limit should not be lower than its current size */
21786 		DTRACE_VM2(vm_map_set_data_limit_fail,
21787 		    vm_map_size_t, map->size,
21788 		    uint64_t, new_data_limit);
21789 		kr = KERN_FAILURE;
21790 	} else if (new_data_limit == map->data_limit) {
21791 		/* no change */
21792 		kr = KERN_SUCCESS;
21793 	} else {
21794 		/* set new limit */
21795 		DTRACE_VM2(vm_map_set_data_limit,
21796 		    vm_map_size_t, map->size,
21797 		    uint64_t, new_data_limit);
21798 		if (new_data_limit != RLIM_INFINITY) {
21799 			vm_map_set_data_limit_count++;
21800 		}
21801 		map->data_limit = new_data_limit;
21802 		kr = KERN_SUCCESS;
21803 	}
21804 	vm_map_unlock(map);
21805 	return kr;
21806 }
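
/*
 * Sketch of how the BSD side might mirror its rlimit values into the map
 * using the two setters above; the wrapper and its rlimit arguments are
 * hypothetical.
 */
#if 0 /* illustrative only, not compiled */
static void
example_mirror_rlimits(vm_map_t map, uint64_t rlim_as_cur, uint64_t rlim_data_cur)
{
	/* RLIMIT_AS: cap on the total address space size */
	(void)vm_map_set_size_limit(map, rlim_as_cur);
	/* RLIMIT_DATA: cap on the data portion */
	(void)vm_map_set_data_limit(map, rlim_data_cur);
}
#endif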
21807 
21808 void
21809 vm_map_set_user_wire_limit(vm_map_t     map,
21810     vm_size_t    limit)
21811 {
21812 	vm_map_lock(map);
21813 	map->user_wire_limit = limit;
21814 	vm_map_unlock(map);
21815 }
21816 
21817 
21818 void
21819 vm_map_switch_protect(vm_map_t     map,
21820     boolean_t    val)
21821 {
21822 	vm_map_lock(map);
21823 	map->switch_protect = val;
21824 	vm_map_unlock(map);
21825 }
21826 
21827 extern int cs_process_enforcement_enable;
21828 boolean_t
21829 vm_map_cs_enforcement(
21830 	vm_map_t map)
21831 {
21832 	if (cs_process_enforcement_enable) {
21833 		return TRUE;
21834 	}
21835 	return map->cs_enforcement;
21836 }
21837 
21838 kern_return_t
21839 vm_map_cs_wx_enable(
21840 	__unused vm_map_t map)
21841 {
21842 #if CODE_SIGNING_MONITOR
21843 	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21844 	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21845 		return KERN_SUCCESS;
21846 	}
21847 	return ret;
21848 #else
21849 	/* The VM manages WX memory entirely on its own */
21850 	return KERN_SUCCESS;
21851 #endif
21852 }
21853 
21854 kern_return_t
21855 vm_map_csm_allow_jit(
21856 	__unused vm_map_t map)
21857 {
21858 #if CODE_SIGNING_MONITOR
21859 	return csm_allow_jit_region(vm_map_pmap(map));
21860 #else
21861 	/* No code signing monitor to enforce JIT policy */
21862 	return KERN_SUCCESS;
21863 #endif
21864 }
21865 
21866 void
21867 vm_map_cs_debugged_set(
21868 	vm_map_t map,
21869 	boolean_t val)
21870 {
21871 	vm_map_lock(map);
21872 	map->cs_debugged = val;
21873 	vm_map_unlock(map);
21874 }
21875 
21876 void
21877 vm_map_cs_enforcement_set(
21878 	vm_map_t map,
21879 	boolean_t val)
21880 {
21881 	vm_map_lock(map);
21882 	map->cs_enforcement = val;
21883 	pmap_set_vm_map_cs_enforced(map->pmap, val);
21884 	vm_map_unlock(map);
21885 }
21886 
21887 /*
21888  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21889  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21890  * bump both counters.
21891  */
21892 void
21893 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21894 {
21895 	pmap_t pmap = vm_map_pmap(map);
21896 
21897 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21898 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21899 }
21900 
21901 void
21902 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21903 {
21904 	pmap_t pmap = vm_map_pmap(map);
21905 
21906 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21907 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21908 }
21909 
21910 /* Add (generate) code signature for memory range */
21911 #if CONFIG_DYNAMIC_CODE_SIGNING
21912 kern_return_t
21913 vm_map_sign(vm_map_t map,
21914     vm_map_offset_t start,
21915     vm_map_offset_t end)
21916 {
21917 	vm_map_entry_t entry;
21918 	vm_map_offset_t entry_start;
21919 	vm_object_offset_t entry_offset;
21920 	vm_page_t m;
21921 	vm_object_t object;
21922 
21923 	/*
21924 	 * Vet all the input parameters and current type and state of the
21925 	 * underlying object.  Return with an error if anything is amiss.
21926 	 */
21927 	if (map == VM_MAP_NULL) {
21928 		return KERN_INVALID_ARGUMENT;
21929 	}
21930 
21931 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21932 		return KERN_INVALID_ADDRESS;
21933 	}
21934 
21935 	vm_map_lock_read(map);
21936 
21937 	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21938 		/*
21939 		 * Must pass a valid non-submap address.
21940 		 */
21941 		vm_map_unlock_read(map);
21942 		return KERN_INVALID_ADDRESS;
21943 	}
21944 
21945 	if ((entry->vme_start > start) || (entry->vme_end < end)) {
21946 		/*
21947 		 * Map entry doesn't cover the requested range. Not handling
21948 		 * this situation currently.
21949 		 */
21950 		vm_map_unlock_read(map);
21951 		return KERN_INVALID_ARGUMENT;
21952 	}
21953 
21954 	object = VME_OBJECT(entry);
21955 	if (object == VM_OBJECT_NULL) {
21956 		/*
21957 		 * Object must already be present or we can't sign.
21958 		 */
21959 		vm_map_unlock_read(map);
21960 		return KERN_INVALID_ARGUMENT;
21961 	}
21962 
21963 	vm_object_lock(object);
21964 
21965 	entry_start = entry->vme_start;
21966 	entry_offset = VME_OFFSET(entry);
21967 	vm_map_unlock_read(map);
21968 	entry = VM_MAP_ENTRY_NULL; /* no longer valid after unlocking map */
21969 
21970 	while (start < end) {
21971 		uint32_t refmod;
21972 
21973 		m = vm_page_lookup(object,
21974 		    start - entry_start + entry_offset);
21975 		if (m == VM_PAGE_NULL) {
21976 			/* should we try to fault a page here? we can probably
21977 			 * demand it exists and is locked for this request */
21978 			vm_object_unlock(object);
21979 			return KERN_FAILURE;
21980 		}
21981 		/* deal with special page status */
21982 		if (m->vmp_busy ||
21983 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart ||
21984 		    vm_page_is_private(m) || m->vmp_absent))) {
21985 			vm_object_unlock(object);
21986 			return KERN_FAILURE;
21987 		}
21988 
21989 		/* Page is OK... now "validate" it */
21990 		/* This is the place where we'll call out to create a code
21991 		 * directory, later */
21992 		/* XXX TODO4K: deal with 4k subpages individually? */
21993 		m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21994 
21995 		/* The page is now "clean" for codesigning purposes. That means
21996 		 * we don't consider it as modified (wpmapped) anymore. But
21997 		 * we'll disconnect the page so we note any future modification
21998 		 * attempts. */
21999 		m->vmp_wpmapped = FALSE;
22000 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
22001 
22002 		/* Pull the dirty status from the pmap, since we cleared the
22003 		 * wpmapped bit */
22004 		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
22005 			SET_PAGE_DIRTY(m, FALSE);
22006 		}
22007 
22008 		/* On to the next page */
22009 		start += PAGE_SIZE;
22010 	}
22011 	vm_object_unlock(object);
22012 
22013 	return KERN_SUCCESS;
22014 }
22015 #endif
22016 
22017 kern_return_t
22018 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
22019 {
22020 	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
22021 	vm_map_entry_t  next_entry;
22022 	kern_return_t   kr = KERN_SUCCESS;
22023 	VM_MAP_ZAP_DECLARE(zap_list);
22024 
22025 	vm_map_lock(map);
22026 
22027 	for (entry = vm_map_first_entry(map);
22028 	    entry != vm_map_to_entry(map);
22029 	    entry = next_entry) {
22030 		next_entry = entry->vme_next;
22031 
22032 		if (!entry->is_sub_map &&
22033 		    VME_OBJECT(entry) &&
22034 		    (VME_OBJECT(entry)->internal == TRUE) &&
22035 		    (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
22036 			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
22037 			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
22038 
22039 			(void)vm_map_delete(map, entry->vme_start,
22040 			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
22041 			    KMEM_GUARD_NONE, &zap_list);
22042 		}
22043 	}
22044 
22045 	vm_map_unlock(map);
22046 
22047 	vm_map_zap_dispose(&zap_list);
22048 
22049 	return kr;
22050 }
22051 
22052 
22053 #if DEVELOPMENT || DEBUG
22054 
22055 int
22056 vm_map_disconnect_page_mappings(
22057 	vm_map_t map,
22058 	boolean_t do_unnest)
22059 {
22060 	vm_map_entry_t entry;
22061 	ledger_amount_t byte_count = 0;
22062 
22063 	if (do_unnest == TRUE) {
22064 #ifndef NO_NESTED_PMAP
22065 		vm_map_lock(map);
22066 
22067 		for (entry = vm_map_first_entry(map);
22068 		    entry != vm_map_to_entry(map);
22069 		    entry = entry->vme_next) {
22070 			if (entry->is_sub_map && entry->use_pmap) {
22071 				/*
22072 				 * Make sure the range between the start of this entry and
22073 				 * the end of this entry is no longer nested, so that
22074 				 * we will only remove mappings from the pmap in use by
22075 				 * this task.
22076 				 */
22077 				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
22078 			}
22079 		}
22080 		vm_map_unlock(map);
22081 #endif
22082 	}
22083 	vm_map_lock_read(map);
22084 
22085 	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
22086 
22087 	for (entry = vm_map_first_entry(map);
22088 	    entry != vm_map_to_entry(map);
22089 	    entry = entry->vme_next) {
22090 		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
22091 		    (VME_OBJECT(entry)->phys_contiguous))) {
22092 			continue;
22093 		}
22094 		if (entry->is_sub_map) {
22095 			assert(!entry->use_pmap);
22096 		}
22097 
22098 		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
22099 	}
22100 	vm_map_unlock_read(map);
22101 
22102 	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
22103 }
22104 
22105 kern_return_t
22106 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
22107 {
22108 	vm_object_t object = NULL;
22109 	vm_object_offset_t offset;
22110 	vm_prot_t prot;
22111 	boolean_t wired;
22112 	vm_map_version_t version;
22113 	vm_map_t real_map;
22114 	int result = KERN_FAILURE;
22115 
22116 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
22117 	vm_map_lock(map);
22118 
22119 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
22120 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
22121 	    NULL, &real_map, NULL);
22122 	if (object == NULL) {
22123 		result = KERN_MEMORY_ERROR;
22124 	} else if (object->pager) {
22125 		result = vm_compressor_pager_inject_error(object->pager,
22126 		    offset);
22127 	} else {
22128 		result = KERN_MEMORY_PRESENT;
22129 	}
22130 
22131 	if (object != NULL) {
22132 		vm_object_unlock(object);
22133 	}
22134 
22135 	if (real_map != map) {
22136 		vm_map_unlock(real_map);
22137 	}
22138 	vm_map_unlock(map);
22139 
22140 	return result;
22141 }
22142 
22143 /* Iterate over map entries. Calls the first block argument with the number of entries and the second block for every entry.
22144  * returns: KERN_SUCCESS if the iteration completed successfully,
22145  *      the error code if a callback returned an error,
22146  *      KERN_FAILURE if entries were added/removed during the iteration, so that the number of entries
22147  *      iterated differs from the count passed to the first call
22148  */
22149 static kern_return_t
22150 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22151     kern_return_t (^entry_handler)(void* entry))
22152 {
22153 	vm_map_lock_assert_held(map);
22154 	int nentries = map->hdr.nentries;
22155 	kern_return_t error = count_handler(nentries);
22156 	if (error) {
22157 		return error;
22158 	}
22159 
22160 	/* iterate until we loop back to the map, see get_vmmap_entries() */
22161 	vm_map_entry_t entry = vm_map_first_entry(map);
22162 	int count = 0;
22163 	while (entry != vm_map_to_entry(map)) {
22164 		error = entry_handler(entry);
22165 		if (error != KERN_SUCCESS) {
22166 			return error;
22167 		}
22168 		entry = entry->vme_next;
22169 		++count;
22170 		if (count > nentries) {
22171 			/* the iteration and nentries disagree on how many entries there are; shouldn't really happen */
22172 			return KERN_FAILURE;
22173 		}
22174 	}
22175 	if (count < nentries) {
22176 		return KERN_FAILURE;
22177 	}
22178 	return KERN_SUCCESS;
22179 }
22180 
22181 kern_return_t
22182 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
22183     kern_return_t (^entry_handler)(void* entry))
22184 {
22185 	vm_map_lock_read(map);
22186 	kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
22187 	vm_map_unlock_read(map);
22188 	return error;
22189 }
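
/*
 * Usage sketch for the block-based iterator above: count how many entries in
 * a map are wired.  The wrapper function is hypothetical; only
 * vm_map_entries_foreach() and the vm_map_entry fields are real.
 */
#if 0 /* illustrative only, not compiled */
static kern_return_t
example_count_wired_entries(vm_map_t map, int *wired_entries)
{
	__block int wired = 0;

	kern_return_t kr = vm_map_entries_foreach(map,
	    ^(int nentries) {
		(void)nentries;         /* could size a result buffer here */
		return KERN_SUCCESS;
	},
	    ^(void *ventry) {
		vm_map_entry_t entry = (vm_map_entry_t)ventry;
		if (entry->wired_count != 0) {
			wired++;
		}
		return KERN_SUCCESS;
	});

	*wired_entries = wired;
	return kr;
}
#endif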
22190 
22191 /*
22192  * Dump info about the entry into the given buffer.
22193  * Returns KERN_SUCCESS on success, or an error such as KERN_NO_SPACE if there was not enough space in the given buffer.
22194  * The "size" argument is, on input, the number of bytes free in the given buffer and, on output, the number of bytes written.
22195  */
22196 kern_return_t
22197 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
22198 {
22199 	size_t insize = *size;
22200 	kern_return_t kr;
22201 	size_t offset = 0;
22202 
22203 	*size = 0;
22204 	if (sizeof(struct vm_map_entry_info) > insize) {
22205 		return KERN_NO_SPACE;
22206 	}
22207 
22208 	vm_map_entry_t entry = (vm_map_entry_t)pentry;
22209 	struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
22210 	out_entry->vmei_start = entry->vme_start;
22211 	out_entry->vmei_end = entry->vme_end;
22212 	out_entry->vmei_alias = VME_ALIAS(entry);
22213 	out_entry->vmei_offset = VME_OFFSET(entry);
22214 	out_entry->vmei_is_sub_map = entry->is_sub_map;
22215 	out_entry->vmei_protection = entry->protection;
22216 	offset += sizeof(struct vm_map_entry_info);
22217 
22218 	out_entry->vmei_slot_mapping_count = 0;
22219 	out_entry->vmei_is_compressor_pager = false;
22220 	*size = offset;
22221 	if (out_entry->vmei_is_sub_map) {
22222 		return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22223 	}
22224 	/* have a vm_object? */
22225 	vm_object_t object = VME_OBJECT(entry);
22226 	if (object == VM_OBJECT_NULL || !object->internal) {
22227 		return KERN_SUCCESS;
22228 	}
22229 	/* does the object have a pager? nothing to dump without one */
22230 	memory_object_t pager = object->pager;
22231 	if (pager == MEMORY_OBJECT_NULL) {
22232 		return KERN_SUCCESS;
22233 	}
22234 	bool is_compressor = false;
22235 	unsigned int slot_mapping_count = 0;
22236 	size_t pager_info_size = insize - offset;
22237 	kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22238 	if (kr != KERN_SUCCESS) {
22239 		/* didn't have enough space for everything we want to write, caller needs to retry */
22240 		return kr;
22241 	}
22242 	offset += pager_info_size;
22243 	/* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22244 	 * is just for sanity's sake */
22245 	out_entry->vmei_is_compressor_pager = is_compressor;
22246 	out_entry->vmei_slot_mapping_count = slot_mapping_count;
22247 	*size = offset;
22248 	return KERN_SUCCESS;
22249 }
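
/*
 * Usage sketch for the dumper above: grow-and-retry when the buffer is too
 * small.  The wrapper and its kalloc_data()/kfree_data() buffer management
 * are only illustrative.
 */
#if 0 /* illustrative only, not compiled */
static kern_return_t
example_dump_entry(void *pentry)
{
	size_t bufsize = sizeof(struct vm_map_entry_info);
	kern_return_t kr;

	for (;;) {
		char *buf = kalloc_data(bufsize, Z_WAITOK);
		size_t size = bufsize;

		kr = vm_map_dump_entry_and_compressor_pager(pentry, buf, &size);
		/* ... on success, consume the first "size" bytes of "buf" ... */
		kfree_data(buf, bufsize);
		if (kr != KERN_NO_SPACE) {
			return kr;
		}
		bufsize *= 2;   /* not enough room: retry with a larger buffer */
	}
}
#endif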
22250 
22251 
22252 #endif
22253 
22254 
22255 #if CONFIG_FREEZE
22256 
22257 
22258 extern struct freezer_context freezer_context_global;
22259 AbsoluteTime c_freezer_last_yield_ts = 0;
22260 
22261 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22262 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22263 
22264 kern_return_t
22265 vm_map_freeze(
22266 	task_t       task,
22267 	unsigned int *purgeable_count,
22268 	unsigned int *wired_count,
22269 	unsigned int *clean_count,
22270 	unsigned int *dirty_count,
22271 	unsigned int dirty_budget,
22272 	unsigned int *shared_count,
22273 	int          *freezer_error_code,
22274 	boolean_t    eval_only)
22275 {
22276 	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
22277 	kern_return_t   kr = KERN_SUCCESS;
22278 	boolean_t       evaluation_phase = TRUE;
22279 	vm_object_t     cur_shared_object = NULL;
22280 	int             cur_shared_obj_ref_cnt = 0;
22281 	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
22282 
22283 	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
22284 
22285 	/*
22286 	 * We need the exclusive lock here so that we can
22287 	 * block any page faults or lookups while we are
22288 	 * in the middle of freezing this vm map.
22289 	 */
22290 	vm_map_t map = task->map;
22291 
22292 	vm_map_lock(map);
22293 
22294 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
22295 
22296 	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22297 		if (vm_compressor_low_on_space()) {
22298 			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22299 		}
22300 
22301 		if (vm_swap_low_on_space()) {
22302 			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22303 		}
22304 
22305 		kr = KERN_NO_SPACE;
22306 		goto done;
22307 	}
22308 
22309 	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
22310 		/*
22311 		 * In-memory compressor backing the freezer. No disk.
22312 		 * So no need to do the evaluation phase.
22313 		 */
22314 		evaluation_phase = FALSE;
22315 
22316 		if (eval_only == TRUE) {
22317 			/*
22318 			 * We don't support 'eval_only' mode
22319 			 * in this non-swap config.
22320 			 */
22321 			*freezer_error_code = FREEZER_ERROR_GENERIC;
22322 			kr = KERN_INVALID_ARGUMENT;
22323 			goto done;
22324 		}
22325 
22326 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22327 		clock_get_uptime(&c_freezer_last_yield_ts);
22328 	}
22329 again:
22330 
22331 	for (entry2 = vm_map_first_entry(map);
22332 	    entry2 != vm_map_to_entry(map);
22333 	    entry2 = entry2->vme_next) {
22334 		vm_object_t src_object;
22335 
22336 		if (entry2->is_sub_map) {
22337 			continue;
22338 		}
22339 
22340 		src_object = VME_OBJECT(entry2);
22341 		if (!src_object ||
22342 		    src_object->phys_contiguous ||
22343 		    !src_object->internal) {
22344 			continue;
22345 		}
22346 
22347 		/* If eligible, scan the entry, moving eligible pages over to our parent object */
22348 
22349 		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
22350 			/*
22351 			 * We skip purgeable objects during evaluation phase only.
22352 			 * If we decide to freeze this process, we'll explicitly
22353 			 * purge these objects before we go around again with
22354 			 * 'evaluation_phase' set to FALSE.
22355 			 */
22356 
22357 			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
22358 				/*
22359 				 * We want to purge objects that may not belong to this task but are mapped
22360 				 * in this task alone. Since we already purged this task's purgeable memory
22361 				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
22362 				 * on this task's purgeable objects. Hence the check for only volatile objects.
22363 				 */
22364 				if (evaluation_phase ||
22365 				    src_object->purgable != VM_PURGABLE_VOLATILE ||
22366 				    os_ref_get_count_raw(&src_object->ref_count) != 1) {
22367 					continue;
22368 				}
22369 				vm_object_lock(src_object);
22370 				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
22371 				    os_ref_get_count_raw(&src_object->ref_count) == 1) {
22372 					purgeable_q_t old_queue;
22373 
22374 					/* object should be on a purgeable queue */
22375 					assert(src_object->objq.next != NULL &&
22376 					    src_object->objq.prev != NULL);
22377 					/* move object from its volatile queue to the nonvolatile queue */
22378 					old_queue = vm_purgeable_object_remove(src_object);
22379 					assert(old_queue);
22380 					if (src_object->purgeable_when_ripe) {
22381 						/* remove a token from that volatile queue */
22382 						vm_page_lock_queues();
22383 						vm_purgeable_token_delete_first(old_queue);
22384 						vm_page_unlock_queues();
22385 					}
22386 					/* purge the object */
22387 					vm_object_purge(src_object, 0);
22388 				}
22389 				vm_object_unlock(src_object);
22390 				continue;
22391 			}
22392 
22393 			/*
22394 			 * Pages belonging to this object could be swapped to disk.
22395 			 * Make sure it's not a shared object because we could end
22396 			 * up just bringing it back in again.
22397 			 *
22398 			 * We try to optimize somewhat by checking for objects that are mapped
22399 			 * more than once within our own map. But we don't do full searches,
22400 			 * we just look at the entries following our current entry.
22401 			 */
22402 
22403 			if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
22404 				if (src_object != cur_shared_object) {
22405 					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22406 					dirty_shared_count += obj_pages_snapshot;
22407 
22408 					cur_shared_object = src_object;
22409 					cur_shared_obj_ref_cnt = 1;
22410 					continue;
22411 				} else {
22412 					cur_shared_obj_ref_cnt++;
22413 					if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
22414 						/*
22415 						 * Fall through to below and treat this object as private.
22416 						 * So deduct its pages from our shared total and add it to the
22417 						 * private total.
22418 						 */
22419 
22420 						dirty_shared_count -= obj_pages_snapshot;
22421 						dirty_private_count += obj_pages_snapshot;
22422 					} else {
22423 						continue;
22424 					}
22425 				}
22426 			}
22427 
22428 
22429 			if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
22430 				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22431 			}
22432 
22433 			if (evaluation_phase == TRUE) {
22434 				continue;
22435 			}
22436 		}
22437 
22438 		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
22439 		*wired_count += src_object->wired_page_count;
22440 
22441 		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22442 			if (vm_compressor_low_on_space()) {
22443 				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22444 			}
22445 
22446 			if (vm_swap_low_on_space()) {
22447 				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22448 			}
22449 
22450 			kr = KERN_NO_SPACE;
22451 			break;
22452 		}
22453 		if (paged_out_count >= dirty_budget) {
22454 			break;
22455 		}
22456 		dirty_budget -= paged_out_count;
22457 	}
22458 
22459 	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
22460 	if (evaluation_phase) {
22461 		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
22462 
22463 		if (dirty_shared_count > shared_pages_threshold) {
22464 			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
22465 			kr = KERN_FAILURE;
22466 			goto done;
22467 		}
22468 
22469 		if (dirty_shared_count &&
22470 		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
22471 			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
22472 			kr = KERN_FAILURE;
22473 			goto done;
22474 		}
22475 
22476 		evaluation_phase = FALSE;
22477 		dirty_shared_count = dirty_private_count = 0;
22478 
22479 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22480 		clock_get_uptime(&c_freezer_last_yield_ts);
22481 
22482 		if (eval_only) {
22483 			kr = KERN_SUCCESS;
22484 			goto done;
22485 		}
22486 
22487 		vm_purgeable_purge_task_owned(task);
22488 
22489 		goto again;
22490 	} else {
22491 		kr = KERN_SUCCESS;
22492 	}
22493 
22494 done:
22495 	vm_map_unlock(map);
22496 
22497 	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
22498 		vm_object_compressed_freezer_done();
22499 	}
22500 	return kr;
22501 }
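
/*
 * Worked example for the evaluation phase above (numbers are illustrative):
 * with memorystatus_freeze_private_shared_pages_ratio == 2, a task with 1000
 * dirty private pages and 600 dirty shared pages yields 1000 / 600 == 1 in
 * integer arithmetic, which is below 2, so the freeze attempt fails with
 * FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO.
 */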
22502 
22503 #endif
22504 
22505 /*
22506  * vm_map_entry_should_cow_for_true_share:
22507  *
22508  * Determines if the map entry should be clipped and setup for copy-on-write
22509  * to avoid applying "true_share" to a large VM object when only a subset is
22510  * targeted.
22511  *
22512  * For now, we target only the map entries created for the Objective C
22513  * Garbage Collector, which initially have the following properties:
22514  *	- alias == VM_MEMORY_MALLOC
22515  *      - wired_count == 0
22516  *      - !needs_copy
22517  * and a VM object with:
22518  *      - internal
22519  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22520  *      - !true_share
22521  *      - vo_size == ANON_CHUNK_SIZE
22522  *
22523  * Only non-kernel map entries.
22524  */
22525 boolean_t
22526 vm_map_entry_should_cow_for_true_share(
22527 	vm_map_entry_t  entry)
22528 {
22529 	vm_object_t     object;
22530 
22531 	if (entry->is_sub_map) {
22532 		/* entry does not point at a VM object */
22533 		return FALSE;
22534 	}
22535 
22536 	if (entry->needs_copy) {
22537 		/* already set for copy_on_write: done! */
22538 		return FALSE;
22539 	}
22540 
22541 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22542 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22543 		/* not a malloc heap or Obj-C Garbage Collector heap */
22544 		return FALSE;
22545 	}
22546 
22547 	if (entry->wired_count) {
22548 		/* wired: can't change the map entry... */
22549 		vm_counters.should_cow_but_wired++;
22550 		return FALSE;
22551 	}
22552 
22553 	object = VME_OBJECT(entry);
22554 
22555 	if (object == VM_OBJECT_NULL) {
22556 		/* no object yet... */
22557 		return FALSE;
22558 	}
22559 
22560 	if (!object->internal) {
22561 		/* not an internal object */
22562 		return FALSE;
22563 	}
22564 
22565 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22566 		/* not the default copy strategy */
22567 		return FALSE;
22568 	}
22569 
22570 	if (object->true_share) {
22571 		/* already true_share: too late to avoid it */
22572 		return FALSE;
22573 	}
22574 
22575 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22576 	    object->vo_size != ANON_CHUNK_SIZE) {
22577 		/* ... not an object created for the ObjC Garbage Collector */
22578 		return FALSE;
22579 	}
22580 
22581 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22582 	    object->vo_size != 2048 * 4096) {
22583 		/* ... not a "MALLOC_SMALL" heap */
22584 		return FALSE;
22585 	}
22586 
22587 	/*
22588 	 * All the criteria match: we have a large object being targeted for "true_share".
22589 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
22590 	 * try and avoid setting up the entire object for "true_share" by clipping the
22591 	 * targeted range and setting it up for copy-on-write.
22592 	 */
22593 	return TRUE;
22594 }
22595 
22596 uint64_t vm_map_range_overflows_count = 0;
22597 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22598 bool
22599 vm_map_range_overflows(
22600 	vm_map_t map,
22601 	vm_map_offset_t addr,
22602 	vm_map_size_t size)
22603 {
22604 	vm_map_offset_t start, end, sum;
22605 	vm_map_offset_t pgmask;
22606 
22607 	if (size == 0) {
22608 		/* empty range -> no overflow */
22609 		return false;
22610 	}
22611 	pgmask = vm_map_page_mask(map);
22612 	start = vm_map_trunc_page_mask(addr, pgmask);
22613 	end = vm_map_round_page_mask(addr + size, pgmask);
22614 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22615 		vm_map_range_overflows_count++;
22616 		if (vm_map_range_overflows_log) {
22617 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22618 			    proc_selfpid(),
22619 			    proc_best_name(current_proc()),
22620 			    (uint64_t)addr,
22621 			    (uint64_t)size,
22622 			    (uint64_t)pgmask);
22623 		}
22624 		DTRACE_VM4(vm_map_range_overflows,
22625 		    vm_map_t, map,
22626 		    uint32_t, pgmask,
22627 		    uint64_t, (uint64_t)addr,
22628 		    uint64_t, (uint64_t)size);
22629 		return true;
22630 	}
22631 	return false;
22632 }
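
/*
 * Worked example (illustrative, 4KB map pages): addr == 0xfffffffffffff000
 * with size == 0x2000 wraps past the top of the address space, so
 * os_add_overflow() trips and the range is rejected; addr == 0x1000 with
 * size == 0x2000 rounds to [0x1000, 0x3000) and is accepted.
 */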
22633 
22634 vm_map_offset_t
22635 vm_map_round_page_mask(
22636 	vm_map_offset_t offset,
22637 	vm_map_offset_t mask)
22638 {
22639 	return VM_MAP_ROUND_PAGE(offset, mask);
22640 }
22641 
22642 vm_map_offset_t
22643 vm_map_trunc_page_mask(
22644 	vm_map_offset_t offset,
22645 	vm_map_offset_t mask)
22646 {
22647 	return VM_MAP_TRUNC_PAGE(offset, mask);
22648 }
22649 
22650 boolean_t
22651 vm_map_page_aligned(
22652 	vm_map_offset_t offset,
22653 	vm_map_offset_t mask)
22654 {
22655 	return ((offset) & mask) == 0;
22656 }
22657 
22658 int
22659 vm_map_page_shift(
22660 	vm_map_t map)
22661 {
22662 	return VM_MAP_PAGE_SHIFT(map);
22663 }
22664 
22665 int
22666 vm_map_page_size(
22667 	vm_map_t map)
22668 {
22669 	return VM_MAP_PAGE_SIZE(map);
22670 }
22671 
22672 vm_map_offset_t
22673 vm_map_page_mask(
22674 	vm_map_t map)
22675 {
22676 	return VM_MAP_PAGE_MASK(map);
22677 }
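
/*
 * Illustrative relationship between the accessors above: for a map with 16KB
 * pages, vm_map_page_shift() is 14, vm_map_page_size() is 1 << 14 == 16384,
 * and vm_map_page_mask() is 16384 - 1 == 0x3fff, so
 * vm_map_page_aligned(addr, vm_map_page_mask(map)) tests the low 14 bits.
 */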
22678 
22679 kern_return_t
22680 vm_map_set_page_shift(
22681 	vm_map_t        map,
22682 	int             pageshift)
22683 {
22684 	if (map->hdr.nentries != 0) {
22685 		/* too late to change page size */
22686 		return KERN_FAILURE;
22687 	}
22688 
22689 	map->hdr.page_shift = (uint16_t)pageshift;
22690 
22691 	return KERN_SUCCESS;
22692 }
22693 
22694 kern_return_t
22695 vm_map_query_volatile(
22696 	vm_map_t        map,
22697 	mach_vm_size_t  *volatile_virtual_size_p,
22698 	mach_vm_size_t  *volatile_resident_size_p,
22699 	mach_vm_size_t  *volatile_compressed_size_p,
22700 	mach_vm_size_t  *volatile_pmap_size_p,
22701 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
22702 {
22703 	mach_vm_size_t  volatile_virtual_size;
22704 	mach_vm_size_t  volatile_resident_count;
22705 	mach_vm_size_t  volatile_compressed_count;
22706 	mach_vm_size_t  volatile_pmap_count;
22707 	mach_vm_size_t  volatile_compressed_pmap_count;
22708 	mach_vm_size_t  resident_count;
22709 	vm_map_entry_t  entry;
22710 	vm_object_t     object;
22711 
22712 	/* map should be locked by caller */
22713 
22714 	volatile_virtual_size = 0;
22715 	volatile_resident_count = 0;
22716 	volatile_compressed_count = 0;
22717 	volatile_pmap_count = 0;
22718 	volatile_compressed_pmap_count = 0;
22719 
22720 	for (entry = vm_map_first_entry(map);
22721 	    entry != vm_map_to_entry(map);
22722 	    entry = entry->vme_next) {
22723 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
22724 
22725 		if (entry->is_sub_map) {
22726 			continue;
22727 		}
22728 		if (!(entry->protection & VM_PROT_WRITE)) {
22729 			continue;
22730 		}
22731 		object = VME_OBJECT(entry);
22732 		if (object == VM_OBJECT_NULL) {
22733 			continue;
22734 		}
22735 		if (object->purgable != VM_PURGABLE_VOLATILE &&
22736 		    object->purgable != VM_PURGABLE_EMPTY) {
22737 			continue;
22738 		}
22739 		if (VME_OFFSET(entry)) {
22740 			/*
22741 			 * If the map entry has been split and the object now
22742 			 * appears several times in the VM map, we don't want
22743 			 * to count the object's resident_page_count more than
22744 			 * once.  We count it only for the first one, starting
22745 			 * at offset 0 and ignore the other VM map entries.
22746 			 */
22747 			continue;
22748 		}
22749 		resident_count = object->resident_page_count;
22750 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22751 			resident_count = 0;
22752 		} else {
22753 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22754 		}
22755 
22756 		volatile_virtual_size += entry->vme_end - entry->vme_start;
22757 		volatile_resident_count += resident_count;
22758 		if (object->pager) {
22759 			volatile_compressed_count +=
22760 			    vm_compressor_pager_get_count(object->pager);
22761 		}
22762 		pmap_compressed_bytes = 0;
22763 		pmap_resident_bytes =
22764 		    pmap_query_resident(map->pmap,
22765 		    entry->vme_start,
22766 		    entry->vme_end,
22767 		    &pmap_compressed_bytes);
22768 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22769 		volatile_compressed_pmap_count += (pmap_compressed_bytes
22770 		    / PAGE_SIZE);
22771 	}
22772 
22773 	/* map is still locked on return */
22774 
22775 	*volatile_virtual_size_p = volatile_virtual_size;
22776 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22777 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22778 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22779 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22780 
22781 	return KERN_SUCCESS;
22782 }
22783 
22784 void
22785 vm_map_sizes(vm_map_t map,
22786     vm_map_size_t * psize,
22787     vm_map_size_t * pfree,
22788     vm_map_size_t * plargest_free)
22789 {
22790 	vm_map_entry_t  entry;
22791 	vm_map_offset_t prev;
22792 	vm_map_size_t   free, total_free, largest_free;
22793 	boolean_t       end;
22794 
22795 	if (!map) {
22796 		*psize = *pfree = *plargest_free = 0;
22797 		return;
22798 	}
22799 	total_free = largest_free = 0;
22800 
22801 	vm_map_lock_read(map);
22802 	if (psize) {
22803 		*psize = map->max_offset - map->min_offset;
22804 	}
22805 
22806 	prev = map->min_offset;
22807 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22808 		end = (entry == vm_map_to_entry(map));
22809 
22810 		if (end) {
22811 			free = entry->vme_end   - prev;
22812 		} else {
22813 			free = entry->vme_start - prev;
22814 		}
22815 
22816 		total_free += free;
22817 		if (free > largest_free) {
22818 			largest_free = free;
22819 		}
22820 
22821 		if (end) {
22822 			break;
22823 		}
22824 		prev = entry->vme_end;
22825 	}
22826 	vm_map_unlock_read(map);
22827 	if (pfree) {
22828 		*pfree = total_free;
22829 	}
22830 	if (plargest_free) {
22831 		*plargest_free = largest_free;
22832 	}
22833 }
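
/*
 * Usage sketch for vm_map_sizes(): report how fragmented a map's free VA is
 * by comparing total free space with the largest single free run.  The
 * wrapper is hypothetical.
 */
#if 0 /* illustrative only, not compiled */
static void
example_report_fragmentation(vm_map_t map)
{
	vm_map_size_t size, free_va, largest;

	vm_map_sizes(map, &size, &free_va, &largest);
	printf("map %p: size 0x%llx free 0x%llx largest free run 0x%llx\n",
	    map, (uint64_t)size, (uint64_t)free_va, (uint64_t)largest);
}
#endif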
22834 
22835 #if VM_SCAN_FOR_SHADOW_CHAIN
22836 int
22837 vm_map_shadow_max(
22838 	vm_map_t map)
22839 {
22840 	int             shadows, shadows_max;
22841 	vm_map_entry_t  entry;
22842 	vm_object_t     object, next_object;
22843 
22844 	if (map == NULL) {
22845 		return 0;
22846 	}
22847 
22848 	shadows_max = 0;
22849 
22850 	vm_map_lock_read(map);
22851 
22852 	for (entry = vm_map_first_entry(map);
22853 	    entry != vm_map_to_entry(map);
22854 	    entry = entry->vme_next) {
22855 		if (entry->is_sub_map) {
22856 			continue;
22857 		}
22858 		object = VME_OBJECT(entry);
22859 		if (object == NULL) {
22860 			continue;
22861 		}
22862 		vm_object_lock_shared(object);
22863 		for (shadows = 0;
22864 		    object->shadow != NULL;
22865 		    shadows++, object = next_object) {
22866 			next_object = object->shadow;
22867 			vm_object_lock_shared(next_object);
22868 			vm_object_unlock(object);
22869 		}
22870 		vm_object_unlock(object);
22871 		if (shadows > shadows_max) {
22872 			shadows_max = shadows;
22873 		}
22874 	}
22875 
22876 	vm_map_unlock_read(map);
22877 
22878 	return shadows_max;
22879 }
22880 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22881 
22882 void
22883 vm_commit_pagezero_status(vm_map_t lmap)
22884 {
22885 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22886 }
22887 
22888 #if __x86_64__
22889 void
22890 vm_map_set_high_start(
22891 	vm_map_t        map,
22892 	vm_map_offset_t high_start)
22893 {
22894 	map->vmmap_high_start = high_start;
22895 }
22896 #endif /* __x86_64__ */
22897 
22898 #if CODE_SIGNING_MONITOR
22899 
22900 kern_return_t
22901 vm_map_entry_cs_associate(
22902 	vm_map_t                map,
22903 	vm_map_entry_t          entry,
22904 	vm_map_kernel_flags_t   vmk_flags)
22905 {
22906 	vm_object_t cs_object, cs_shadow, backing_object;
22907 	vm_object_offset_t cs_offset, backing_offset;
22908 	void *cs_blobs;
22909 	struct vnode *cs_vnode;
22910 	kern_return_t cs_ret;
22911 
22912 	if (map->pmap == NULL ||
22913 	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22914 	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22915 	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
22916 		return KERN_SUCCESS;
22917 	}
22918 
22919 	if (!(entry->protection & VM_PROT_EXECUTE)) {
22920 		/*
22921 		 * This memory region is not executable, so the code-signing
22922 		 * monitor would usually not care about it...
22923 		 */
22924 		if (vmk_flags.vmkf_remap_prot_copy &&
22925 		    (entry->max_protection & VM_PROT_EXECUTE)) {
22926 			/*
22927 			 * ... except if the memory region is being remapped
22928 			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22929 			 * which is what a debugger or dtrace would be doing
22930 			 * to prepare to modify an executable page to insert
22931 			 * a breakpoint or activate a probe.
22932 			 * In that case, fall through so that we can mark
22933 			 * this region as being "debugged" and no longer
22934 			 * strictly code-signed.
22935 			 */
22936 		} else {
22937 			/*
22938 			 * Really not executable, so no need to tell the
22939 			 * code-signing monitor.
22940 			 */
22941 			return KERN_SUCCESS;
22942 		}
22943 	}
22944 
22945 	vm_map_lock_assert_exclusive(map);
22946 
22947 	/*
22948 	 * Check for a debug association mapping before we check for used_for_jit. This
22949 	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22950 	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22951 	 * since they are mapped with RW or RX permissions, which the page table monitor
22952 	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22953 	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22954 	 * violation when those USER_EXEC pages are mapped as RW.
22955 	 *
22956 	 * Since these pages switch between RW and RX through mprotect, they mimic what
22957 	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22958 	 * on macOS systems, this works in our favor here and allows us to continue to
22959 	 * support these legacy-programmed applications without sacrificing security on
22960 	 * the page table or the code signing monitor. We don't need to explicitly check
22961 	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22962 	 * created with RX, then the application must map it as RW in order to first write
22963 	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22964 	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22965 	 * Similarly, if the mapping was created as RW, and then switched to RX,
22966 	 * vm_map_protect will again mark the entry as a copy, and both these cases
22967 	 * lead to this if-statement being entered.
22968 	 *
22969 	 * For more information: rdar://115313336.
22970 	 */
22971 	if (vmk_flags.vmkf_remap_prot_copy) {
22972 		cs_ret = csm_associate_debug_region(
22973 			map->pmap,
22974 			entry->vme_start,
22975 			entry->vme_end - entry->vme_start);
22976 
22977 		/*
22978 		 * csm_associate_debug_region returns not supported when the code signing
22979 		 * monitor is disabled. This is intentional, since cs_ret is checked towards
22980 		 * the end of the function, and if it is not supported, then we still want the
22981 		 * VM to perform code-signing enforcement on this entry. That said, if we don't
22982 		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22983 		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22984 		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22985 		 * cases, which will cause a violation when attempted to be mapped as writable).
22986 		 */
22987 		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22988 			entry->vme_xnu_user_debug = TRUE;
22989 		}
22990 #if DEVELOPMENT || DEBUG
22991 		if (vm_log_xnu_user_debug) {
22992 			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
22993 			    proc_selfpid(),
22994 			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22995 			    __FUNCTION__, __LINE__,
22996 			    map, entry,
22997 			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22998 			    entry->vme_xnu_user_debug,
22999 			    cs_ret);
23000 		}
23001 #endif /* DEVELOPMENT || DEBUG */
23002 		goto done;
23003 	}
23004 
23005 	if (entry->used_for_jit) {
23006 		cs_ret = csm_associate_jit_region(
23007 			map->pmap,
23008 			entry->vme_start,
23009 			entry->vme_end - entry->vme_start);
23010 		goto done;
23011 	}
23012 
23013 	cs_object = VME_OBJECT(entry);
23014 	vm_object_lock_shared(cs_object);
23015 	cs_offset = VME_OFFSET(entry);
23016 
23017 	/* find the VM object backed by the code-signed vnode */
23018 	for (;;) {
23019 		/* go to the bottom of cs_object's shadow chain */
23020 		for (;
23021 		    cs_object->shadow != VM_OBJECT_NULL;
23022 		    cs_object = cs_shadow) {
23023 			cs_shadow = cs_object->shadow;
23024 			cs_offset += cs_object->vo_shadow_offset;
23025 			vm_object_lock_shared(cs_shadow);
23026 			vm_object_unlock(cs_object);
23027 		}
23028 		if (cs_object->internal ||
23029 		    cs_object->pager == MEMORY_OBJECT_NULL) {
23030 			vm_object_unlock(cs_object);
23031 			return KERN_SUCCESS;
23032 		}
23033 
23034 		cs_offset += cs_object->paging_offset;
23035 
23036 		/*
23037 		 * cs_object could be backed by a:
23038 		 *      vnode_pager
23039 		 *	apple_protect_pager
23040 		 *      shared_region_pager
23041 		 *	fourk_pager (multiple backing objects -> fail?)
23042 		 * ask the pager if it has a backing VM object
23043 		 */
23044 		if (!memory_object_backing_object(cs_object->pager,
23045 		    cs_offset,
23046 		    &backing_object,
23047 		    &backing_offset)) {
23048 			/* no backing object: cs_object is it */
23049 			break;
23050 		}
23051 
23052 		/* look down the backing object's shadow chain */
23053 		vm_object_lock_shared(backing_object);
23054 		vm_object_unlock(cs_object);
23055 		cs_object = backing_object;
23056 		cs_offset = backing_offset;
23057 	}
23058 
23059 	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
23060 	if (cs_vnode == NULL) {
23061 		/* no vnode, no code signatures to associate */
23062 		cs_ret = KERN_SUCCESS;
23063 	} else {
23064 		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
23065 		    &cs_blobs);
23066 		assert(cs_ret == KERN_SUCCESS);
23067 		cs_ret = cs_associate_blob_with_mapping(map->pmap,
23068 		    entry->vme_start,
23069 		    (entry->vme_end - entry->vme_start),
23070 		    cs_offset,
23071 		    cs_blobs);
23072 	}
23073 	vm_object_unlock(cs_object);
23074 	cs_object = VM_OBJECT_NULL;
23075 
23076 done:
23077 	if (cs_ret == KERN_SUCCESS) {
23078 		DTRACE_VM2(vm_map_entry_cs_associate_success,
23079 		    vm_map_offset_t, entry->vme_start,
23080 		    vm_map_offset_t, entry->vme_end);
23081 		if (vm_map_executable_immutable) {
23082 			/*
23083 			 * Prevent this executable
23084 			 * mapping from being unmapped
23085 			 * or modified.
23086 			 */
23087 			entry->vme_permanent = TRUE;
23088 		}
23089 		/*
23090 		 * pmap says it will validate the
23091 		 * code-signing validity of pages
23092 		 * faulted in via this mapping, so
23093 		 * this map entry should be marked so
23094 		 * that vm_fault() bypasses code-signing
23095 		 * validation for faults coming through
23096 		 * this mapping.
23097 		 */
23098 		entry->csm_associated = TRUE;
23099 	} else if (cs_ret == KERN_NOT_SUPPORTED) {
23100 		/*
23101 		 * pmap won't check the code-signing
23102 		 * validity of pages faulted in via
23103 		 * this mapping, so VM should keep
23104 		 * doing it.
23105 		 */
23106 		DTRACE_VM3(vm_map_entry_cs_associate_off,
23107 		    vm_map_offset_t, entry->vme_start,
23108 		    vm_map_offset_t, entry->vme_end,
23109 		    int, cs_ret);
23110 	} else {
23111 		/*
23112 		 * A real error: do not allow
23113 		 * execution in this mapping.
23114 		 */
23115 		DTRACE_VM3(vm_map_entry_cs_associate_failure,
23116 		    vm_map_offset_t, entry->vme_start,
23117 		    vm_map_offset_t, entry->vme_end,
23118 		    int, cs_ret);
23119 		if (vmk_flags.vmkf_overwrite_immutable) {
23120 			/*
23121 			 * We can get here when we remap an apple_protect pager
23122 			 * on top of an already cs_associated executable mapping
23123 			 * with the same code signatures, so we don't want to
23124 			 * lose VM_PROT_EXECUTE in that case...
23125 			 */
23126 		} else {
23127 			entry->protection &= ~VM_PROT_ALLEXEC;
23128 			entry->max_protection &= ~VM_PROT_ALLEXEC;
23129 		}
23130 	}
23131 
23132 	return cs_ret;
23133 }
23134 
23135 #endif /* CODE_SIGNING_MONITOR */
23136 
23137 inline bool
23138 vm_map_is_corpse_source(vm_map_t map)
23139 {
23140 	bool status = false;
23141 	if (map) {
23142 		vm_map_lock_read(map);
23143 		status = map->corpse_source;
23144 		vm_map_unlock_read(map);
23145 	}
23146 	return status;
23147 }
23148 
23149 inline void
23150 vm_map_set_corpse_source(vm_map_t map)
23151 {
23152 	if (map) {
23153 		vm_map_lock(map);
23154 		map->corpse_source = true;
23155 		vm_map_unlock(map);
23156 	}
23157 }
23158 
23159 inline void
23160 vm_map_unset_corpse_source(vm_map_t map)
23161 {
23162 	if (map) {
23163 		vm_map_lock(map);
23164 		map->corpse_source = false;
23165 		vm_map_unlock(map);
23166 	}
23167 }
23168 /*
23169  * FORKED CORPSE FOOTPRINT
23170  *
23171  * A forked corpse gets a copy of the original VM map but its pmap is mostly
23172  * empty since it never ran and never got to fault in any pages.
23173  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
23174  * a forked corpse would therefore return very little information.
23175  *
23176  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
23177  * to vm_map_fork() to collect footprint information from the original VM map
23178  * and its pmap, and store it in the forked corpse's VM map.  That information
23179  * is stored in place of the VM map's "hole list" since we'll never need to
23180  * lookup for holes in the corpse's map.
23181  *
23182  * The corpse's footprint info looks like this:
23183  *
23184  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
23185  * as follows:
23186  *                     +---------------------------------------+
23187  *            header-> | cf_size                               |
23188  *                     +-------------------+-------------------+
23189  *                     | cf_last_region    | cf_last_zeroes    |
23190  *                     +-------------------+-------------------+
23191  *           region1-> | cfr_vaddr                             |
23192  *                     +-------------------+-------------------+
23193  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
23194  *                     +---------------------------------------+
23195  *                     | d4 | d5 | ...                         |
23196  *                     +---------------------------------------+
23197  *                     | ...                                   |
23198  *                     +-------------------+-------------------+
23199  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
23200  *                     +-------------------+-------------------+
23201  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
23202  *                     +---------------------------------------+
23203  *                     | d0 | d1 ...                           |
23204  *                     +---------------------------------------+
23205  *                       ...
23206  *                     +---------------------------------------+
23207  *       last region-> | cfr_vaddr                             |
23208  *                     +---------------------------------------+
23209  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
23210  *                     +---------------------------------------+
23211  *                       ...
23212  *                     +---------------------------------------+
23213  *                     | dx | dy | dz | na | na | na | na | na |
23214  *                     +---------------------------------------+
23215  *
23216  * where:
23217  *      cf_size:	total size of the buffer (rounded to page size)
23218  *      cf_last_region:	offset in the buffer of the last "region" sub-header
23219  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
23220  *			of last region
23221  *	cfr_vaddr:	virtual address of the start of the covered "region"
23222  *	cfr_num_pages:	number of pages in the covered "region"
23223  *	d*:		disposition of the page at that virtual address
23224  * Regions in the buffer are word-aligned.
23225  *
23226  * We estimate the size of the buffer based on the number of memory regions
23227  * and the virtual size of the address space.  While copying each memory region
23228  * during vm_map_fork(), we also collect the footprint info for that region
23229  * and store it in the buffer, packing it as much as possible (coalescing
23230  * contiguous memory regions to avoid having too many region headers and
23231  * avoiding long streaks of "zero" page dispositions by splitting footprint
23232  * "regions", so the number of regions in the footprint buffer might not match
23233  * the number of memory regions in the address space.
23234  *
23235  * We also have to copy the original task's "nonvolatile" ledgers since that's
23236  * part of the footprint and will need to be reported to any tool asking for
23237  * the footprint information of the forked corpse.
23238  */
23239 
23240 uint64_t vm_map_corpse_footprint_count = 0;
23241 uint64_t vm_map_corpse_footprint_size_avg = 0;
23242 uint64_t vm_map_corpse_footprint_size_max = 0;
23243 uint64_t vm_map_corpse_footprint_full = 0;
23244 uint64_t vm_map_corpse_footprint_no_buf = 0;
23245 
23246 struct vm_map_corpse_footprint_header {
23247 	vm_size_t       cf_size;        /* allocated buffer size */
23248 	uint32_t        cf_last_region; /* offset of last region in buffer */
23249 	union {
23250 		uint32_t cfu_last_zeroes; /* during creation:
23251 		                           * number of "zero" dispositions at
23252 		                           * end of last region */
23253 		uint32_t cfu_hint_region; /* during lookup:
23254 		                           * offset of last looked up region */
23255 #define cf_last_zeroes cfu.cfu_last_zeroes
23256 #define cf_hint_region cfu.cfu_hint_region
23257 	} cfu;
23258 };
23259 typedef uint8_t cf_disp_t;
23260 struct vm_map_corpse_footprint_region {
23261 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
23262 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
23263 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
23264 } __attribute__((packed));
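/*
 * Illustrative sketch (not used by the kernel): how a consumer walks the
 * packed footprint buffer, region by region, up to "cf_last_region".  This
 * mirrors the traversal done in vm_map_corpse_footprint_query_page_info()
 * below; the only assumption is a caller holding a stable pointer to the
 * buffer.
 *
 *	struct vm_map_corpse_footprint_header *hdr = map->vmmap_corpse_footprint;
 *	uint32_t off = sizeof(*hdr);             // first region follows the header
 *	for (;;) {
 *		struct vm_map_corpse_footprint_region *r =
 *		    (struct vm_map_corpse_footprint_region *)((char *)hdr + off);
 *		// r->cfr_vaddr covers r->cfr_num_pages pages, one cf_disp_t each
 *		if (off >= hdr->cf_last_region) {
 *			break;                   // that was the last region
 *		}
 *		off += sizeof(*r) + r->cfr_num_pages * sizeof(cf_disp_t);
 *		off = roundup(off, sizeof(int)); // regions are word-aligned
 *	}
 */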
23265 
23266 static cf_disp_t
23267 vm_page_disposition_to_cf_disp(
23268 	int disposition)
23269 {
23270 	assert(sizeof(cf_disp_t) == 1);
23271 	/* relocate bits that don't fit in a "uint8_t" */
23272 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23273 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23274 	}
23275 	/* cast gets rid of extra bits */
23276 	return (cf_disp_t) disposition;
23277 }
23278 
23279 static int
23280 vm_page_cf_disp_to_disposition(
23281 	cf_disp_t cf_disp)
23282 {
23283 	int disposition;
23284 
23285 	assert(sizeof(cf_disp_t) == 1);
23286 	disposition = (int) cf_disp;
23287 	/* move relocated bits back in place */
23288 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23289 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23290 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23291 	}
23292 	return disposition;
23293 }
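/*
 * Encoding round-trip sketch (illustrative only): VM_PAGE_QUERY_PAGE_REUSABLE
 * does not fit in the 8-bit cf_disp_t, so the encoder borrows the
 * VM_PAGE_QUERY_PAGE_FICTITIOUS bit (never relevant for footprint pages) to
 * carry it, and the decoder reverses the substitution.  For any disposition
 * that does not have the FICTITIOUS bit set, the conversion is lossless in
 * the bits the footprint cares about:
 *
 *	int d = VM_PAGE_QUERY_PAGE_PRESENT | VM_PAGE_QUERY_PAGE_REUSABLE;
 *	assert(vm_page_cf_disp_to_disposition(
 *		vm_page_disposition_to_cf_disp(d)) == d);
 */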
23294 
23295 static kmem_guard_t
23296 vm_map_corpse_footprint_guard(vm_map_t map)
23297 {
23298 	return (kmem_guard_t){
23299 		       .kmg_atomic = true,
23300 		       .kmg_tag = VM_KERN_MEMORY_DIAG,
23301 		       .kmg_context = os_hash_kernel_pointer(&map->vmmap_corpse_footprint),
23302 	};
23303 }
23304 
23305 /*
23306  * vm_map_corpse_footprint_new_region:
23307  *      closes the current footprint "region" and creates a new one
23308  *
23309  * Returns NULL if there's not enough space in the buffer for a new region.
23310  */
23311 static struct vm_map_corpse_footprint_region *
23312 vm_map_corpse_footprint_new_region(
23313 	struct vm_map_corpse_footprint_header *footprint_header)
23314 {
23315 	uintptr_t       footprint_edge;
23316 	uint32_t        new_region_offset;
23317 	struct vm_map_corpse_footprint_region *footprint_region;
23318 	struct vm_map_corpse_footprint_region *new_footprint_region;
23319 
23320 	footprint_edge = ((uintptr_t)footprint_header +
23321 	    footprint_header->cf_size);
23322 	footprint_region = ((struct vm_map_corpse_footprint_region *)
23323 	    ((char *)footprint_header +
23324 	    footprint_header->cf_last_region));
23325 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
23326 	    footprint_edge);
23327 
23328 	/* get rid of trailing zeroes in the last region */
23329 	assert(footprint_region->cfr_num_pages >=
23330 	    footprint_header->cf_last_zeroes);
23331 	footprint_region->cfr_num_pages -=
23332 	    footprint_header->cf_last_zeroes;
23333 	footprint_header->cf_last_zeroes = 0;
23334 
23335 	/* reuse this region if it's now empty */
23336 	if (footprint_region->cfr_num_pages == 0) {
23337 		return footprint_region;
23338 	}
23339 
23340 	/* compute offset of new region */
23341 	new_region_offset = footprint_header->cf_last_region;
23342 	new_region_offset += sizeof(*footprint_region);
23343 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23344 	new_region_offset = roundup(new_region_offset, sizeof(int));
23345 
23346 	/* check if we're going over the edge */
23347 	if (((uintptr_t)footprint_header +
23348 	    new_region_offset +
23349 	    sizeof(*footprint_region)) >=
23350 	    footprint_edge) {
23351 		/* over the edge: no new region */
23352 		return NULL;
23353 	}
23354 
23355 	/* adjust offset of last region in header */
23356 	footprint_header->cf_last_region = new_region_offset;
23357 
23358 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
23359 	    ((char *)footprint_header +
23360 	    footprint_header->cf_last_region);
23361 	new_footprint_region->cfr_vaddr = 0;
23362 	new_footprint_region->cfr_num_pages = 0;
23363 	/* caller needs to initialize new region */
23364 
23365 	return new_footprint_region;
23366 }
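/*
 * Worked example (hypothetical numbers): if the last region starts at some
 * offset in the buffer and holds 10 pages after its trailing zeroes have been
 * stripped, the next region is placed at
 *	roundup(last_region_offset + sizeof(region header) + 10 * sizeof(cf_disp_t),
 *	        sizeof(int))
 * i.e. just past the last disposition byte, rounded up to the next word,
 * provided that offset plus another region header still fits below
 * "footprint_edge"; otherwise this function returns NULL and the caller takes
 * the "over_the_edge" path and reports KERN_RESOURCE_SHORTAGE.
 */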
23367 
23368 /*
23369  * vm_map_corpse_footprint_collect:
23370  *	collects footprint information for "old_entry" in "old_map" and
23371  *	stores it in "new_map"'s vmmap_corpse_footprint buffer.
23372  */
23373 kern_return_t
23374 vm_map_corpse_footprint_collect(
23375 	vm_map_t        old_map,
23376 	vm_map_entry_t  old_entry,
23377 	vm_map_t        new_map)
23378 {
23379 	vm_map_offset_t va;
23380 	kmem_return_t kmr;
23381 	struct vm_map_corpse_footprint_header *footprint_header;
23382 	struct vm_map_corpse_footprint_region *footprint_region;
23383 	struct vm_map_corpse_footprint_region *new_footprint_region;
23384 	cf_disp_t       *next_disp_p;
23385 	uintptr_t       footprint_edge;
23386 	uint32_t        num_pages_tmp;
23387 	int             effective_page_size;
23388 
23389 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23390 
23391 	va = old_entry->vme_start;
23392 
23393 	vm_map_lock_assert_exclusive(old_map);
23394 	vm_map_lock_assert_exclusive(new_map);
23395 
23396 	assert(new_map->has_corpse_footprint);
23397 	assert(!old_map->has_corpse_footprint);
23398 	if (!new_map->has_corpse_footprint ||
23399 	    old_map->has_corpse_footprint) {
23400 		/*
23401 		 * This can only transfer footprint info from a
23402 		 * map with a live pmap to a map with a corpse footprint.
23403 		 */
23404 		return KERN_NOT_SUPPORTED;
23405 	}
23406 
23407 	if (new_map->vmmap_corpse_footprint == NULL) {
23408 		vm_size_t buf_size;
23409 
23410 		buf_size = (sizeof(*footprint_header) +
23411 		    (old_map->hdr.nentries
23412 		    *
23413 		    (sizeof(*footprint_region) +
23414 		    +3))            /* potential alignment for each region */
23415 		    +
23416 		    ((old_map->size / effective_page_size)
23417 		    *
23418 		    sizeof(cf_disp_t)));      /* disposition for each page */
23419 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23420 		buf_size = round_page(buf_size);
23421 
23422 		/* limit buffer to 1 page to validate overflow detection */
23423 //		buf_size = PAGE_SIZE;
23424 
23425 		/* limit size to a somewhat sane amount */
23426 #if XNU_TARGET_OS_OSX
23427 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
23428 #else /* XNU_TARGET_OS_OSX */
23429 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
23430 #endif /* XNU_TARGET_OS_OSX */
23431 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23432 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23433 		}
23434 		kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23435 		kmr = kmem_alloc_guard(kernel_map, buf_size + PAGE_SIZE, 0,
23436 		    KMA_DATA | KMA_GUARD_LAST | KMA_KOBJECT | KMA_ZERO,
23437 		    guard);
23438 		if (kmr.kmr_return != KERN_SUCCESS) {
23439 			vm_map_corpse_footprint_no_buf++;
23440 			return kmr.kmr_return;
23441 		}
23442 
23443 		/* initialize header and 1st region */
23444 		footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23445 		assert3p(footprint_header, !=, NULL);
23446 		new_map->vmmap_corpse_footprint = footprint_header;
23447 
23448 		footprint_header->cf_size = buf_size;
23449 		footprint_header->cf_last_region =
23450 		    sizeof(*footprint_header);
23451 		footprint_header->cf_last_zeroes = 0;
23452 
23453 		footprint_region = (struct vm_map_corpse_footprint_region *)
23454 		    ((char *)footprint_header +
23455 		    footprint_header->cf_last_region);
23456 		footprint_region->cfr_vaddr = 0;
23457 		footprint_region->cfr_num_pages = 0;
23458 	} else {
23459 		/* retrieve header and last region */
23460 		footprint_header = (struct vm_map_corpse_footprint_header *)
23461 		    new_map->vmmap_corpse_footprint;
23462 		footprint_region = (struct vm_map_corpse_footprint_region *)
23463 		    ((char *)footprint_header +
23464 		    footprint_header->cf_last_region);
23465 	}
23466 	footprint_edge = ((uintptr_t)footprint_header +
23467 	    footprint_header->cf_size);
23468 
23469 	if ((footprint_region->cfr_vaddr +
23470 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23471 	    effective_page_size))
23472 	    != old_entry->vme_start) {
23473 		uint64_t num_pages_delta, num_pages_delta_size;
23474 		uint32_t region_offset_delta_size;
23475 
23476 		/*
23477 		 * Not the next contiguous virtual address:
23478 		 * start a new region or store "zero" dispositions for
23479 		 * the missing pages?
23480 		 */
23481 		/* size of gap in actual page dispositions */
23482 		num_pages_delta = ((old_entry->vme_start -
23483 		    footprint_region->cfr_vaddr) / effective_page_size)
23484 		    - footprint_region->cfr_num_pages;
23485 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23486 		/* size of gap as a new footprint region header */
23487 		region_offset_delta_size =
23488 		    (sizeof(*footprint_region) +
23489 		    roundup(((footprint_region->cfr_num_pages -
23490 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23491 		    sizeof(int)) -
23492 		    ((footprint_region->cfr_num_pages -
23493 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23494 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23495 		if (region_offset_delta_size < num_pages_delta_size ||
23496 		    os_add3_overflow(footprint_region->cfr_num_pages,
23497 		    (uint32_t) num_pages_delta,
23498 		    1,
23499 		    &num_pages_tmp)) {
23500 			/*
23501 			 * Storing data for this gap would take more space
23502 			 * than inserting a new footprint region header:
23503 			 * let's start a new region and save space. If it's a
23504 			 * tie, let's avoid using a new region, since that
23505 			 * would require more region hops to find the right
23506 			 * range during lookups.
23507 			 *
23508 			 * If the current region's cfr_num_pages would overflow
23509 			 * if we added "zero" page dispositions for the gap,
23510 			 * no choice but to start a new region.
23511 			 */
23512 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23513 			new_footprint_region =
23514 			    vm_map_corpse_footprint_new_region(footprint_header);
23515 			/* check that we're not going over the edge */
23516 			if (new_footprint_region == NULL) {
23517 				goto over_the_edge;
23518 			}
23519 			footprint_region = new_footprint_region;
23520 			/* initialize new region as empty */
23521 			footprint_region->cfr_vaddr = old_entry->vme_start;
23522 			footprint_region->cfr_num_pages = 0;
23523 		} else {
23524 			/*
23525 			 * Store "zero" page dispositions for the missing
23526 			 * pages.
23527 			 */
23528 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23529 			for (; num_pages_delta > 0; num_pages_delta--) {
23530 				next_disp_p = (cf_disp_t *)
23531 				    ((uintptr_t) footprint_region +
23532 				    sizeof(*footprint_region));
23533 				next_disp_p += footprint_region->cfr_num_pages;
23534 				/* check that we're not going over the edge */
23535 				if ((uintptr_t)next_disp_p >= footprint_edge) {
23536 					goto over_the_edge;
23537 				}
23538 				/* store "zero" disposition for this gap page */
23539 				footprint_region->cfr_num_pages++;
23540 				*next_disp_p = (cf_disp_t) 0;
23541 				footprint_header->cf_last_zeroes++;
23542 			}
23543 		}
23544 	}
23545 
23546 	for (va = old_entry->vme_start;
23547 	    va < old_entry->vme_end;
23548 	    va += effective_page_size) {
23549 		int             disposition;
23550 		cf_disp_t       cf_disp;
23551 
23552 		vm_map_footprint_query_page_info(old_map,
23553 		    old_entry,
23554 		    va,
23555 		    &disposition);
23556 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
23557 
23558 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23559 
23560 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23561 			/*
23562 			 * Ignore "zero" dispositions at start of
23563 			 * region: just move start of region.
23564 			 */
23565 			footprint_region->cfr_vaddr += effective_page_size;
23566 			continue;
23567 		}
23568 
23569 		/* would region's cfr_num_pages overflow? */
23570 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23571 		    &num_pages_tmp)) {
23572 			/* overflow: create a new region */
23573 			new_footprint_region =
23574 			    vm_map_corpse_footprint_new_region(
23575 				footprint_header);
23576 			if (new_footprint_region == NULL) {
23577 				goto over_the_edge;
23578 			}
23579 			footprint_region = new_footprint_region;
23580 			footprint_region->cfr_vaddr = va;
23581 			footprint_region->cfr_num_pages = 0;
23582 		}
23583 
23584 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23585 		    sizeof(*footprint_region));
23586 		next_disp_p += footprint_region->cfr_num_pages;
23587 		/* check that we're not going over the edge */
23588 		if ((uintptr_t)next_disp_p >= footprint_edge) {
23589 			goto over_the_edge;
23590 		}
23591 		/* store this disposition */
23592 		*next_disp_p = cf_disp;
23593 		footprint_region->cfr_num_pages++;
23594 
23595 		if (cf_disp != 0) {
23596 			/* non-zero disp: break the current zero streak */
23597 			footprint_header->cf_last_zeroes = 0;
23598 			/* done */
23599 			continue;
23600 		}
23601 
23602 		/* zero disp: add to the current streak of zeroes */
23603 		footprint_header->cf_last_zeroes++;
23604 		if ((footprint_header->cf_last_zeroes +
23605 		    roundup(((footprint_region->cfr_num_pages -
23606 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23607 		    (sizeof(int) - 1),
23608 		    sizeof(int))) <
23609 		    (sizeof(*footprint_header))) {
23610 			/*
23611 			 * There are not enough trailing "zero" dispositions
23612 			 * (+ the extra padding we would need for the previous
23613 			 * region); creating a new region would not save space
23614 			 * at this point, so let's keep this "zero" disposition
23615 			 * in this region and reconsider later.
23616 			 */
23617 			continue;
23618 		}
23619 		/*
23620 		 * Create a new region to avoid having too many consecutive
23621 		 * "zero" dispositions.
23622 		 */
23623 		new_footprint_region =
23624 		    vm_map_corpse_footprint_new_region(footprint_header);
23625 		if (new_footprint_region == NULL) {
23626 			goto over_the_edge;
23627 		}
23628 		footprint_region = new_footprint_region;
23629 		/* initialize the new region as empty ... */
23630 		footprint_region->cfr_num_pages = 0;
23631 		/* ... and skip this "zero" disp */
23632 		footprint_region->cfr_vaddr = va + effective_page_size;
23633 	}
23634 
23635 	return KERN_SUCCESS;
23636 
23637 over_the_edge:
23638 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23639 	vm_map_corpse_footprint_full++;
23640 	return KERN_RESOURCE_SHORTAGE;
23641 }
23642 
23643 /*
23644  * vm_map_corpse_footprint_collect_done:
23645  *	completes the footprint collection by getting rid of any remaining
23646  *	trailing "zero" dispositions and trimming the unused part of the
23647  *	kernel buffer
23648  */
23649 void
23650 vm_map_corpse_footprint_collect_done(
23651 	vm_map_t        new_map)
23652 {
23653 	struct vm_map_corpse_footprint_header *footprint_header;
23654 	struct vm_map_corpse_footprint_region *footprint_region;
23655 	vm_size_t       buf_size, actual_size;
23656 
23657 	assert(new_map->has_corpse_footprint);
23658 	if (!new_map->has_corpse_footprint ||
23659 	    new_map->vmmap_corpse_footprint == NULL) {
23660 		return;
23661 	}
23662 
23663 	footprint_header = (struct vm_map_corpse_footprint_header *)
23664 	    new_map->vmmap_corpse_footprint;
23665 	buf_size = footprint_header->cf_size;
23666 
23667 	footprint_region = (struct vm_map_corpse_footprint_region *)
23668 	    ((char *)footprint_header +
23669 	    footprint_header->cf_last_region);
23670 
23671 	/* get rid of trailing zeroes in last region */
23672 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23673 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23674 	footprint_header->cf_last_zeroes = 0;
23675 
23676 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
23677 	    sizeof(*footprint_region) +
23678 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23679 
23680 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23681 	vm_map_corpse_footprint_size_avg =
23682 	    (((vm_map_corpse_footprint_size_avg *
23683 	    vm_map_corpse_footprint_count) +
23684 	    actual_size) /
23685 	    (vm_map_corpse_footprint_count + 1));
23686 	vm_map_corpse_footprint_count++;
23687 	if (actual_size > vm_map_corpse_footprint_size_max) {
23688 		vm_map_corpse_footprint_size_max = actual_size;
23689 	}
23690 
23691 	actual_size = round_page(actual_size);
23692 	assert3u(buf_size, >=, actual_size);
23693 	if (buf_size > actual_size) {
23694 		/*
23695 		 * Free unused space at the end of the buffer
23696 		 */
23697 		kmem_guard_t guard = vm_map_corpse_footprint_guard(new_map);
23698 		kmem_return_t kmr = kmem_realloc_guard(kernel_map,
23699 		    (vm_offset_t)footprint_header,
23700 		    /* Account for guard page */
23701 		    buf_size + PAGE_SIZE,
23702 		    actual_size + PAGE_SIZE,
23703 		    KMR_DATA | KMR_GUARD_LAST | KMR_FREEOLD | KMR_KOBJECT,
23704 		    guard);
23705 		assertf(kmr.kmr_return == KERN_SUCCESS,
23706 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23707 		    footprint_header,
23708 		    (uint64_t) buf_size,
23709 		    (uint64_t) actual_size,
23710 		    kmr.kmr_return);
23711 		footprint_header = (struct vm_map_corpse_footprint_header *)kmr.kmr_ptr;
23712 		assert3p(footprint_header, !=, NULL);
23713 		new_map->vmmap_corpse_footprint = footprint_header;
23714 		footprint_region = NULL;
23715 	}
23716 
23717 	footprint_header->cf_size = actual_size;
23718 }
23719 
23720 /*
23721  * vm_map_corpse_footprint_query_page_info:
23722  *	retrieves the disposition of the page at virtual address "vaddr"
23723  *	in the forked corpse's VM map
23724  *
23725  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23726  */
23727 kern_return_t
23728 vm_map_corpse_footprint_query_page_info(
23729 	vm_map_t        map,
23730 	vm_map_offset_t va,
23731 	int             *disposition_p)
23732 {
23733 	struct vm_map_corpse_footprint_header *footprint_header;
23734 	struct vm_map_corpse_footprint_region *footprint_region;
23735 	uint32_t        footprint_region_offset;
23736 	vm_map_offset_t region_start, region_end;
23737 	int             disp_idx;
23738 	kern_return_t   kr;
23739 	int             effective_page_size;
23740 	cf_disp_t       cf_disp;
23741 
23742 	if (!map->has_corpse_footprint) {
23743 		*disposition_p = 0;
23744 		kr = KERN_INVALID_ARGUMENT;
23745 		goto done;
23746 	}
23747 
23748 	footprint_header = map->vmmap_corpse_footprint;
23749 	if (footprint_header == NULL) {
23750 		*disposition_p = 0;
23751 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23752 		kr = KERN_INVALID_ARGUMENT;
23753 		goto done;
23754 	}
23755 
23756 	/* start looking at the hint ("cf_hint_region") */
23757 	footprint_region_offset = footprint_header->cf_hint_region;
23758 
23759 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23760 
23761 lookup_again:
23762 	if (footprint_region_offset < sizeof(*footprint_header)) {
23763 		/* hint too low: start from 1st region */
23764 		footprint_region_offset = sizeof(*footprint_header);
23765 	}
23766 	if (footprint_region_offset > footprint_header->cf_last_region) {
23767 		/* hint too high: re-start from 1st region */
23768 		footprint_region_offset = sizeof(*footprint_header);
23769 	}
23770 	footprint_region = (struct vm_map_corpse_footprint_region *)
23771 	    ((char *)footprint_header + footprint_region_offset);
23772 	region_start = footprint_region->cfr_vaddr;
23773 	region_end = (region_start +
23774 	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23775 	    effective_page_size));
23776 	if (va < region_start &&
23777 	    footprint_region_offset != sizeof(*footprint_header)) {
23778 		/* our range starts before the hint region */
23779 
23780 		/* reset the hint (in a racy way...) */
23781 		footprint_header->cf_hint_region = sizeof(*footprint_header);
23782 		/* lookup "va" again from 1st region */
23783 		footprint_region_offset = sizeof(*footprint_header);
23784 		goto lookup_again;
23785 	}
23786 
23787 	while (va >= region_end) {
23788 		if (footprint_region_offset >= footprint_header->cf_last_region) {
23789 			break;
23790 		}
23791 		/* skip the region's header */
23792 		footprint_region_offset += sizeof(*footprint_region);
23793 		/* skip the region's page dispositions */
23794 		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23795 		/* align to next word boundary */
23796 		footprint_region_offset =
23797 		    roundup(footprint_region_offset,
23798 		    sizeof(int));
23799 		footprint_region = (struct vm_map_corpse_footprint_region *)
23800 		    ((char *)footprint_header + footprint_region_offset);
23801 		region_start = footprint_region->cfr_vaddr;
23802 		region_end = (region_start +
23803 		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23804 		    effective_page_size));
23805 	}
23806 	if (va < region_start || va >= region_end) {
23807 		/* page not found */
23808 		*disposition_p = 0;
23809 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23810 		kr = KERN_SUCCESS;
23811 		goto done;
23812 	}
23813 
23814 	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
23815 	footprint_header->cf_hint_region = footprint_region_offset;
23816 
23817 	/* get page disposition for "va" in this region */
23818 	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23819 	cf_disp = footprint_region->cfr_disposition[disp_idx];
23820 	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23821 	kr = KERN_SUCCESS;
23822 done:
23823 //	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23824 	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23825 	DTRACE_VM4(footprint_query_page_info,
23826 	    vm_map_t, map,
23827 	    vm_map_offset_t, va,
23828 	    int, *disposition_p,
23829 	    kern_return_t, kr);
23830 
23831 	return kr;
23832 }
23833 
23834 void
23835 vm_map_corpse_footprint_destroy(
23836 	vm_map_t        map)
23837 {
23838 	if (map->has_corpse_footprint &&
23839 	    map->vmmap_corpse_footprint != NULL) {
23840 		struct vm_map_corpse_footprint_header *footprint_header;
23841 		vm_size_t buf_size;
23842 
23843 		footprint_header = map->vmmap_corpse_footprint;
23844 		buf_size = footprint_header->cf_size;
23845 		kmem_guard_t guard = vm_map_corpse_footprint_guard(map);
23846 		kmem_free_guard(kernel_map, (vm_offset_t)footprint_header,
23847 		    buf_size + PAGE_SIZE,
23848 		    KMF_GUARD_LAST, guard);
23849 		map->vmmap_corpse_footprint = NULL;
23850 		map->has_corpse_footprint = FALSE;
23851 	}
23852 }
23853 
23854 /*
23855  * vm_map_copy_footprint_ledgers:
23856  *	copies any ledger that's relevant to the memory footprint of "old_task"
23857  *	into the forked corpse's task ("new_task")
23858  */
23859 void
23860 vm_map_copy_footprint_ledgers(
23861 	task_t  old_task,
23862 	task_t  new_task)
23863 {
23864 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23865 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23866 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23867 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23868 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23869 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23870 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23871 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23872 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23873 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23874 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23875 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23876 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23877 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23878 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23879 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23880 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23881 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23882 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23883 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23884 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23885 }
23886 
23887 /*
23888  * vm_map_copy_ledger:
23889  *	copy a single ledger from "old_task" to "new_task"
23890  */
23891 void
23892 vm_map_copy_ledger(
23893 	task_t  old_task,
23894 	task_t  new_task,
23895 	int     ledger_entry)
23896 {
23897 	ledger_amount_t old_balance, new_balance, delta;
23898 
23899 	assert(new_task->map->has_corpse_footprint);
23900 	if (!new_task->map->has_corpse_footprint) {
23901 		return;
23902 	}
23903 
23904 	/* turn off sanity checks for the ledger we're about to mess with */
23905 	ledger_disable_panic_on_negative(new_task->ledger,
23906 	    ledger_entry);
23907 
23908 	/* adjust "new_task" to match "old_task" */
23909 	ledger_get_balance(old_task->ledger,
23910 	    ledger_entry,
23911 	    &old_balance);
23912 	ledger_get_balance(new_task->ledger,
23913 	    ledger_entry,
23914 	    &new_balance);
23915 	if (new_balance == old_balance) {
23916 		/* new == old: done */
23917 	} else if (new_balance > old_balance) {
23918 		/* new > old ==> new -= new - old */
23919 		delta = new_balance - old_balance;
23920 		ledger_debit(new_task->ledger,
23921 		    ledger_entry,
23922 		    delta);
23923 	} else {
23924 		/* new < old ==> new += old - new */
23925 		delta = old_balance - new_balance;
23926 		ledger_credit(new_task->ledger,
23927 		    ledger_entry,
23928 		    delta);
23929 	}
23930 }
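/*
 * Example of the adjustment above (hypothetical balances): if the original
 * task's phys_footprint ledger reads 300 pages and the corpse's reads 40,
 * the corpse is credited 260 so both report 300; if the corpse somehow read
 * 350, it would be debited 50.  Panic-on-negative is disabled first because
 * the corpse's ledger never saw the individual credits and debits that
 * produced the original balance.
 */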
23931 
23932 /*
23933  * vm_map_get_pmap:
23934  * returns the pmap associated with the vm_map
23935  */
23936 pmap_t
23937 vm_map_get_pmap(vm_map_t map)
23938 {
23939 	return vm_map_pmap(map);
23940 }
23941 
23942 ppnum_t
23943 vm_map_get_phys_page(
23944 	vm_map_t                map,
23945 	vm_offset_t             addr)
23946 {
23947 	vm_object_offset_t      offset;
23948 	vm_object_t             object;
23949 	vm_map_offset_t         map_offset;
23950 	vm_map_entry_t          entry;
23951 	ppnum_t                 phys_page = 0;
23952 
23953 	map_offset = vm_map_trunc_page(addr, PAGE_MASK);
23954 
23955 	vm_map_lock(map);
23956 	while (vm_map_lookup_entry(map, map_offset, &entry)) {
23957 		if (entry->is_sub_map) {
23958 			vm_map_t        old_map;
23959 			vm_map_lock(VME_SUBMAP(entry));
23960 			old_map = map;
23961 			map = VME_SUBMAP(entry);
23962 			map_offset = (VME_OFFSET(entry) +
23963 			    (map_offset - entry->vme_start));
23964 			vm_map_unlock(old_map);
23965 			continue;
23966 		}
23967 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
23968 			vm_map_unlock(map);
23969 			return (ppnum_t) 0;
23970 		}
23971 		if (VME_OBJECT(entry)->phys_contiguous) {
23972 			/* These are  not standard pageable memory mappings */
23973 			/* If they are not present in the object they will  */
23974 			/* have to be picked up from the pager through the  */
23975 			/* fault mechanism.  */
23976 			if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
23977 				/* need to call vm_fault */
23978 				vm_map_unlock(map);
23979 				vm_fault(map, map_offset, VM_PROT_NONE,
23980 				    FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
23981 				    THREAD_UNINT, NULL, 0);
23982 				vm_map_lock(map);
23983 				continue;
23984 			}
23985 			offset = (VME_OFFSET(entry) +
23986 			    (map_offset - entry->vme_start));
23987 			phys_page = (ppnum_t)
23988 			    ((VME_OBJECT(entry)->vo_shadow_offset
23989 			    + offset) >> PAGE_SHIFT);
23990 			break;
23991 		}
23992 		offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
23993 		object = VME_OBJECT(entry);
23994 		vm_object_lock(object);
23995 		while (TRUE) {
23996 			vm_page_t dst_page = vm_page_lookup(object, offset);
23997 			if (dst_page == VM_PAGE_NULL) {
23998 				if (object->shadow) {
23999 					vm_object_t old_object;
24000 					vm_object_lock(object->shadow);
24001 					old_object = object;
24002 					offset = offset + object->vo_shadow_offset;
24003 					object = object->shadow;
24004 					vm_object_unlock(old_object);
24005 				} else {
24006 					vm_object_unlock(object);
24007 					break;
24008 				}
24009 			} else {
24010 				phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
24011 				vm_object_unlock(object);
24012 				break;
24013 			}
24014 		}
24015 		break;
24016 	}
24017 
24018 	vm_map_unlock(map);
24019 	return phys_page;
24020 }
24021 
24022 #if CONFIG_MAP_RANGES
24023 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
24024 static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];
24025 
24026 static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24027 static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24028 
24029 /*
24030  * vm_map_range_map_init:
24031  *  initializes the VM range ID map to enable index lookup
24032  *  of user VM ranges based on VM tag from userspace.
24033  */
24034 static void
24035 vm_map_range_map_init(void)
24036 {
24037 	/*
24038 	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
24039 	 * - the former is malloc metadata which should be kept separate
24040 	 * - the latter has its own ranges
24041 	 */
24042 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
24043 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
24044 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
24045 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
24046 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
24047 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
24048 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
24049 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
24050 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
24051 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
24052 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
24053 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
24054 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
24055 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
24056 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
24057 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
24058 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
24059 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
24060 }
24061 
24062 static struct mach_vm_range
24063 vm_map_range_random_uniform(
24064 	vm_map_size_t           req_size,
24065 	vm_map_offset_t         min_addr,
24066 	vm_map_offset_t         max_addr,
24067 	vm_map_offset_t         offmask)
24068 {
24069 	vm_map_offset_t random_addr;
24070 	struct mach_vm_range alloc;
24071 
24072 	req_size = (req_size + offmask) & ~offmask;
24073 	min_addr = (min_addr + offmask) & ~offmask;
24074 	max_addr = max_addr & ~offmask;
24075 
24076 	read_random(&random_addr, sizeof(random_addr));
24077 	random_addr %= (max_addr - req_size - min_addr);
24078 	random_addr &= ~offmask;
24079 
24080 	alloc.min_address = min_addr + random_addr;
24081 	alloc.max_address = min_addr + random_addr + req_size;
24082 	return alloc;
24083 }
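/*
 * Worked example (hypothetical values): with offmask = 32M - 1 (16k-page
 * embedded alignment, see vm_map_range_offmask() below), req_size = 10G,
 * min_addr = 17G and max_addr = 63G, the request and bounds are first aligned
 * to 32M, a random offset is drawn in [0, 63G - 10G - 17G) and truncated down
 * to a 32M boundary, and the returned range is the aligned window
 * [min_addr + offset, min_addr + offset + 10G).
 */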
24084 
24085 static vm_map_offset_t
24086 vm_map_range_offmask(void)
24087 {
24088 	uint32_t pte_depth;
24089 
24090 	/*
24091 	 * PTE optimizations
24092 	 *
24093 	 *
24094 	 * 16k pages systems
24095 	 * ~~~~~~~~~~~~~~~~~
24096 	 *
24097 	 * A single L1 (sub-)page covers the address space.
24098 	 * - L2 pages cover 64G,
24099 	 * - L3 pages cover 32M.
24100 	 *
24101 	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
24102 	 * As a result, we really only need to align the ranges to 32M to avoid
24103 	 * partial L3 pages.
24104 	 *
24105 	 * On macOS, the usage of L2 pages will increase, so we will want to
24106 	 * align ranges to 64G in order to utilize them fully.
24107 	 *
24108 	 *
24109 	 * 4k pages systems
24110 	 * ~~~~~~~~~~~~~~~~
24111 	 *
24112 	 * A single L0 (sub-)page covers the address space.
24113 	 * - L1 pages cover 512G,
24114 	 * - L2 pages cover 1G,
24115 	 * - L3 pages cover 2M.
24116 	 *
24117 	 * The long tail of processes on a system will tend to have a VA usage
24118 	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
24119 	 * This is achievable with a single L1 and a few L2s without
24120 	 * randomization.
24121 	 *
24122 	 * However once randomization is introduced, the system will immediately
24123 	 * need several L1s and many more L2s. As a result:
24124 	 *
24125 	 * - on embedded devices, the cost of these extra pages isn't
24126 	 *   sustainable, and we just disable the feature entirely,
24127 	 *
24128 	 * - on macOS we align ranges to a 512G boundary so that the extra L1
24129 	 *   pages can be used to their full potential.
24130 	 */
24131 
24132 	/*
24133 	 * note, this function assumes _non exotic mappings_
24134 	 * which is why it uses the native kernel's PAGE_SHIFT.
24135 	 */
24136 #if XNU_PLATFORM_MacOSX
24137 	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
24138 #else /* !XNU_PLATFORM_MacOSX */
24139 	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
24140 #endif /* !XNU_PLATFORM_MacOSX */
24141 
24142 	if (pte_depth == 0) {
24143 		return 0;
24144 	}
24145 
24146 	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
24147 }
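/*
 * Concrete values of the expression above (native kernel page size):
 *   16k pages (PAGE_SHIFT = 14): pte_depth 1 -> (1 << 25) - 1 = 32M - 1
 *                                pte_depth 2 -> (1 << 36) - 1 = 64G - 1
 *   4k pages  (PAGE_SHIFT = 12): pte_depth 3 -> (1 << 39) - 1 = 512G - 1
 * matching the L3/L2/L1 coverage described in the comment above; embedded
 * 4k-page configurations return 0 and user ranges stay disabled.
 */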
24148 
24149 /*
24150  * vm_map_range_configure:
24151  *	configures the user vm_map ranges by increasing the maximum VA range of
24152  *  the map and carving out a range at the end of VA space (searching backwards
24153  *  in the newly expanded map).
24154  */
24155 kern_return_t
24156 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
24157 {
24158 	const vm_map_offset_t offmask = vm_map_range_offmask();
24159 	struct mach_vm_range data_range;
24160 	vm_map_offset_t default_end;
24161 	kern_return_t kr;
24162 
24163 	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
24164 		/*
24165 		 * No point doing vm ranges in a 32bit address space.
24166 		 */
24167 		return KERN_NOT_SUPPORTED;
24168 	}
24169 
24170 	/* Should not be applying ranges to kernel map or kernel map submaps */
24171 	assert(vm_map_pmap(map) != kernel_pmap);
24172 
24173 #if XNU_PLATFORM_MacOSX
24174 
24175 	/*
24176 	 * on macOS, the address space is a massive 47 bits (128T),
24177 	 * with several carve outs that processes can't use:
24178 	 * - the shared region
24179 	 * - the commpage region
24180 	 * - the GPU carve out (if applicable)
24181 	 *
24182 	 * and when nano-malloc is in use it desires memory at the 96T mark.
24183 	 *
24184 	 * However, their location is architecture dependent:
24185 	 * - On intel, the shared region and commpage are
24186 	 *   at the very end of the usable address space (above +127T),
24187 	 *   and there is no GPU carve out, and pthread wants to place
24188 	 *   threads at the 112T mark (0x70T).
24189 	 *
24190 	 * - On arm64, these are in the same spot as on embedded devices:
24191 	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
24192 	 *   o commpage region: [63G,  64G)
24193 	 *   o GPU carve out:   [64G, 448G)
24194 	 *
24195 	 * This is convenient because the mappings at the end of the address
24196 	 * space (when they exist) are made by the kernel.
24197 	 *
24198 	 * The policy is to allocate a random 1T range for the data heap
24199 	 * at the end of the address space, in the:
24200 	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
24201 	 * - [0x61, 0x7f) range on Apple silicon (to leave space for Nano malloc).
24202 	 */
24203 
24204 	/* see NANOZONE_SIGNATURE in libmalloc */
24205 #if __x86_64__
24206 	default_end = 0x71ull << 40;
24207 #else
24208 	default_end = 0x61ull << 40;
24209 #endif
24210 	data_range  = vm_map_range_random_uniform(1ull << 40,
24211 	        default_end, 0x7full << 40, offmask);
24212 
24213 #else /* !XNU_PLATFORM_MacOSX */
24214 
24215 	/*
24216 	 * Embedded devices:
24217 	 *
24218 	 *   The default VA Size scales with the device physical memory.
24219 	 *
24220 	 *   Out of that:
24221 	 *   - the "zero" page typically uses 4G + some slide
24222 	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
24223 	 *
24224 	 *   Without the use of jumbo or any adjustment to the address space,
24225 	 *   a default VM map typically looks like this:
24226 	 *
24227 	 *       0G -->╒════════════╕
24228 	 *             │  pagezero  │
24229 	 *             │  + slide   │
24230 	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
24231 	 *             │            │
24232 	 *       6G -->├────────────┤
24233 	 *             │   shared   │
24234 	 *             │   region   │
24235 	 *      10G -->├────────────┤
24236 	 *             │            │
24237 	 *   max_va -->├────────────┤<-- vm_map_max(map)
24238 	 *             │            │
24239 	 *             ╎   jumbo    ╎
24240 	 *             ╎            ╎
24241 	 *             │            │
24242 	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24243 	 *             │  commpage  │
24244 	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24245 	 *             │            │
24246 	 *             ╎    GPU     ╎
24247 	 *             ╎  carveout  ╎
24248 	 *             │            │
24249 	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24250 	 *             │            │
24251 	 *             ╎            ╎
24252 	 *             ╎            ╎
24253 	 *             │            │
24254 	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24255 	 *
24256 	 *   When this drawing was made, "max_va" was smaller than
24257 	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24258 	 *   12G of address space for the zero-page, slide, files,
24259 	 *   binaries, heap ...
24260 	 *
24261 	 *   We will want to make a "heap/data" carve out inside
24262 	 *   the jumbo range of half of that usable space, assuming
24263 	 *   that this is less than a fourth of the jumbo range.
24264 	 *
24265 	 *   The assert below intends to catch when max_va grows
24266 	 *   too large for this heuristic.
24267 	 */
24268 
24269 	vm_map_lock_read(map);
24270 	default_end = vm_map_max(map);
24271 	vm_map_unlock_read(map);
24272 
24273 	/*
24274 	 * Check that we're not already jumbo'd,
24275 	 * or our address space was somehow modified.
24276 	 *
24277 	 * If so we cannot guarantee that we can set up the ranges
24278 	 * safely without interfering with the existing map.
24279 	 */
24280 	if (default_end > vm_compute_max_offset(true)) {
24281 		return KERN_NO_SPACE;
24282 	}
24283 
24284 	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24285 		/*
24286 		 * an override boot-arg was set, disable user-ranges
24287 		 *
24288 		 * XXX: this is problematic because it means these boot-args
24289 		 *      no longer test the behavior changing the value
24290 		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
24291 		 */
24292 		return KERN_NOT_SUPPORTED;
24293 	}
24294 
24295 	/* expand the default VM space to 64GB */
24296 	vm_map_set_jumbo(map);
24297 
24298 	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24299 	data_range = vm_map_range_random_uniform(GiB(10),
24300 	    default_end + PAGE_SIZE, vm_map_max(map), offmask);
24301 
24302 #endif /* !XNU_PLATFORM_MacOSX */
24303 
24304 	/*
24305 	 * Poke holes so that ASAN or people listing regions
24306 	 * do not think this space is free.
24307 	 */
24308 
24309 	if (default_end != data_range.min_address) {
24310 		kr = vm_map_enter(map, &default_end,
24311 		    data_range.min_address - default_end,
24312 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24313 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24314 		assert(kr == KERN_SUCCESS);
24315 	}
24316 
24317 	if (data_range.max_address != vm_map_max(map)) {
24318 		vm_map_entry_t entry;
24319 		vm_size_t size;
24320 
24321 		/*
24322 		 * Extend the end of the hole to the next VM entry or the end of the map,
24323 		 * whichever comes first.
24324 		 */
24325 		vm_map_lock_read(map);
24326 		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24327 		if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24328 			size = vm_map_max(map) - data_range.max_address;
24329 		} else {
24330 			size = entry->vme_start - data_range.max_address;
24331 		}
24332 		vm_map_unlock_read(map);
24333 
24334 		kr = vm_map_enter(map, &data_range.max_address, size,
24335 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24336 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24337 		assert(kr == KERN_SUCCESS);
24338 	}
24339 
24340 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24341 	if (needs_extra_jumbo_va) {
24342 		/* This will grow the address space to MACH_VM_MAX_ADDRESS */
24343 		vm_map_set_extra_jumbo(map);
24344 	}
24345 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24346 
24347 	vm_map_lock(map);
24348 	map->default_range.min_address = vm_map_min(map);
24349 	map->default_range.max_address = default_end;
24350 	map->data_range = data_range;
24351 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24352 	/* If process has "extra jumbo" entitlement, enable large file range */
24353 	if (needs_extra_jumbo_va) {
24354 		map->large_file_range = vm_map_range_random_uniform(TiB(1),
24355 		    MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24356 	}
24357 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24358 	map->uses_user_ranges = true;
24359 	vm_map_unlock(map);
24360 
24361 	return KERN_SUCCESS;
24362 }
24363 
24364 /*
24365  * vm_map_range_fork:
24366  *	clones the array of ranges from old_map to new_map in support
24367  *	of a VM map fork.
24368  */
24369 void
24370 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24371 {
24372 	if (!old_map->uses_user_ranges) {
24373 		/* nothing to do */
24374 		return;
24375 	}
24376 
24377 	new_map->default_range = old_map->default_range;
24378 	new_map->data_range = old_map->data_range;
24379 
24380 	if (old_map->extra_ranges_count) {
24381 		vm_map_user_range_t otable, ntable;
24382 		uint16_t count;
24383 
24384 		otable = old_map->extra_ranges;
24385 		count  = old_map->extra_ranges_count;
24386 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24387 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
24388 		memcpy(ntable, otable,
24389 		    count * sizeof(struct vm_map_user_range));
24390 
24391 		new_map->extra_ranges_count = count;
24392 		new_map->extra_ranges = ntable;
24393 	}
24394 
24395 	new_map->uses_user_ranges = true;
24396 }
24397 
24398 /*
24399  * vm_map_get_user_range:
24400  *	copy the VM user range for the given VM map and range ID.
24401  */
24402 kern_return_t
24403 vm_map_get_user_range(
24404 	vm_map_t                map,
24405 	vm_map_range_id_t       range_id,
24406 	mach_vm_range_t         range)
24407 {
24408 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
24409 		return KERN_INVALID_ARGUMENT;
24410 	}
24411 
24412 	switch (range_id) {
24413 	case UMEM_RANGE_ID_DEFAULT:
24414 		*range = map->default_range;
24415 		return KERN_SUCCESS;
24416 
24417 	case UMEM_RANGE_ID_HEAP:
24418 		*range = map->data_range;
24419 		return KERN_SUCCESS;
24420 
24421 	case UMEM_RANGE_ID_LARGE_FILE:
24422 		/*
24423 		 * Because this function tells a user-space process about the user
24424 		 * ranges in its VM map, this case communicates whether the large file
24425 		 * range is in use. Note that this is different from how the large file
24426 		 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24427 		 * VA policy and return either the large file range or data range,
24428 		 * depending on whether the large file range is enabled.
24429 		 */
24430 		if (map->large_file_range.min_address != map->large_file_range.max_address) {
24431 			/* large file range is configured and should be used */
24432 			*range = map->large_file_range;
24433 		} else {
24434 			return KERN_INVALID_ARGUMENT;
24435 		}
24436 		return KERN_SUCCESS;
24437 
24438 	default:
24439 		return KERN_INVALID_ARGUMENT;
24440 	}
24441 }
24442 
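/*
 * Usage sketch (illustrative only; the actual call sites live elsewhere):
 * a caller can copy out the heap range and must handle the
 * KERN_INVALID_ARGUMENT case when the map does not use user ranges:
 *
 *	struct mach_vm_range r;
 *	if (vm_map_get_user_range(map, UMEM_RANGE_ID_HEAP, &r) == KERN_SUCCESS) {
 *		// [r.min_address, r.max_address) bounds the data/heap range
 *	}
 */
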
24443 static vm_map_range_id_t
24444 vm_map_user_range_resolve(
24445 	vm_map_t                map,
24446 	mach_vm_address_t       addr,
24447 	mach_vm_size_t          size,
24448 	mach_vm_range_t         range)
24449 {
24450 	struct mach_vm_range tmp;
24451 
24452 	vm_map_lock_assert_held(map);
24453 
24454 	static_assert((int)UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24455 	static_assert((int)UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24456 
24457 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
24458 		if (range) {
24459 			*range = map->default_range;
24460 		}
24461 		return UMEM_RANGE_ID_DEFAULT;
24462 	}
24463 
24464 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
24465 		if (range) {
24466 			*range = map->data_range;
24467 		}
24468 		return UMEM_RANGE_ID_HEAP;
24469 	}
24470 
24471 	if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24472 		if (range) {
24473 			*range = map->large_file_range;
24474 		}
24475 		return UMEM_RANGE_ID_LARGE_FILE;
24476 	}
24477 
24478 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
24479 		vm_map_user_range_t r = &map->extra_ranges[i];
24480 
24481 		tmp.min_address = r->vmur_min_address;
24482 		tmp.max_address = r->vmur_max_address;
24483 
24484 		if (mach_vm_range_contains(&tmp, addr, size)) {
24485 			if (range) {
24486 				*range = tmp;
24487 			}
24488 			return r->vmur_range_id;
24489 		}
24490 	}
24491 
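	/*
	 * Address not covered by any configured range: report the default
	 * range ID and a zeroed range so the caller can distinguish a
	 * miss from a hit on the default range.
	 */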
24492 	if (range) {
24493 		range->min_address = range->max_address = 0;
24494 	}
24495 	return UMEM_RANGE_ID_DEFAULT;
24496 }
24497 #endif /* CONFIG_MAP_RANGES */
24498 
24499 void
24500 vm_map_kernel_flags_update_range_id(
24501 	vm_map_kernel_flags_t *vmkf,
24502 	vm_map_t map,
24503 	__unused vm_map_size_t size)
24504 {
24505 	if (map == kernel_map) {
24506 		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24507 			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24508 		}
24509 #if CONFIG_MAP_RANGES
24510 	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24511 	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
24512 		if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
24513 		    || size >= VM_LARGE_FILE_THRESHOLD) {
24514 			/*
24515 			 * if the map doesn't have the large file range configured,
24516 			 * the range will get resolved to the heap range in `vm_map_get_range`
24517 			 */
24518 			vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
24519 		} else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24520 			vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24521 		}
24522 #endif /* CONFIG_MAP_RANGES */
24523 	}
24524 }
24525 
24526 /*
24527  * vm_map_entry_has_device_pager:
24528  * Check if the vm map entry specified by the virtual address has a device pager.
24529  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24530  */
24531 boolean_t
24532 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24533 {
24534 	vm_map_entry_t entry;
24535 	vm_object_t object;
24536 	boolean_t result;
24537 
24538 	if (map == NULL) {
24539 		return FALSE;
24540 	}
24541 
24542 	vm_map_lock(map);
24543 	while (TRUE) {
24544 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24545 			result = FALSE;
24546 			break;
24547 		}
24548 		if (entry->is_sub_map) {
24549 			// Check the submap
24550 			vm_map_t submap = VME_SUBMAP(entry);
24551 			assert(submap != NULL);
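			/*
			 * Hand-over-hand locking: take the submap's lock
			 * before dropping the parent's so the entry we
			 * translated through cannot change underneath us.
			 */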
24552 			vm_map_lock(submap);
24553 			vm_map_unlock(map);
24554 			map = submap;
24555 			continue;
24556 		}
24557 		object = VME_OBJECT(entry);
24558 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24559 			result = TRUE;
24560 			break;
24561 		}
24562 		result = FALSE;
24563 		break;
24564 	}
24565 
24566 	vm_map_unlock(map);
24567 	return result;
24568 }
24569 
24570 #if MACH_ASSERT
24571 
24572 extern int pmap_ledgers_panic;
24573 extern int pmap_ledgers_panic_leeway;
24574 
24575 #define LEDGER_DRIFT(__LEDGER)                    \
24576 	int             __LEDGER##_over;          \
24577 	ledger_amount_t __LEDGER##_over_total;    \
24578 	ledger_amount_t __LEDGER##_over_max;      \
24579 	int             __LEDGER##_under;         \
24580 	ledger_amount_t __LEDGER##_under_total;   \
24581 	ledger_amount_t __LEDGER##_under_max
24582 
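/*
 * For example, LEDGER_DRIFT(internal) declares the six fields
 * internal_over, internal_over_total, internal_over_max,
 * internal_under, internal_under_total and internal_under_max.
 */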
24583 struct {
24584 	uint64_t        num_pmaps_checked;
24585 
24586 	LEDGER_DRIFT(phys_footprint);
24587 	LEDGER_DRIFT(internal);
24588 	LEDGER_DRIFT(internal_compressed);
24589 	LEDGER_DRIFT(external);
24590 	LEDGER_DRIFT(reusable);
24591 	LEDGER_DRIFT(iokit_mapped);
24592 	LEDGER_DRIFT(alternate_accounting);
24593 	LEDGER_DRIFT(alternate_accounting_compressed);
24594 	LEDGER_DRIFT(page_table);
24595 	LEDGER_DRIFT(purgeable_volatile);
24596 	LEDGER_DRIFT(purgeable_nonvolatile);
24597 	LEDGER_DRIFT(purgeable_volatile_compressed);
24598 	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24599 	LEDGER_DRIFT(tagged_nofootprint);
24600 	LEDGER_DRIFT(tagged_footprint);
24601 	LEDGER_DRIFT(tagged_nofootprint_compressed);
24602 	LEDGER_DRIFT(tagged_footprint_compressed);
24603 	LEDGER_DRIFT(network_volatile);
24604 	LEDGER_DRIFT(network_nonvolatile);
24605 	LEDGER_DRIFT(network_volatile_compressed);
24606 	LEDGER_DRIFT(network_nonvolatile_compressed);
24607 	LEDGER_DRIFT(media_nofootprint);
24608 	LEDGER_DRIFT(media_footprint);
24609 	LEDGER_DRIFT(media_nofootprint_compressed);
24610 	LEDGER_DRIFT(media_footprint_compressed);
24611 	LEDGER_DRIFT(graphics_nofootprint);
24612 	LEDGER_DRIFT(graphics_footprint);
24613 	LEDGER_DRIFT(graphics_nofootprint_compressed);
24614 	LEDGER_DRIFT(graphics_footprint_compressed);
24615 	LEDGER_DRIFT(neural_nofootprint);
24616 	LEDGER_DRIFT(neural_footprint);
24617 	LEDGER_DRIFT(neural_nofootprint_compressed);
24618 	LEDGER_DRIFT(neural_footprint_compressed);
24619 	LEDGER_DRIFT(neural_nofootprint_total);
24620 } pmap_ledgers_drift;
24621 
24622 void
24623 vm_map_pmap_check_ledgers(
24624 	pmap_t          pmap,
24625 	ledger_t        ledger,
24626 	int             pid,
24627 	char            *procname)
24628 {
24629 	ledger_amount_t bal;
24630 	boolean_t       do_panic;
24631 
24632 	do_panic = FALSE;
24633 
24634 	pmap_ledgers_drift.num_pmaps_checked++;
24635 
24636 #define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
24637 MACRO_BEGIN                                                             \
24638 	int panic_on_negative = TRUE;                                   \
24639 	ledger_get_balance(ledger,                                      \
24640 	                   task_ledgers.__LEDGER,                       \
24641 	                   &bal);                                       \
24642 	ledger_get_panic_on_negative(ledger,                            \
24643 	                             task_ledgers.__LEDGER,             \
24644 	                             &panic_on_negative);               \
24645 	if (bal != 0) {                                                 \
24646 	        if (panic_on_negative ||                                \
24647 	            (pmap_ledgers_panic &&                              \
24648 	             pmap_ledgers_panic_leeway > 0 &&                   \
24649 	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
24650 	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24651 	                do_panic = TRUE;                                \
24652 	        }                                                       \
24653 	        printf("LEDGER BALANCE proc %d (%s) "                   \
24654 	               "\"%s\" = %lld\n",                               \
24655 	               pid, procname, #__LEDGER, bal);                  \
24656 	        if (bal > 0) {                                          \
24657 	                pmap_ledgers_drift.__LEDGER##_over++;           \
24658 	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24659 	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24660 	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24661 	                }                                               \
24662 	        } else if (bal < 0) {                                   \
24663 	                pmap_ledgers_drift.__LEDGER##_under++;          \
24664 	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24665 	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24666 	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24667 	                }                                               \
24668 	        }                                                       \
24669 	}                                                               \
24670 MACRO_END
24671 
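	/*
	 * Each invocation below checks one task ledger: any nonzero
	 * balance is logged and folded into pmap_ledgers_drift, and
	 * do_panic is raised when the ledger is marked panic-on-negative
	 * or when pmap_ledgers_panic is set and the imbalance exceeds
	 * pmap_ledgers_panic_leeway pages.
	 */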
24672 	LEDGER_CHECK_BALANCE(phys_footprint);
24673 	LEDGER_CHECK_BALANCE(internal);
24674 	LEDGER_CHECK_BALANCE(internal_compressed);
24675 	LEDGER_CHECK_BALANCE(external);
24676 	LEDGER_CHECK_BALANCE(reusable);
24677 	LEDGER_CHECK_BALANCE(iokit_mapped);
24678 	LEDGER_CHECK_BALANCE(alternate_accounting);
24679 	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24680 	LEDGER_CHECK_BALANCE(page_table);
24681 	LEDGER_CHECK_BALANCE(purgeable_volatile);
24682 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24683 	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24684 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24685 	LEDGER_CHECK_BALANCE(tagged_nofootprint);
24686 	LEDGER_CHECK_BALANCE(tagged_footprint);
24687 	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24688 	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24689 	LEDGER_CHECK_BALANCE(network_volatile);
24690 	LEDGER_CHECK_BALANCE(network_nonvolatile);
24691 	LEDGER_CHECK_BALANCE(network_volatile_compressed);
24692 	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24693 	LEDGER_CHECK_BALANCE(media_nofootprint);
24694 	LEDGER_CHECK_BALANCE(media_footprint);
24695 	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24696 	LEDGER_CHECK_BALANCE(media_footprint_compressed);
24697 	LEDGER_CHECK_BALANCE(graphics_nofootprint);
24698 	LEDGER_CHECK_BALANCE(graphics_footprint);
24699 	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24700 	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24701 	LEDGER_CHECK_BALANCE(neural_nofootprint);
24702 	LEDGER_CHECK_BALANCE(neural_footprint);
24703 	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24704 	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24705 	LEDGER_CHECK_BALANCE(neural_nofootprint_total);
24706 
24707 	if (do_panic) {
24708 		if (pmap_ledgers_panic) {
24709 			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24710 			    pmap, pid, procname);
24711 		} else {
24712 			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24713 			    pmap, pid, procname);
24714 		}
24715 	}
24716 }
24717 
24718 void
24719 vm_map_pmap_set_process(
24720 	vm_map_t map,
24721 	int pid,
24722 	char *procname)
24723 {
24724 	pmap_set_process(vm_map_pmap(map), pid, procname);
24725 }
24726 
24727 #endif /* MACH_ASSERT */
24728 
24729 /**
24730  * Check if a given map operation size is valid for the given map, taking
24731  * into account whether or not the map operation has overridden the soft limit.
24732  *
24733  * This function is meant to be inlined wherever possible as it can, in some
24734  * modes, generate telemetry events which capture shallow backtraces. To
24735  * maximize the usefulness of this backtrace, we want to minimize the depth at
24736  * which the backtrace is taken.
24737  */
24738 __attribute__((always_inline))
24739 bool
24740 vm_map_is_map_size_valid(
24741 	vm_map_t target_map,
24742 	vm_size_t size,
24743 	bool no_soft_limit)
24744 {
24745 #ifdef __x86_64__
24746 	// Do not enforce any additional limits on x64
24747 	(void)target_map;
24748 	(void)size;
24749 	(void)no_soft_limit;
24750 	return true;
24751 #else
24752 	if (__probable(target_map->pmap != kernel_pmap ||
24753 	    size < VM_KERNEL_SIMPLE_MAX_SIZE || no_soft_limit)) {
24754 		// Allocation size matches policy
24755 		return true;
24756 	}
24757 
24758 	switch (vm_map_kernel_alloc_limit_mode) {
24759 	default:
24760 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_BYPASS:
24761 		return true;
24762 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_TRAP:
24763 		trap_telemetry_report_kernel_soft_error(
24764 			TRAP_TELEMETRY_KERNEL_SOFT_ERROR_VM_KERNEL_MAX_ALLOC_SIZE,
24765 			/* report_once_per_site */ false);
24766 		return true;
24767 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_REJECT:
24768 		return false;
24769 	case VM_MAP_KERNEL_ALLOC_LIMIT_MODE_PANIC:
24770 		panic("1,000,000K ought to be enough for anybody "
24771 		    "(requested %lu bytes)", size);
24772 	}
24773 #endif /* __x86_64__ */
24774 }
24775 
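/*
 * Usage sketch (illustrative, hypothetical call site): allocation paths
 * are expected to check the size before carving the map, e.g.
 *
 *	if (!vm_map_is_map_size_valid(target_map, size, no_soft_limit)) {
 *		return KERN_NO_SPACE;   // hypothetical error choice
 *	}
 *
 * where, depending on vm_map_kernel_alloc_limit_mode, the check bypasses,
 * reports telemetry, rejects, or panics.
 */
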
24776 vm_map_serial_t
24777 vm_map_maybe_serial_id(vm_map_t maybe_vm_map)
24778 {
24779 	return maybe_vm_map != NULL ? maybe_vm_map->serial_id : VM_MAP_SERIAL_NONE;
24780 }
24781