xref: /xnu-11215.61.5/osfmk/vm/vm_map.c (revision 4f1223e81cd707a65cc109d0b8ad6653699da3c4)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 
91 #include <vm/cpm_internal.h>
92 #include <vm/memory_types.h>
93 #include <vm/vm_compressor_xnu.h>
94 #include <vm/vm_compressor_pager_internal.h>
95 #include <vm/vm_init_xnu.h>
96 #include <vm/vm_fault_internal.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object_internal.h>
99 #include <vm/vm_page_internal.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern_internal.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106 
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_memtag.h>
110 #include <vm/vm_protos_internal.h>
111 #include <vm/vm_purgeable_internal.h>
112 
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_shared_region_internal.h>
115 #include <vm/vm_map_store_internal.h>
116 #include <vm/vm_memory_entry_xnu.h>
117 #include <vm/memory_object_internal.h>
118 #include <vm/vm_memory_entry.h>
119 #include <vm/vm_sanitize_internal.h>
120 #if DEVELOPMENT || DEBUG
121 #include <vm/vm_compressor_info.h>
122 #endif /* DEVELOPMENT || DEBUG */
123 #include <san/kasan.h>
124 
125 #include <sys/resource.h>
126 #include <sys/random.h>
127 #include <sys/codesign.h>
128 #include <sys/code_signing.h>
129 #include <sys/mman.h>
130 #include <sys/reboot.h>
131 #include <sys/kdebug_triage.h>
132 #include <sys/reason.h>
133 
134 #include <libkern/section_keywords.h>
135 
136 #if DEVELOPMENT || DEBUG
137 extern int proc_selfcsflags(void);
138 int vm_log_xnu_user_debug = 0;
139 int panic_on_unsigned_execute = 0;
140 int panic_on_mlock_failure = 0;
141 #endif /* DEVELOPMENT || DEBUG */
142 
143 #if DEVELOPMENT || DEBUG
144 int debug4k_filter = 0;
145 char debug4k_proc_name[1024] = "";
146 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
147 int debug4k_panic_on_misaligned_sharing = 0;
148 const char *debug4k_category_name[] = {
149 	"error",        /* 0 */
150 	"life",         /* 1 */
151 	"load",         /* 2 */
152 	"fault",        /* 3 */
153 	"copy",         /* 4 */
154 	"share",        /* 5 */
155 	"adjust",       /* 6 */
156 	"pmap",         /* 7 */
157 	"mementry",     /* 8 */
158 	"iokit",        /* 9 */
159 	"upl",          /* 10 */
160 	"exc",          /* 11 */
161 	"vfs"           /* 12 */
162 };
163 #endif /* DEVELOPMENT || DEBUG */
164 int debug4k_no_cow_copyin = 0;
165 
166 
167 #if __arm64__
168 extern const int fourk_binary_compatibility_unsafe;
169 #endif /* __arm64__ */
170 extern int proc_selfpid(void);
171 extern char *proc_name_address(void *p);
172 extern const char *proc_best_name(struct proc *p);
173 
174 #if VM_MAP_DEBUG_APPLE_PROTECT
175 int vm_map_debug_apple_protect = 0;
176 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
177 #if VM_MAP_DEBUG_FOURK
178 int vm_map_debug_fourk = 0;
179 #endif /* VM_MAP_DEBUG_FOURK */
180 
181 #if DEBUG || DEVELOPMENT
182 static TUNABLE(bool, vm_map_executable_immutable,
183     "vm_map_executable_immutable", true);
184 #else
185 #define vm_map_executable_immutable true
186 #endif
187 
188 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
189 
190 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
191 /* Internal prototypes
192  */
193 
194 typedef struct vm_map_zap {
195 	vm_map_entry_t          vmz_head;
196 	vm_map_entry_t         *vmz_tail;
197 } *vm_map_zap_t;
198 
199 #define VM_MAP_ZAP_DECLARE(zap) \
200 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
201 
202 extern kern_return_t vm_map_wire_external(
203 	vm_map_t                map,
204 	vm_map_offset_ut        start_u,
205 	vm_map_offset_ut        end_u,
206 	vm_prot_ut              prot_u,
207 	boolean_t               user_wire) __exported;
208 
209 #if XNU_PLATFORM_MacOSX
210 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
211 #else
212 static
213 #endif
214 kern_return_t vm_map_copyin_common(
215 	vm_map_t                src_map,
216 	vm_map_address_ut       src_addr,
217 	vm_map_size_ut          len,
218 	boolean_t               src_destroy,
219 	boolean_t               src_volatile,
220 	vm_map_copy_t          *copy_result,                           /* OUT */
221 	boolean_t               use_maxprot);
222 
223 static vm_map_entry_t   vm_map_entry_insert(
224 	vm_map_t                map,
225 	vm_map_entry_t          insp_entry,
226 	vm_map_offset_t         start,
227 	vm_map_offset_t         end,
228 	vm_object_t             object,
229 	vm_object_offset_t      offset,
230 	vm_map_kernel_flags_t   vmk_flags,
231 	boolean_t               needs_copy,
232 	vm_prot_t               cur_protection,
233 	vm_prot_t               max_protection,
234 	vm_inherit_t            inheritance,
235 	boolean_t               clear_map_aligned);
236 
237 static void vm_map_simplify_range(
238 	vm_map_t        map,
239 	vm_map_offset_t start,
240 	vm_map_offset_t end);   /* forward */
241 
242 static boolean_t        vm_map_range_check(
243 	vm_map_t        map,
244 	vm_map_offset_t start,
245 	vm_map_offset_t end,
246 	vm_map_entry_t  *entry);
247 
248 static void vm_map_submap_pmap_clean(
249 	vm_map_t        map,
250 	vm_map_offset_t start,
251 	vm_map_offset_t end,
252 	vm_map_t        sub_map,
253 	vm_map_offset_t offset);
254 
255 static void             vm_map_pmap_enter(
256 	vm_map_t                map,
257 	vm_map_offset_t         addr,
258 	vm_map_offset_t         end_addr,
259 	vm_object_t             object,
260 	vm_object_offset_t      offset,
261 	vm_prot_t               protection);
262 
263 static void             _vm_map_clip_end(
264 	struct vm_map_header    *map_header,
265 	vm_map_entry_t          entry,
266 	vm_map_offset_t         end);
267 
268 static void             _vm_map_clip_start(
269 	struct vm_map_header    *map_header,
270 	vm_map_entry_t          entry,
271 	vm_map_offset_t         start);
272 
273 static kmem_return_t vm_map_delete(
274 	vm_map_t        map,
275 	vm_map_offset_t start,
276 	vm_map_offset_t end,
277 	vmr_flags_t     flags,
278 	kmem_guard_t    guard,
279 	vm_map_zap_t    zap);
280 
281 static void             vm_map_copy_insert(
282 	vm_map_t        map,
283 	vm_map_entry_t  after_where,
284 	vm_map_copy_t   copy);
285 
286 static kern_return_t    vm_map_copy_overwrite_unaligned(
287 	vm_map_t        dst_map,
288 	vm_map_entry_t  entry,
289 	vm_map_copy_t   copy,
290 	vm_map_address_t start,
291 	boolean_t       discard_on_success);
292 
293 static kern_return_t    vm_map_copy_overwrite_aligned(
294 	vm_map_t        dst_map,
295 	vm_map_entry_t  tmp_entry,
296 	vm_map_copy_t   copy,
297 	vm_map_offset_t start,
298 	pmap_t          pmap);
299 
300 static kern_return_t    vm_map_copyin_kernel_buffer(
301 	vm_map_t        src_map,
302 	vm_map_address_t src_addr,
303 	vm_map_size_t   len,
304 	boolean_t       src_destroy,
305 	vm_map_copy_t   *copy_result);  /* OUT */
306 
307 static kern_return_t    vm_map_copyout_kernel_buffer(
308 	vm_map_t        map,
309 	vm_map_address_t *addr, /* IN/OUT */
310 	vm_map_copy_t   copy,
311 	vm_map_size_t   copy_size,
312 	boolean_t       overwrite,
313 	boolean_t       consume_on_success);
314 
315 static void             vm_map_fork_share(
316 	vm_map_t        old_map,
317 	vm_map_entry_t  old_entry,
318 	vm_map_t        new_map);
319 
320 static boolean_t        vm_map_fork_copy(
321 	vm_map_t        old_map,
322 	vm_map_entry_t  *old_entry_p,
323 	vm_map_t        new_map,
324 	int             vm_map_copyin_flags);
325 
326 static kern_return_t    vm_map_wire_nested(
327 	vm_map_t                   map,
328 	vm_map_offset_t            start,
329 	vm_map_offset_t            end,
330 	vm_prot_t                  caller_prot,
331 	vm_tag_t                   tag,
332 	boolean_t                  user_wire,
333 	pmap_t                     map_pmap,
334 	vm_map_offset_t            pmap_addr,
335 	ppnum_t                   *physpage_p);
336 
337 static kern_return_t    vm_map_unwire_nested(
338 	vm_map_t                   map,
339 	vm_map_offset_t            start,
340 	vm_map_offset_t            end,
341 	boolean_t                  user_wire,
342 	pmap_t                     map_pmap,
343 	vm_map_offset_t            pmap_addr);
344 
345 static kern_return_t    vm_map_overwrite_submap_recurse(
346 	vm_map_t                   dst_map,
347 	vm_map_offset_t            dst_addr,
348 	vm_map_size_t              dst_size);
349 
350 static kern_return_t    vm_map_copy_overwrite_nested(
351 	vm_map_t                   dst_map,
352 	vm_map_offset_t            dst_addr,
353 	vm_map_copy_t              copy,
354 	boolean_t                  interruptible,
355 	pmap_t                     pmap,
356 	boolean_t                  discard_on_success);
357 
358 static kern_return_t    vm_map_remap_extract(
359 	vm_map_t                map,
360 	vm_map_offset_t         addr,
361 	vm_map_size_t           size,
362 	boolean_t               copy,
363 	vm_map_copy_t           map_copy,
364 	vm_prot_t               *cur_protection,
365 	vm_prot_t               *max_protection,
366 	vm_inherit_t            inheritance,
367 	vm_map_kernel_flags_t   vmk_flags);
368 
369 static void             vm_map_region_look_for_page(
370 	vm_map_t                   map,
371 	vm_map_offset_t            va,
372 	vm_object_t                object,
373 	vm_object_offset_t         offset,
374 	int                        max_refcnt,
375 	unsigned short             depth,
376 	vm_region_extended_info_t  extended,
377 	mach_msg_type_number_t count);
378 
379 static boolean_t        vm_map_region_has_obj_ref(
380 	vm_map_entry_t             entry,
381 	vm_object_t                object);
382 
383 
384 static kern_return_t    vm_map_willneed(
385 	vm_map_t        map,
386 	vm_map_offset_t start,
387 	vm_map_offset_t end);
388 
389 static kern_return_t    vm_map_reuse_pages(
390 	vm_map_t        map,
391 	vm_map_offset_t start,
392 	vm_map_offset_t end);
393 
394 static kern_return_t    vm_map_reusable_pages(
395 	vm_map_t        map,
396 	vm_map_offset_t start,
397 	vm_map_offset_t end);
398 
399 static kern_return_t    vm_map_can_reuse(
400 	vm_map_t        map,
401 	vm_map_offset_t start,
402 	vm_map_offset_t end);
403 
404 static kern_return_t    vm_map_zero(
405 	vm_map_t        map,
406 	vm_map_offset_t start,
407 	vm_map_offset_t end);
408 
409 static kern_return_t    vm_map_random_address_for_size(
410 	vm_map_t                map,
411 	vm_map_offset_t        *address,
412 	vm_map_size_t           size,
413 	vm_map_kernel_flags_t   vmk_flags);
414 
415 
416 #if CONFIG_MAP_RANGES
417 
418 static vm_map_range_id_t vm_map_user_range_resolve(
419 	vm_map_t                map,
420 	mach_vm_address_t       addr,
421 	mach_vm_address_t       size,
422 	mach_vm_range_t         range);
423 
424 #endif /* CONFIG_MAP_RANGES */
425 #if MACH_ASSERT
426 static kern_return_t    vm_map_pageout(
427 	vm_map_t        map,
428 	vm_map_offset_t start,
429 	vm_map_offset_t end);
430 #endif /* MACH_ASSERT */
431 
432 kern_return_t vm_map_corpse_footprint_collect(
433 	vm_map_t        old_map,
434 	vm_map_entry_t  old_entry,
435 	vm_map_t        new_map);
436 void vm_map_corpse_footprint_collect_done(
437 	vm_map_t        new_map);
438 void vm_map_corpse_footprint_destroy(
439 	vm_map_t        map);
440 kern_return_t vm_map_corpse_footprint_query_page_info(
441 	vm_map_t        map,
442 	vm_map_offset_t va,
443 	int             *disposition_p);
444 void vm_map_footprint_query_page_info(
445 	vm_map_t        map,
446 	vm_map_entry_t  map_entry,
447 	vm_map_offset_t curr_s_offset,
448 	int             *disposition_p);
449 
450 #if CONFIG_MAP_RANGES
451 static void vm_map_range_map_init(void);
452 #endif /* CONFIG_MAP_RANGES */
453 
454 pid_t find_largest_process_vm_map_entries(void);
455 
456 __attribute__((always_inline))
457 int
vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)458 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
459 {
460 	int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
461 
462 	/* in vmk flags the meaning of fixed/anywhere is inverted */
463 	return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
464 }
465 
466 __attribute__((always_inline, overloadable))
467 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags,vm_tag_t vm_tag)468 vm_map_kernel_flags_set_vmflags(
469 	vm_map_kernel_flags_t  *vmk_flags,
470 	int                     vm_flags,
471 	vm_tag_t                vm_tag)
472 {
473 	vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
474 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
475 	vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
476 	vmk_flags->vm_tag = vm_tag;
477 }
478 
479 __attribute__((always_inline, overloadable))
480 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_and_tag)481 vm_map_kernel_flags_set_vmflags(
482 	vm_map_kernel_flags_t  *vmk_flags,
483 	int                     vm_flags_and_tag)
484 {
485 	vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
486 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
487 	vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
488 	VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
489 }
490 
491 __attribute__((always_inline))
492 void
vm_map_kernel_flags_and_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_mask)493 vm_map_kernel_flags_and_vmflags(
494 	vm_map_kernel_flags_t  *vmk_flags,
495 	int                     vm_flags_mask)
496 {
497 	/* this function doesn't handle the inverted FIXED/ANYWHERE */
498 	assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
499 	vmk_flags->__vm_flags &= vm_flags_mask;
500 }
501 
502 __attribute__((always_inline))
503 bool
vm_map_kernel_flags_check_vm_and_kflags(vm_map_kernel_flags_t vmk_flags,int vm_flags_mask)504 vm_map_kernel_flags_check_vm_and_kflags(
505 	vm_map_kernel_flags_t   vmk_flags,
506 	int                     vm_flags_mask)
507 {
508 	return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
509 }
510 
/*
 * vm_map_kernel_flags_check_vmflags:
 *
 * Returns true when the VM_FLAGS_* bits carried in "vmk_flags" are a
 * subset of "vm_flags_mask".  On DEBUG/DEVELOPMENT kernels, this also
 * verifies (at essentially no runtime cost) that each vmf_* bitfield in
 * vm_map_kernel_flags_t lines up exactly with its public VM_FLAGS_*
 * counterpart, and that the vm_tag field is wide enough for both the
 * user and kernel tag ranges.
 */
bool
vm_map_kernel_flags_check_vmflags(
	vm_map_kernel_flags_t   vmk_flags,
	int                     vm_flags_mask)
{
	int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;

	/* Note: up to 16 still has good calling conventions */
	static_assert(sizeof(vm_map_kernel_flags_t) == 8);

#if DEBUG || DEVELOPMENT
	/*
	 * All of this compiles to nothing if all checks pass.
	 */
	/*
	 * check(field, value): set only the VM_FLAGS_* "value" bits, clear
	 * the named bitfield, and assert nothing is left over — proving
	 * the bitfield occupies exactly those bits.
	 */
#define check(field, value)  ({ \
	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
	fl.__vm_flags = (value); \
	fl.field = 0; \
	assert(fl.__vm_flags == 0); \
})

	/* bits 0-7 */
	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
	check(vmf_purgeable, VM_FLAGS_PURGABLE);
	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
	check(vmf_permanent, VM_FLAGS_PERMANENT);

	/* bits 8-15 */
	check(vmf_tpro, VM_FLAGS_TPRO);
	check(vmf_overwrite, VM_FLAGS_OVERWRITE);

	/* bits 16-23 */
	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);

	{
		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;

		/* check user tags will never clip */
		fl.vm_tag = VM_MEMORY_COUNT - 1;
		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);

		/* check kernel tags will never clip */
		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
	}


#undef check
#endif /* DEBUG || DEVELOPMENT */

	return (vmflags & ~vm_flags_mask) == 0;
}
569 
570 /*
571  * Macros to copy a vm_map_entry. We must be careful to correctly
572  * manage the wired page count. vm_map_entry_copy() creates a new
573  * map entry to the same memory - the wired count in the new entry
574  * must be set to zero. vm_map_entry_copy_full() creates a new
575  * entry that is identical to the old entry.  This preserves the
576  * wire count; it's used for map splitting and zone changing in
577  * vm_map_copyout.
578  */
579 
/*
 * vm_map_entry_copy_csm_assoc:
 *
 * Fix up the code-signing-monitor association state of a freshly
 * copied map entry.  The CSM association belongs to the original
 * mapping only, so it is reset (or sanity-checked absent) on copy.
 */
static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	/* optional tracing of copies of user-debug entries (vm_log_xnu_user_debug) */
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
#if XNU_TARGET_OS_OSX
	/*
	 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
	 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
	 * triggering CSM assertions when the child accesses its mapping.
	 */
#else /* XNU_TARGET_OS_OSX */
	new->vme_xnu_user_debug = FALSE;
#endif /* XNU_TARGET_OS_OSX */
}
614 
615 /*
616  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
617  * But for security reasons on some platforms, we don't want the
618  * new mapping to be "used for jit", so we reset the flag here.
619  */
620 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)621 vm_map_entry_copy_code_signing(
622 	vm_map_t map,
623 	vm_map_entry_t new,
624 	vm_map_entry_t old __unused)
625 {
626 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
627 		assert(new->used_for_jit == old->used_for_jit);
628 	} else {
629 		if (old->used_for_jit) {
630 			DTRACE_VM3(cs_wx,
631 			    uint64_t, new->vme_start,
632 			    uint64_t, new->vme_end,
633 			    vm_prot_t, new->protection);
634 			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
635 			    proc_selfpid(),
636 			    (get_bsdtask_info(current_task())
637 			    ? proc_name_address(get_bsdtask_info(current_task()))
638 			    : "?"),
639 			    __FUNCTION__,
640 			    "removing execute access");
641 			new->protection &= ~VM_PROT_EXECUTE;
642 			new->max_protection &= ~VM_PROT_EXECUTE;
643 		}
644 		new->used_for_jit = FALSE;
645 	}
646 }
647 
/*
 * vm_map_entry_copy_full:
 *
 * Bitwise-copy "old" into "new", preserving the wired count (see the
 * block comment above).  The debug backtrace references and btlog tag
 * reference are reference-counted, so the copy must release the refs
 * about to be overwritten in "new" and take an extra ref on the ones
 * being duplicated from "old" — before the struct assignment below.
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
#if VM_BTLOG_TAGS
	/* Discard the btref that might be in the new entry */
	if (new->vme_kernel_object) {
		btref_put(new->vme_tag_btref);
	}
	/* Retain the btref in the old entry to account for its copy */
	if (old->vme_kernel_object) {
		btref_retain(old->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
	*new = *old;
}
673 
674 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)675 vm_map_entry_copy(
676 	vm_map_t map,
677 	vm_map_entry_t new,
678 	vm_map_entry_t old)
679 {
680 	vm_map_entry_copy_full(new, old);
681 
682 	new->is_shared = FALSE;
683 	new->needs_wakeup = FALSE;
684 	new->in_transition = FALSE;
685 	new->wired_count = 0;
686 	new->user_wired_count = 0;
687 	new->vme_permanent = FALSE;
688 	vm_map_entry_copy_code_signing(map, new, old);
689 	vm_map_entry_copy_csm_assoc(map, new, old);
690 	if (new->iokit_acct) {
691 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
692 		new->iokit_acct = FALSE;
693 		new->use_pmap = TRUE;
694 	}
695 	new->vme_resilient_codesign = FALSE;
696 	new->vme_resilient_media = FALSE;
697 	new->vme_atomic = FALSE;
698 	new->vme_no_copy_on_read = FALSE;
699 }
700 
701 /*
702  * Normal lock_read_to_write() returns FALSE/0 on failure.
703  * These functions evaluate to zero on success and non-zero value on failure.
704  */
705 __attribute__((always_inline))
706 int
vm_map_lock_read_to_write(vm_map_t map)707 vm_map_lock_read_to_write(vm_map_t map)
708 {
709 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
710 		DTRACE_VM(vm_map_lock_upgrade);
711 		return 0;
712 	}
713 	return 1;
714 }
715 
716 __attribute__((always_inline))
717 boolean_t
vm_map_try_lock(vm_map_t map)718 vm_map_try_lock(vm_map_t map)
719 {
720 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
721 		DTRACE_VM(vm_map_lock_w);
722 		return TRUE;
723 	}
724 	return FALSE;
725 }
726 
727 __attribute__((always_inline))
728 boolean_t
vm_map_try_lock_read(vm_map_t map)729 vm_map_try_lock_read(vm_map_t map)
730 {
731 	if (lck_rw_try_lock_shared(&(map)->lock)) {
732 		DTRACE_VM(vm_map_lock_r);
733 		return TRUE;
734 	}
735 	return FALSE;
736 }
737 
738 /*!
739  * @function kdp_vm_map_is_acquired_exclusive
740  *
741  * @abstract
742  * Checks if vm map is acquired exclusive.
743  *
744  * @discussion
745  * NOT SAFE: To be used only by kernel debugger.
746  *
747  * @param map map to check
748  *
749  * @returns TRUE if the map is acquired exclusively.
750  */
751 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)752 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
753 {
754 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
755 }
756 
757 /*
758  * Routines to get the page size the caller should
759  * use while inspecting the target address space.
760  * Use the "_safely" variant if the caller is dealing with a user-provided
761  * array whose size depends on the page size, to avoid any overflow or
762  * underflow of a user-allocated buffer.
763  */
764 int
vm_self_region_page_shift_safely(vm_map_t target_map)765 vm_self_region_page_shift_safely(
766 	vm_map_t target_map)
767 {
768 	int effective_page_shift = 0;
769 
770 	if (PAGE_SIZE == (4096)) {
771 		/* x86_64 and 4k watches: always use 4k */
772 		return PAGE_SHIFT;
773 	}
774 	/* did caller provide an explicit page size for this thread to use? */
775 	effective_page_shift = thread_self_region_page_shift();
776 	if (effective_page_shift) {
777 		/* use the explicitly-provided page size */
778 		return effective_page_shift;
779 	}
780 	/* no explicit page size: use the caller's page size... */
781 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
782 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
783 		/* page size match: safe to use */
784 		return effective_page_shift;
785 	}
786 	/* page size mismatch */
787 	return -1;
788 }
789 int
vm_self_region_page_shift(vm_map_t target_map)790 vm_self_region_page_shift(
791 	vm_map_t target_map)
792 {
793 	int effective_page_shift;
794 
795 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
796 	if (effective_page_shift == -1) {
797 		/* no safe value but OK to guess for caller */
798 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
799 		    VM_MAP_PAGE_SHIFT(target_map));
800 	}
801 	return effective_page_shift;
802 }
803 
804 
805 /*
806  *	Decide if we want to allow processes to execute from their data or stack areas.
807  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
808  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
809  *	or allow_stack_exec to enable data execution for that type of data area for that particular
810  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
811  *	specific pmap files since the default behavior varies according to architecture.  The
812  *	main reason it varies is because of the need to provide binary compatibility with old
813  *	applications that were written before these restrictions came into being.  In the old
814  *	days, an app could execute anything it could read, but this has slowly been tightened
815  *	up over time.  The default behavior is:
816  *
817  *	32-bit PPC apps		may execute from both stack and data areas
818  *	32-bit Intel apps	may execute from data areas but not stack
819  *	64-bit PPC/Intel apps	may not execute from either data or stack
820  *
821  *	An application on any architecture may override these defaults by explicitly
822  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
823  *	system call.  This code here just determines what happens when an app tries to
824  *      execute from a page that lacks execute permission.
825  *
826  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
827  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
828  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
829  *	execution from data areas for a particular binary even if the arch normally permits it. As
830  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
831  *	to support some complicated use cases, notably browsers with out-of-process plugins that
832  *	are not all NX-safe.
833  */
834 
835 extern int allow_data_exec, allow_stack_exec;
836 
837 int
override_nx(vm_map_t map,uint32_t user_tag)838 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
839 {
840 	int current_abi;
841 
842 	if (map->pmap == kernel_pmap) {
843 		return FALSE;
844 	}
845 
846 	/*
847 	 * Determine if the app is running in 32 or 64 bit mode.
848 	 */
849 
850 	if (vm_map_is_64bit(map)) {
851 		current_abi = VM_ABI_64;
852 	} else {
853 		current_abi = VM_ABI_32;
854 	}
855 
856 	/*
857 	 * Determine if we should allow the execution based on whether it's a
858 	 * stack or data area and the current architecture.
859 	 */
860 
861 	if (user_tag == VM_MEMORY_STACK) {
862 		return allow_stack_exec & current_abi;
863 	}
864 
865 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
866 }
867 
868 
869 /*
870  *	Virtual memory maps provide for the mapping, protection,
871  *	and sharing of virtual memory objects.  In addition,
872  *	this module provides for an efficient virtual copy of
873  *	memory from one map to another.
874  *
875  *	Synchronization is required prior to most operations.
876  *
877  *	Maps consist of an ordered doubly-linked list of simple
878  *	entries; a single hint is used to speed up lookups.
879  *
880  *	Sharing maps have been deleted from this version of Mach.
881  *	All shared objects are now mapped directly into the respective
882  *	maps.  This requires a change in the copy on write strategy;
883  *	the asymmetric (delayed) strategy is used for shared temporary
884  *	objects instead of the symmetric (shadow) strategy.  All maps
885  *	are now "top level" maps (either task map, kernel map or submap
886  *	of the kernel map).
887  *
888  *	Since portions of maps are specified by start/end addresses,
889  *	which may not align with existing map entries, all
890  *	routines merely "clip" entries to these start/end values.
891  *	[That is, an entry is split into two, bordering at a
892  *	start or end value.]  Note that these clippings may not
893  *	always be necessary (as the two resulting entries are then
894  *	not changed); however, the clipping is done for convenience.
895  *	No attempt is currently made to "glue back together" two
896  *	abutting entries.
897  *
898  *	The symmetric (shadow) copy strategy implements virtual copy
899  *	by copying VM object references from one map to
900  *	another, and then marking both regions as copy-on-write.
901  *	It is important to note that only one writeable reference
902  *	to a VM object region exists in any map when this strategy
903  *	is used -- this means that shadow object creation can be
904  *	delayed until a write operation occurs.  The symmetric (delayed)
905  *	strategy allows multiple maps to have writeable references to
906  *	the same region of a vm object, and hence cannot delay creating
907  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
908  *	Copying of permanent objects is completely different; see
909  *	vm_object_copy_strategically() in vm_object.c.
910  */
911 
/* Registers the zone id used for all vm_map_copy allocations. */
ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);

/*
 * Names and creation flags of the three zones built in vm_map_init():
 * maps, map entries and map holes.  All are unencrypted VM-owned zones.
 */
#define VM_MAP_ZONE_NAME        "maps"
#define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
#define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)

#define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
#define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
922 
923 /*
924  * Asserts that a vm_map_copy object is coming from the
925  * vm_map_copy_zone to ensure that it isn't a fake constructed
926  * anywhere else.
927  */
928 void
vm_map_copy_require(struct vm_map_copy * copy)929 vm_map_copy_require(struct vm_map_copy *copy)
930 {
931 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
932 }
933 
934 /*
935  *	vm_map_require:
936  *
937  *	Ensures that the argument is memory allocated from the genuine
938  *	vm map zone. (See zone_id_require_allow_foreign).
939  */
940 void
vm_map_require(vm_map_t map)941 vm_map_require(vm_map_t map)
942 {
943 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
944 }
945 
/*
 * Maximum number of VM maps that may be created before the zone
 * subsystem is fully up (see vm_map_will_allocate_early_map()).
 */
#define VM_MAP_EARLY_COUNT_MAX         16
/* Boot-time memory stolen in vm_map_steal_memory() to seed the VM zones. */
static __startup_data vm_offset_t      map_data;
static __startup_data vm_size_t        map_data_size;
static __startup_data vm_offset_t      kentry_data;
static __startup_data vm_size_t        kentry_data_size;
static __startup_data vm_offset_t      map_holes_data;
static __startup_data vm_size_t        map_holes_data_size;
/*
 * Locations holding pointers to early-boot maps; rewritten by
 * vm_map_relocate_early_maps() once zones move to their final place.
 */
static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
static __startup_data uint32_t         early_map_count;

#if XNU_TARGET_OS_OSX
#define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
#else /* XNU_TARGET_OS_OSX */
#define         NO_COALESCE_LIMIT  0
#endif /* XNU_TARGET_OS_OSX */

/* Skip acquiring locks if we're in the midst of a kernel core dump */
unsigned int not_in_kdp = 1;

/* Count of successful vm_map_set_cache_attr() calls (statistics only). */
unsigned int vm_map_set_cache_attr_count = 0;
966 
967 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)968 vm_map_set_cache_attr(
969 	vm_map_t        map,
970 	vm_map_offset_t va)
971 {
972 	vm_map_entry_t  map_entry;
973 	vm_object_t     object;
974 	kern_return_t   kr = KERN_SUCCESS;
975 
976 	vm_map_lock_read(map);
977 
978 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
979 	    map_entry->is_sub_map) {
980 		/*
981 		 * that memory is not properly mapped
982 		 */
983 		kr = KERN_INVALID_ARGUMENT;
984 		goto done;
985 	}
986 	object = VME_OBJECT(map_entry);
987 
988 	if (object == VM_OBJECT_NULL) {
989 		/*
990 		 * there should be a VM object here at this point
991 		 */
992 		kr = KERN_INVALID_ARGUMENT;
993 		goto done;
994 	}
995 	vm_object_lock(object);
996 	object->set_cache_attr = TRUE;
997 	vm_object_unlock(object);
998 
999 	vm_map_set_cache_attr_count++;
1000 done:
1001 	vm_map_unlock_read(map);
1002 
1003 	return kr;
1004 }
1005 
1006 
#if CONFIG_CODE_DECRYPTION
/*
 * vm_map_apple_protected:
 * This remaps the requested part of the object with an object backed by
 * the decrypting pager.
 * crypt_info contains entry points and session data for the crypt module.
 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
 *
 * Parameters:
 *	map			target map
 *	start, end		range to remap (may be sub-page-aligned)
 *	crypto_backing_offset	offset used to select the decryption key;
 *				(vm_object_offset_t)-1 means "use the entry's
 *				own offset"
 *	cryptid			CRYPTID_MODEL_ENCRYPTION relaxes the
 *				"must be executable" requirement
 *
 * Returns KERN_SUCCESS, or KERN_INVALID_ADDRESS / KERN_INVALID_ARGUMENT /
 * KERN_FAILURE on lookup, alignment or pager-setup failures.
 */
kern_return_t
vm_map_apple_protected(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_offset_t      crypto_backing_offset,
	struct pager_crypt_info *crypt_info,
	uint32_t                cryptid)
{
	boolean_t       map_locked;
	kern_return_t   kr;
	vm_map_entry_t  map_entry;
	struct vm_map_entry tmp_entry;
	memory_object_t unprotected_mem_obj;
	vm_object_t     protected_object;
	vm_map_offset_t map_addr;
	vm_map_offset_t start_aligned, end_aligned;
	vm_object_offset_t      crypto_start, crypto_end;
	boolean_t       cache_pager;

	map_locked = FALSE;
	unprotected_mem_obj = MEMORY_OBJECT_NULL;

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}
	/* align first to the system page size, then to the map's page size */
	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));

#if __arm64__
	/*
	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
	 * so we might have to loop and establish up to 3 mappings:
	 *
	 * + the first 16K-page, which might overlap with the previous
	 *   4K-aligned mapping,
	 * + the center,
	 * + the last 16K-page, which might overlap with the next
	 *   4K-aligned mapping.
	 * Each of these mapping might be backed by a vnode pager (if
	 * properly page-aligned) or a "fourk_pager", itself backed by a
	 * vnode pager (if 4K-aligned but not page-aligned).
	 */
#endif /* __arm64__ */

	/* one iteration per map entry overlapping [start_aligned, end) */
	map_addr = start_aligned;
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* ensure mapped memory is mapped as executable except
		 *  for the model decryption flow */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/* there should be a VM object here at this point */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		/* snapshot the entry: it becomes invalid once we unlock */
		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			if (tmp_entry.vme_start != start_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			if (tmp_entry.vme_end != end_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key.  It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
		 */
		unprotected_mem_obj = apple_protect_pager_setup(
			protected_object,
			VME_OFFSET(&tmp_entry),
			crypto_backing_offset,
			crypt_info,
			crypto_start,
			crypto_end,
			cache_pager);

		/* release extra ref on protected object */
		vm_object_deallocate(protected_object);

		if (unprotected_mem_obj == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/* can overwrite an immutable mapping */
		vm_map_kernel_flags_t vmk_flags = {
			.vmf_fixed = true,
			.vmf_overwrite = true,
			.vmkf_overwrite_immutable = true,
		};
		/* make the new mapping as "permanent" as the one it replaces */
		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;

		/* map this memory object in place of the current one */
		map_addr = tmp_entry.vme_start;
		kr = mach_vm_map_kernel(map,
		    vm_sanitize_wrap_addr_ref(&map_addr),
		    (tmp_entry.vme_end -
		    tmp_entry.vme_start),
		    (mach_vm_offset_t) 0,
		    vmk_flags,
		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
		    0,
		    TRUE,
		    tmp_entry.protection,
		    tmp_entry.max_protection,
		    tmp_entry.inheritance);
		assertf(kr == KERN_SUCCESS,
		    "kr = 0x%x\n", kr);
		assertf(map_addr == tmp_entry.vme_start,
		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
		    (uint64_t)map_addr,
		    (uint64_t) tmp_entry.vme_start,
		    &tmp_entry);

#if VM_MAP_DEBUG_APPLE_PROTECT
		if (vm_map_debug_apple_protect) {
			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
			    " backing:[object:%p,offset:0x%llx,"
			    "crypto_backing_offset:0x%llx,"
			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
			    map,
			    (uint64_t) map_addr,
			    (uint64_t) (map_addr + (tmp_entry.vme_end -
			    tmp_entry.vme_start)),
			    unprotected_mem_obj,
			    protected_object,
			    VME_OFFSET(&tmp_entry),
			    crypto_backing_offset,
			    crypto_start,
			    crypto_end);
		}
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

		/*
		 * Release the reference obtained by
		 * apple_protect_pager_setup().
		 * The mapping (if it succeeded) is now holding a reference on
		 * the memory object.
		 */
		memory_object_deallocate(unprotected_mem_obj);
		unprotected_mem_obj = MEMORY_OBJECT_NULL;

		/* continue with next map entry */
		crypto_backing_offset += (tmp_entry.vme_end -
		    tmp_entry.vme_start);
		crypto_backing_offset -= crypto_start;
	}
	kr = KERN_SUCCESS;

done:
	if (map_locked) {
		vm_map_unlock(map);
	}
	return kr;
}
#endif  /* CONFIG_CODE_DECRYPTION */
1244 
1245 
/* Lock group and attributes shared by all VM map locks. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

#if XNU_TARGET_OS_OSX
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
#else /* XNU_TARGET_OS_OSX */
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
#endif /* XNU_TARGET_OS_OSX */
/* Boot-args controlling whether malloc memory is kept out of copy-on-write. */
TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
/* Bitmask of VM_MEMORY_MALLOC* tags excluded from COW; built in vm_map_init(). */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
int vm_check_map_sanity = 0;
#endif
1263 
1264 /*
1265  *	vm_map_init:
1266  *
1267  *	Initialize the vm_map module.  Must be called before
1268  *	any other vm_map routines.
1269  *
1270  *	Map and entry structures are allocated from zones -- we must
1271  *	initialize those zones.
1272  *
1273  *	There are three zones of interest:
1274  *
1275  *	vm_map_zone:		used to allocate maps.
1276  *	vm_map_entry_zone:	used to allocate map entries.
1277  *
1278  *	LP32:
1279  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1280  *
1281  *	The kernel allocates map entries from a special zone that is initially
1282  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1283  *	the kernel to allocate more memory to a entry zone when it became
1284  *	empty since the very act of allocating memory implies the creation
1285  *	of a new entry.
1286  */
1287 __startup_func
1288 void
vm_map_init(void)1289 vm_map_init(void)
1290 {
1291 
1292 #if MACH_ASSERT
1293 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1294 	    sizeof(debug4k_filter));
1295 #endif /* MACH_ASSERT */
1296 
1297 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1298 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1299 
1300 	/*
1301 	 * Don't quarantine because we always need elements available
1302 	 * Disallow GC on this zone... to aid the GC.
1303 	 */
1304 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1305 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1306 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1307 		z->z_elems_rsv = (uint16_t)(32 *
1308 		(ml_early_cpu_max_number() + 1));
1309 	});
1310 
1311 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1312 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1313 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1314 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1315 	});
1316 
1317 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1318 	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1319 
1320 	/*
1321 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1322 	 */
1323 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1324 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1325 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1326 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1327 	    zone_count_free(vm_map_zone),
1328 	    zone_count_free(vm_map_entry_zone),
1329 	    zone_count_free(vm_map_holes_zone));
1330 
1331 	/*
1332 	 * Since these are covered by zones, remove them from stolen page accounting.
1333 	 */
1334 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1335 
1336 #if VM_MAP_DEBUG_APPLE_PROTECT
1337 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1338 	    &vm_map_debug_apple_protect,
1339 	    sizeof(vm_map_debug_apple_protect));
1340 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1341 #if VM_MAP_DEBUG_APPLE_FOURK
1342 	PE_parse_boot_argn("vm_map_debug_fourk",
1343 	    &vm_map_debug_fourk,
1344 	    sizeof(vm_map_debug_fourk));
1345 #endif /* VM_MAP_DEBUG_FOURK */
1346 
1347 	if (malloc_no_cow) {
1348 		vm_memory_malloc_no_cow_mask = 0ULL;
1349 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1350 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1351 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1352 #if XNU_TARGET_OS_OSX
1353 		/*
1354 		 * On macOS, keep copy-on-write for MALLOC_LARGE because
1355 		 * realloc() may use vm_copy() to transfer the old contents
1356 		 * to the new location.
1357 		 */
1358 #else /* XNU_TARGET_OS_OSX */
1359 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1360 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1361 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1362 #endif /* XNU_TARGET_OS_OSX */
1363 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1364 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1365 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1366 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1367 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1368 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1369 		    &vm_memory_malloc_no_cow_mask,
1370 		    sizeof(vm_memory_malloc_no_cow_mask));
1371 	}
1372 
1373 #if CONFIG_MAP_RANGES
1374 	vm_map_range_map_init();
1375 #endif /* CONFIG_MAP_RANGES */
1376 
1377 #if DEBUG
1378 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1379 	if (vm_check_map_sanity) {
1380 		kprintf("VM sanity checking enabled\n");
1381 	} else {
1382 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1383 	}
1384 #endif /* DEBUG */
1385 
1386 #if DEVELOPMENT || DEBUG
1387 	PE_parse_boot_argn("panic_on_unsigned_execute",
1388 	    &panic_on_unsigned_execute,
1389 	    sizeof(panic_on_unsigned_execute));
1390 	PE_parse_boot_argn("panic_on_mlock_failure",
1391 	    &panic_on_mlock_failure,
1392 	    sizeof(panic_on_mlock_failure));
1393 #endif /* DEVELOPMENT || DEBUG */
1394 }
1395 
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	kentry_data    = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1445 
1446 __startup_func
1447 static void
vm_kernel_boostraped(void)1448 vm_kernel_boostraped(void)
1449 {
1450 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1451 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1452 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1453 
1454 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1455 	    zone_count_free(vm_map_zone),
1456 	    zone_count_free(vm_map_entry_zone),
1457 	    zone_count_free(vm_map_holes_zone));
1458 }
1459 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1460 
/*
 * vm_map_disable_hole_optimization:
 *
 * Tear down the map's hole list (a circular list of vm_map_links
 * tracking free ranges) and fall back to first-free tracking.
 */
void
vm_map_disable_hole_optimization(vm_map_t map)
{
	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;

	if (map->holelistenabled) {
		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);

		/* the list is circular: walk until we wrap back to the head */
		while (hole_entry != NULL) {
			next_hole_entry = hole_entry->vme_next;

			/* unlink before freeing */
			hole_entry->vme_next = NULL;
			hole_entry->vme_prev = NULL;
			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);

			if (next_hole_entry == head_entry) {
				hole_entry = NULL;
			} else {
				hole_entry = next_hole_entry;
			}
		}

		map->holes_list = NULL;
		map->holelistenabled = FALSE;

		/* revert to the non-hole-list free-space tracking scheme */
		map->first_free = vm_map_first_entry(map);
		SAVE_HINT_HOLE_WRITE(map, NULL);
	}
}
1490 
1491 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1492 vm_kernel_map_is_kernel(vm_map_t map)
1493 {
1494 	return map->pmap == kernel_pmap;
1495 }
1496 
1497 /*
1498  *	vm_map_create:
1499  *
1500  *	Creates and returns a new empty VM map with
1501  *	the given physical map structure, and having
1502  *	the given lower and upper address bounds.
1503  */
1504 
1505 extern vm_map_t vm_map_create_external(
1506 	pmap_t                  pmap,
1507 	vm_map_offset_t         min_off,
1508 	vm_map_offset_t         max_off,
1509 	boolean_t               pageable);
1510 
1511 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1512 vm_map_create_external(
1513 	pmap_t                  pmap,
1514 	vm_map_offset_t         min,
1515 	vm_map_offset_t         max,
1516 	boolean_t               pageable)
1517 {
1518 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1519 
1520 	if (pageable) {
1521 		options |= VM_MAP_CREATE_PAGEABLE;
1522 	}
1523 	return vm_map_create_options(pmap, min, max, options);
1524 }
1525 
1526 __startup_func
1527 void
vm_map_will_allocate_early_map(vm_map_t * owner)1528 vm_map_will_allocate_early_map(vm_map_t *owner)
1529 {
1530 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1531 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1532 	}
1533 
1534 	early_map_owners[early_map_count++] = owner;
1535 }
1536 
1537 __startup_func
1538 void
vm_map_relocate_early_maps(vm_offset_t delta)1539 vm_map_relocate_early_maps(vm_offset_t delta)
1540 {
1541 	for (uint32_t i = 0; i < early_map_count; i++) {
1542 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1543 
1544 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1545 	}
1546 
1547 	early_map_count = ~0u;
1548 }
1549 
/*
 *	Routine:	vm_map_relocate_early_elem
 *
 *	Purpose:
 *		Early zone elements are allocated in a temporary part
 *		of the address space.
 *
 *		Once the zones live in their final place, the early
 *		VM maps, map entries and map holes need to be relocated.
 *
 *		It involves rewriting any vm_map_t, vm_map_entry_t or
 *		pointers to vm_map_links. Other pointers to other types
 *		are fine.
 *
 *		Fortunately, pointers to those types are self-contained
 *		in those zones, _except_ for pointers to VM maps,
 *		which are tracked during early boot and fixed with
 *		vm_map_relocate_early_maps().
 */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t                zone_id,
	vm_offset_t             new_addr,
	vm_offset_t             delta)
{
	/* Slide the pointer stored in "field" by "delta", skipping NULLs. */
#define relocate(type_t, field)  ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
	if (*__field) {                                                        \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
	}                                                                      \
})

	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* entries and holes both start with a vm_map_links: fix prev/next */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1627 
/*
 *	vm_map_create_options:
 *
 *	Allocate and initialize an empty VM map covering [min, max)
 *	over "pmap".  The returned map has one reference.
 */
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/* sanity-check early-boot map allocations against the registry */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		/* seed the hole list with a single hole spanning the map */
		struct vm_map_links *hole_entry;

		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
		/*
		 * Holes can be used to track ranges all the way up to
		 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
		 */
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
		result->holes_list = result->hole_hint = hole_entry;
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1694 
/*
 * Adjusts a submap that was made by kmem_suballoc()
 * before it knew where it would be mapped,
 * so that it has the right min/max offsets.
 *
 * We do not need to hold any locks:
 * only the caller knows about this map,
 * and it is not published on any entry yet.
 */
static void
vm_map_adjust_offsets(
	vm_map_t                map,
	vm_map_offset_t         min_off,
	vm_map_offset_t         max_off)
{
	/* the map must still be the pristine [0, size) suballoc placeholder */
	assert(map->min_offset == 0);
	assert(map->max_offset == max_off - min_off);
	assert(map->hdr.nentries == 0);
	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);

	map->min_offset = min_off;
	map->max_offset = max_off;

	if (map->holelistenabled) {
		/* slide the single initial hole to the new range */
		struct vm_map_links *hole = map->holes_list;

		hole->start = min_off;
#if defined(__arm64__)
		hole->end = max_off;
#else
		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
	}
}
1729 
1730 
1731 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1732 vm_map_adjusted_size(vm_map_t map)
1733 {
1734 	const struct vm_reserved_region *regions = NULL;
1735 	size_t num_regions = 0;
1736 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1737 
1738 	if (map == NULL || (map->size == 0)) {
1739 		return 0;
1740 	}
1741 
1742 	map_size = map->size;
1743 
1744 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1745 		/*
1746 		 * No special reserved regions or not an exotic map or the task
1747 		 * is terminating and these special regions might have already
1748 		 * been deallocated.
1749 		 */
1750 		return map_size;
1751 	}
1752 
1753 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1754 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1755 
1756 	while (num_regions) {
1757 		reserved_size += regions[--num_regions].vmrr_size;
1758 	}
1759 
1760 	/*
1761 	 * There are a few places where the map is being switched out due to
1762 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1763 	 * In those cases, we could have the map's regions being deallocated on
1764 	 * a core while some accounting process is trying to get the map's size.
1765 	 * So this assert can't be enabled till all those places are uniform in
1766 	 * their use of the 'map->terminated' bit.
1767 	 *
1768 	 * assert(map_size >= reserved_size);
1769 	 */
1770 
1771 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1772 }
1773 
/*
 *	vm_map_entry_create:	[ internal use only ]
 *
 *	Allocates a VM map entry for insertion in the
 *	given map (or map copy).  No fields are filled.
 *
 *	The VM entry will be zero initialized, except for:
 *	- behavior set to VM_BEHAVIOR_DEFAULT
 *	- inheritance set to VM_INHERIT_DEFAULT
 */
#define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)

#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)

static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header    *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	/* Z_ZERO guarantees the zero-initialization documented above */
	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

#if MAP_ENTRY_CREATION_DEBUG
	/* record who created this entry, for debugging only */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1820 
1821 /*
1822  *	vm_map_entry_dispose:	[ internal use only ]
1823  *
1824  *	Inverse of vm_map_entry_create.
1825  *
1826  *      write map lock held so no need to
1827  *	do anything special to insure correctness
1828  *      of the stores
1829  */
static void
vm_map_entry_dispose(
	vm_map_entry_t          entry)
{
#if VM_BTLOG_TAGS
	/* Kernel-object entries hold a tag backtrace reference; drop it. */
	if (entry->vme_kernel_object) {
		btref_put(entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
#if MAP_ENTRY_CREATION_DEBUG
	/* Release the creation-site backtrace taken in _vm_map_entry_create(). */
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}
1847 
/* Copy-map entries come from the same zone; disposal is identical. */
#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1850 
/*
 * Returns the first entry of the zap list,
 * or VM_MAP_ENTRY_NULL if the list is empty.
 */
static vm_map_entry_t
vm_map_zap_first_entry(
	vm_map_zap_t            list)
{
	return list->vmz_head;
}
1857 
/*
 * Returns the last entry of the zap list.
 *
 * The list must not be empty: "vmz_tail" points at the last
 * entry's "vme_next" field, and __container_of() converts that
 * field address back into the enclosing entry.
 */
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t            list)
{
	assert(vm_map_zap_first_entry(list));
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1865 
/*
 * Appends "entry" at the tail of the zap list,
 * reusing the entry's "vme_next" field as the list linkage.
 */
static void
vm_map_zap_append(
	vm_map_zap_t            list,
	vm_map_entry_t          entry)
{
	entry->vme_next = VM_MAP_ENTRY_NULL;
	*list->vmz_tail = entry;            /* link after the current tail */
	list->vmz_tail = &entry->vme_next;  /* the new entry becomes the tail */
}
1875 
1876 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1877 vm_map_zap_pop(
1878 	vm_map_zap_t            list)
1879 {
1880 	vm_map_entry_t head = list->vmz_head;
1881 
1882 	if (head != VM_MAP_ENTRY_NULL &&
1883 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1884 		list->vmz_tail = &list->vmz_head;
1885 	}
1886 
1887 	return head;
1888 }
1889 
1890 static void
vm_map_zap_dispose(vm_map_zap_t list)1891 vm_map_zap_dispose(
1892 	vm_map_zap_t            list)
1893 {
1894 	vm_map_entry_t          entry;
1895 
1896 	while ((entry = vm_map_zap_pop(list))) {
1897 		if (entry->is_sub_map) {
1898 			vm_map_deallocate(VME_SUBMAP(entry));
1899 		} else {
1900 			vm_object_deallocate(VME_OBJECT(entry));
1901 		}
1902 
1903 		vm_map_entry_dispose(entry);
1904 	}
1905 }
1906 
#if MACH_ASSERT
static boolean_t first_free_check = FALSE;

/*
 * Debug-only validation of the map's "first free" hint.
 * The store-layer check only runs when "first_free_check" is enabled;
 * otherwise the hint is assumed valid.
 */
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	if (first_free_check) {
		return first_free_is_valid_store(map);
	}
	return TRUE;
}
#endif /* MACH_ASSERT */
1920 
1921 
/* Link/unlink helpers operating on a map copy's private header. */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1927 
1928 /*
1929  *	vm_map_destroy:
1930  *
1931  *	Actually destroy a map.
1932  */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	/* mark the map as dying before tearing entries down */
	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/*
	 * Deleted entries were collected on "zap" so their object/submap
	 * references can be dropped outside the map lock.
	 */
	vm_map_zap_dispose(&zap);

	/* both deletions above must have emptied the map */
	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

#if CONFIG_MAP_RANGES
	kfree_data(map->extra_ranges,
	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
#endif

	zfree_id(ZONE_ID_VM_MAP, map);
}
1974 
1975 /*
1976  * Returns pid of the task with the largest number of VM map entries.
1977  * Used in the zone-map-exhaustion jetsam path.
1978  */
1979 pid_t
find_largest_process_vm_map_entries(void)1980 find_largest_process_vm_map_entries(void)
1981 {
1982 	pid_t victim_pid = -1;
1983 	int max_vm_map_entries = 0;
1984 	task_t task = TASK_NULL;
1985 	queue_head_t *task_list = &tasks;
1986 
1987 	lck_mtx_lock(&tasks_threads_lock);
1988 	queue_iterate(task_list, task, task_t, tasks) {
1989 		if (task == kernel_task || !task->active) {
1990 			continue;
1991 		}
1992 
1993 		vm_map_t task_map = task->map;
1994 		if (task_map != VM_MAP_NULL) {
1995 			int task_vm_map_entries = task_map->hdr.nentries;
1996 			if (task_vm_map_entries > max_vm_map_entries) {
1997 				max_vm_map_entries = task_vm_map_entries;
1998 				victim_pid = pid_from_task(task);
1999 			}
2000 		}
2001 	}
2002 	lck_mtx_unlock(&tasks_threads_lock);
2003 
2004 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2005 	return victim_pid;
2006 }
2007 
2008 
2009 /*
2010  *	vm_map_lookup_entry:	[ internal use only ]
2011  *
2012  *	Calls into the vm map store layer to find the map
2013  *	entry containing (or immediately preceding) the
2014  *	specified address in the given map; the entry is returned
2015  *	in the "entry" parameter.  The boolean
2016  *	result indicates whether the address is
2017  *	actually contained in the map.
2018  */
2019 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2020 vm_map_lookup_entry(
2021 	vm_map_t        map,
2022 	vm_map_offset_t address,
2023 	vm_map_entry_t  *entry)         /* OUT */
2024 {
2025 	bool result = false;
2026 	if (VM_KERNEL_ADDRESS(address)) {
2027 		address = VM_KERNEL_STRIP_UPTR(address);
2028 	}
2029 
2030 #if CONFIG_PROB_GZALLOC
2031 	if (map->pmap == kernel_pmap) {
2032 		assertf(!pgz_owned(address),
2033 		    "it is the responsibility of callers to unguard PGZ addresses");
2034 	}
2035 #endif /* CONFIG_PROB_GZALLOC */
2036 	result = vm_map_store_lookup_entry( map, address, entry );
2037 
2038 	return result;
2039 }
2040 
2041 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2042 vm_map_lookup_entry_or_next(
2043 	vm_map_t        map,
2044 	vm_map_offset_t address,
2045 	vm_map_entry_t  *entry)         /* OUT */
2046 {
2047 	if (vm_map_lookup_entry(map, address, entry)) {
2048 		return true;
2049 	}
2050 
2051 	*entry = (*entry)->vme_next;
2052 	return false;
2053 }
2054 
#if CONFIG_PROB_GZALLOC
/*
 * Same lookup as vm_map_lookup_entry(), but without the
 * PGZ-ownership assertion, for callers that may legitimately
 * pass PGZ-guarded addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
	vm_map_offset_t lookup_addr = address;

	if (VM_KERNEL_ADDRESS(lookup_addr)) {
		lookup_addr = VM_KERNEL_STRIP_UPTR(lookup_addr);
	}

	return vm_map_store_lookup_entry(map, lookup_addr, entry);
}
#endif /* CONFIG_PROB_GZALLOC */
2068 
2069 /*
2070  *	Routine:	vm_map_range_invalid_panic
2071  *	Purpose:
2072  *			Panic on detection of an invalid range id.
2073  */
/* Does not return (__abortlike): reports a bogus caller-supplied range id. */
__abortlike
static void
vm_map_range_invalid_panic(
	vm_map_t                map,
	vm_map_range_id_t       range_id)
{
	panic("invalid range ID (%u) for map %p", range_id, map);
}
2082 
2083 /*
2084  *	Routine:	vm_map_get_range
2085  *	Purpose:
2086  *			Adjust bounds based on security policy.
2087  */
/*
 * Resolves the effective [min, max) address range for an allocation,
 * based on the map type and vmk_flags->vmkf_range_id.
 *
 * On return:
 *  - "*address" may be zeroed (kernel_map, post-KMEM startup) to discard
 *    a hint that could fall outside the restricted range;
 *  - "*is_ptr" is set when the caller should delegate to
 *    kmem_locate_space() (kernel pointer ranges) instead of searching
 *    the returned range itself.
 */
static struct mach_vm_range
vm_map_get_range(
	vm_map_t                map,
	vm_map_address_t       *address,
	vm_map_kernel_flags_t  *vmk_flags,
	vm_map_size_t           size,
	bool                   *is_ptr)
{
	struct mach_vm_range effective_range = {};
	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;

	if (map == kernel_map) {
		effective_range = kmem_ranges[range_id];

		if (startup_phase >= STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
			/*
			 * Ensure that range_id passed in by the caller is within meaningful
			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
			 * to fail as the corresponding range is invalid. Range id larger than
			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
			 */
			if ((range_id == KMEM_RANGE_ID_NONE) ||
			    (range_id > KMEM_RANGE_ID_MAX)) {
				vm_map_range_invalid_panic(map, range_id);
			}

			/*
			 * Pointer ranges use kmem_locate_space to do allocations.
			 *
			 * Non pointer fronts look like [ Small | Large | Permanent ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range.
			 */
			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
				*is_ptr = true;
			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
		}
#if CONFIG_MAP_RANGES
	} else if (map->uses_user_ranges) {
		/* User maps with VM ranges enabled: pick the matching sub-range. */
		switch (range_id) {
		case UMEM_RANGE_ID_DEFAULT:
			effective_range = map->default_range;
			break;
		case UMEM_RANGE_ID_HEAP:
			effective_range = map->data_range;
			break;
		case UMEM_RANGE_ID_LARGE_FILE:
			if (map->large_file_range.min_address != map->large_file_range.max_address) {
				/* large file range is configured and should be used */
				effective_range = map->large_file_range;
			} else {
				/*
				 * the user asking for this user range might not have the
				 * permissions to use the large file range (i.e., it doesn't
				 * hold the correct entitlement), so we give it the data range
				 * instead
				 */
				effective_range = map->data_range;
			}
			break;
		case UMEM_RANGE_ID_FIXED:
			/*
			 * anywhere allocations with an address in "FIXED"
			 * makes no sense, leave the range empty
			 */
			break;

		default:
			vm_map_range_invalid_panic(map, range_id);
		}
#endif /* CONFIG_MAP_RANGES */
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
2183 
/*
 *	Routine:	vm_map_locate_space_anywhere
 *	Purpose:
 *		Find space (without reserving it) for an "anywhere"
 *		allocation of "size" bytes in "map", honoring the
 *		alignment "mask" and the policies in "vmk_flags".
 *
 *		The map must be locked; the wait_for_space path below
 *		briefly drops and re-takes the lock.
 *
 *		On success, "*start_inout" (hint on input) holds the chosen
 *		start address and "*entry_out" (if non-NULL) the entry found
 *		by the store layer for the located space.
 */
kern_return_t
vm_map_locate_space_anywhere(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t        *start_inout,
	vm_map_entry_t         *entry_out)
{
	struct mach_vm_range effective_range = {};
	vm_map_size_t   guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t  entry;
	bool            is_kmem_ptr_range = false;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmf_fixed);
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr);
	}

	if (vmk_flags.vmkf_guard_before) {
		/* a leading guard page is carved out of "size" and passed
		 * separately to the store layer as "guard_offset" */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/*
	 * Validate range_id from flags and get associated range
	 */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
	    &is_kmem_ptr_range);

	if (is_kmem_ptr_range) {
		/* kernel pointer ranges are allocated by the kmem layer */
		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
		           vmk_flags.vmkf_last_free, start_inout, entry_out);
	}

#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down search: start at the hint (or range max) and
		 * scan down to the range minimum */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmf_random_addr = true;
			}
		}

		if (vmk_flags.vmf_random_addr) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if __x86_64__
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* __x86_64__ */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			/*
			 * The request could fit eventually: sleep until space
			 * frees up, then retry.  The map lock is dropped while
			 * blocked, so the search restarts from scratch.
			 */
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2320 
2321 /*!
2322  * @function vm_map_locate_space_fixed()
2323  *
2324  * @brief
2325  * Locate (no reservation) a range in the specified VM map at a fixed address.
2326  *
2327  * @param map           the map to scan for memory, must be locked.
2328  * @param start         the fixed address trying to be reserved
2329  * @param size          the size of the allocation to make.
2330  * @param mask          an alignment mask the allocation must respect,
2331  * @param vmk_flags     the vm map kernel flags to influence this call.
2332  *                      vmk_flags.vmf_anywhere must not be set.
2333  * @param entry_out     the entry right before the hole.
2334  * @param zap_list      a zap list of entries to clean up after the call.
2335  *
2336  * @returns
2337  * - KERN_SUCCESS in case of success and no conflicting entry is found,
2338  *   in which case entry_out is set to the entry before the hole.
2339  *
2340  * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2341  *   in which case entry_out is set the conflicting entry,
2342  *   the callers MUST handle this error explicitly.
2343  *
2344  * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2345  *   would result in a mapping outside of the map.
2346  *
2347  * - KERN_NO_SPACE for various cases of unrecoverable failures.
2348  */
static kern_return_t
vm_map_locate_space_fixed(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t         *entry_out,
	vm_map_zap_t            zap_list)
{
	vm_map_offset_t effective_min_offset, effective_max_offset;
	vm_map_entry_t  entry;
	vm_map_offset_t end;

	assert(vmk_flags.vmf_fixed);

	effective_min_offset = map->min_offset;
	effective_max_offset = map->max_offset;

	if (vmk_flags.vmkf_beyond_max) {
		/*
		 * Allow an insertion beyond the map's max offset.
		 */
		effective_max_offset = 0x00000000FFFFF000ULL;
		if (vm_map_is_64bit(map)) {
			effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
		}
#if XNU_TARGET_OS_OSX
	} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
#endif /* XNU_TARGET_OS_OSX */
	}

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
	    !vmk_flags.vmf_overwrite &&
	    map->pmap == kernel_pmap &&
	    vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
		/*
		 * Force realloc() to switch to a new allocation,
		 * to prevent 4k-fragmented virtual ranges.
		 */
//		DEBUG4K_ERROR("no realloc in place");
		return KERN_NO_SPACE;
	}

	/*
	 *	Verify that:
	 *		the address doesn't itself violate
	 *		the mask requirement.
	 */

	if ((start & mask) != 0) {
		return KERN_NO_SPACE;
	}

#if CONFIG_MAP_RANGES
	if (map->uses_user_ranges) {
		struct mach_vm_range r;

		/* restrict the bounds to the user range containing "start" */
		vm_map_user_range_resolve(map, start, 1, &r);
		if (r.max_address == 0) {
			return KERN_INVALID_ADDRESS;
		}
		effective_min_offset = r.min_address;
		effective_max_offset = r.max_address;
	}
#endif /* CONFIG_MAP_RANGES */

	if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
	    (map == kernel_map)) {
		/* kernel_map: clamp to the kmem range that covers [start, start+size) */
		mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
		effective_min_offset = r->min_address;
		effective_max_offset = r->max_address;
	}

	/*
	 *	...	the address is within bounds
	 */

	end = start + size;

	if ((start < effective_min_offset) ||
	    (end > effective_max_offset) ||
	    (start >= end)) {
		return KERN_INVALID_ADDRESS;
	}

	if (vmk_flags.vmf_overwrite) {
		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
		kern_return_t remove_kr;

		/*
		 * Fixed mapping and "overwrite" flag: attempt to
		 * remove all existing mappings in the specified
		 * address range, saving them in our "zap_list".
		 *
		 * This avoids releasing the VM map lock in
		 * vm_map_entry_delete() and allows atomicity
		 * when we want to replace some mappings with a new one.
		 * It also allows us to restore the old VM mappings if the
		 * new mapping fails.
		 */
		remove_flags |= VM_MAP_REMOVE_NO_YIELD;

		if (vmk_flags.vmkf_overwrite_immutable) {
			/* we can overwrite immutable mappings */
			remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
		}
		if (vmk_flags.vmkf_remap_prot_copy) {
			remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
		}
		remove_kr = vm_map_delete(map, start, end, remove_flags,
		    KMEM_GUARD_NONE, zap_list).kmr_return;
		if (remove_kr) {
			/* XXX FBDP restore zap_list? */
			return remove_kr;
		}
	}

	/*
	 *	...	the starting address isn't allocated
	 */

	if (vm_map_lookup_entry(map, start, &entry)) {
		/* conflict: report the offending entry; callers must
		 * handle KERN_MEMORY_PRESENT explicitly */
		*entry_out = entry;
		return KERN_MEMORY_PRESENT;
	}

	/*
	 *	...	the next region doesn't overlap the
	 *		end point.
	 */

	if ((entry->vme_next != vm_map_to_entry(map)) &&
	    (entry->vme_next->vme_start < end)) {
		return KERN_NO_SPACE;
	}

	*entry_out = entry;
	return KERN_SUCCESS;
}
2490 
2491 /*
2492  *	Routine:	vm_map_find_space
2493  *	Purpose:
2494  *		Allocate a range in the specified virtual address map,
2495  *		returning the entry allocated for that range.
2496  *		Used by kmem_alloc, etc.
2497  *
 *		The map must NOT be locked. It will be returned locked
2499  *		on KERN_SUCCESS, unlocked on failure.
2500  *
2501  *		If an entry is allocated, the object/offset fields
2502  *		are initialized to zero.
2503  */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t          *o_entry)       /* OUT */
{
	vm_map_entry_t          new_entry, entry;
	kern_return_t           kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Allocate the new entry before taking the map lock
	 * (the zone allocation is done with Z_WAITOK and may block).
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmf_permanent) {
		new_entry->vme_permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* failure: the map is returned unlocked (see header comment) */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 *	At this point,
	 *
	 *	- new_entry's "vme_start" and "vme_end" should define
	 *	  the endpoints of the available new range,
	 *
	 *	- and "entry" should refer to the region before
	 *	  the new range,
	 *
	 *	- and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 *	Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 *	Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* success: the map is returned locked, as documented above */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2577 
/* Debug knobs read by vm_map_pmap_enter() below; both off by default. */
int vm_map_pmap_enter_print = FALSE;
int vm_map_pmap_enter_enable = FALSE;
2580 
2581 /*
2582  *	Routine:	vm_map_pmap_enter [internal only]
2583  *
2584  *	Description:
2585  *		Force pages from the specified object to be entered into
2586  *		the pmap at the specified address if they are present.
 *		As soon as a page is not found in the object, the scan ends.
2588  *
2589  *	Returns:
2590  *		Nothing.
2591  *
2592  *	In/out conditions:
2593  *		The source map should not be locked on entry.
2594  */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	uint8_t                 object_lock_type = 0;
	struct vm_object_fault_info fault_info = {};

	/* nothing to enter into if the map has no pmap */
	if (map->pmap == 0) {
		return;
	}

	/* only supported for maps using the native page size */
	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	/* walk the range one page at a time */
	while (addr < end_addr) {
		vm_page_t       m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/* stop at the first page that is absent, busy, fictitious,
		 * or otherwise not safe to map (error/restart/absent) */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		type_of_fault = DBG_CACHE_HIT_FAULT;
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                 /* change_wiring */
		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
		    &fault_info,
		    NULL,                  /* need_retry */
		    &type_of_fault,
		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/

		vm_object_unlock(object);

		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2664 
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 * Picks a random, map-page-aligned address inside the map's effective
 * range that fronts a hole large enough for "size" bytes, storing it
 * in "*address".  Gives up with KERN_NO_SPACE after
 * MAX_TRIES_TO_GET_RANDOM_ADDRESS failed attempts.
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	bool            is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	/* shrink the candidate window so addr + size always fits the range */
	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() is the only source before zalloc is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/* don't hand out addresses owned by the probabilistic guard allocator */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* candidate must be unmapped AND front a hole of at least "size" */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2738 
2739 static boolean_t
vm_memory_malloc_no_cow(int alias)2740 vm_memory_malloc_no_cow(
2741 	int alias)
2742 {
2743 	uint64_t alias_mask;
2744 
2745 	if (!malloc_no_cow) {
2746 		return FALSE;
2747 	}
2748 	if (alias > 63) {
2749 		return FALSE;
2750 	}
2751 	alias_mask = 1ULL << alias;
2752 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2753 		return TRUE;
2754 	}
2755 	return FALSE;
2756 }
2757 
/*
 * Counters related to RLIMIT_AS / RLIMIT_DATA handling in vm_map_enter();
 * NOTE(review): the increment sites are outside this view — confirm.
 */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2760 /*
2761  *	Routine:	vm_map_enter
2762  *
2763  *	Description:
2764  *		Allocate a range in the specified virtual address map.
2765  *		The resulting range will refer to memory defined by
2766  *		the given memory object and offset into that object.
2767  *
2768  *		Arguments are as defined in the vm_map call.
2769  */
2770 static unsigned int vm_map_enter_restore_successes = 0;
2771 static unsigned int vm_map_enter_restore_failures = 0;
2772 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2773 vm_map_enter(
2774 	vm_map_t                map,
2775 	vm_map_offset_t         *address,       /* IN/OUT */
2776 	vm_map_size_t           size,
2777 	vm_map_offset_t         mask,
2778 	vm_map_kernel_flags_t   vmk_flags,
2779 	vm_object_t             object,
2780 	vm_object_offset_t      offset,
2781 	boolean_t               needs_copy,
2782 	vm_prot_t               cur_protection,
2783 	vm_prot_t               max_protection,
2784 	vm_inherit_t            inheritance)
2785 {
2786 	vm_map_entry_t          entry, new_entry;
2787 	vm_map_offset_t         start, tmp_start, tmp_offset;
2788 	vm_map_offset_t         end, tmp_end;
2789 	vm_map_offset_t         tmp2_start, tmp2_end;
2790 	vm_map_offset_t         step;
2791 	kern_return_t           result = KERN_SUCCESS;
2792 	bool                    map_locked = FALSE;
2793 	bool                    pmap_empty = TRUE;
2794 	bool                    new_mapping_established = FALSE;
2795 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2796 	const bool              anywhere = !vmk_flags.vmf_fixed;
2797 	const bool              purgable = vmk_flags.vmf_purgeable;
2798 	const bool              no_cache = vmk_flags.vmf_no_cache;
2799 	const bool              is_submap = vmk_flags.vmkf_submap;
2800 	const bool              permanent = vmk_flags.vmf_permanent;
2801 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2802 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2803 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2804 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2805 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2806 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2807 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2808 	const vm_tag_t          alias = vmk_flags.vm_tag;
2809 	vm_tag_t                user_alias;
2810 	kern_return_t           kr;
2811 	bool                    clear_map_aligned = FALSE;
2812 	vm_map_size_t           chunk_size = 0;
2813 	vm_object_t             caller_object;
2814 	VM_MAP_ZAP_DECLARE(zap_old_list);
2815 	VM_MAP_ZAP_DECLARE(zap_new_list);
2816 
2817 	caller_object = object;
2818 
2819 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2820 
2821 	if (vmk_flags.vmf_4gb_chunk) {
2822 #if defined(__LP64__)
2823 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2824 #else /* __LP64__ */
2825 		chunk_size = ANON_CHUNK_SIZE;
2826 #endif /* __LP64__ */
2827 	} else {
2828 		chunk_size = ANON_CHUNK_SIZE;
2829 	}
2830 
2831 
2832 
2833 	if (superpage_size) {
2834 		if (object != VM_OBJECT_NULL) {
2835 			/* caller can't provide their own VM object */
2836 			return KERN_INVALID_ARGUMENT;
2837 		}
2838 		switch (superpage_size) {
2839 			/*
2840 			 * Note that the current implementation only supports
2841 			 * a single size for superpages, SUPERPAGE_SIZE, per
2842 			 * architecture. As soon as more sizes are supposed
2843 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2844 			 * with a lookup of the size depending on superpage_size.
2845 			 */
2846 #ifdef __x86_64__
2847 		case SUPERPAGE_SIZE_ANY:
2848 			/* handle it like 2 MB and round up to page size */
2849 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2850 			OS_FALLTHROUGH;
2851 		case SUPERPAGE_SIZE_2MB:
2852 			break;
2853 #endif
2854 		default:
2855 			return KERN_INVALID_ARGUMENT;
2856 		}
2857 		mask = SUPERPAGE_SIZE - 1;
2858 		if (size & (SUPERPAGE_SIZE - 1)) {
2859 			return KERN_INVALID_ARGUMENT;
2860 		}
2861 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2862 	}
2863 
2864 
2865 	if ((cur_protection & VM_PROT_WRITE) &&
2866 	    (cur_protection & VM_PROT_EXECUTE) &&
2867 #if XNU_TARGET_OS_OSX
2868 	    map->pmap != kernel_pmap &&
2869 	    (cs_process_global_enforcement() ||
2870 	    (vmk_flags.vmkf_cs_enforcement_override
2871 	    ? vmk_flags.vmkf_cs_enforcement
2872 	    : (vm_map_cs_enforcement(map)
2873 #if __arm64__
2874 	    || !VM_MAP_IS_EXOTIC(map)
2875 #endif /* __arm64__ */
2876 	    ))) &&
2877 #endif /* XNU_TARGET_OS_OSX */
2878 #if CODE_SIGNING_MONITOR
2879 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2880 #endif
2881 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2882 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2883 	    !entry_for_jit) {
2884 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2885 
2886 		DTRACE_VM3(cs_wx,
2887 		    uint64_t, 0,
2888 		    uint64_t, 0,
2889 		    vm_prot_t, cur_protection);
2890 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2891 		    proc_selfpid(),
2892 		    (get_bsdtask_info(current_task())
2893 		    ? proc_name_address(get_bsdtask_info(current_task()))
2894 		    : "?"),
2895 		    __FUNCTION__,
2896 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2897 		cur_protection &= ~VM_PROT_EXECUTE;
2898 		if (vm_protect_wx_fail) {
2899 			return KERN_PROTECTION_FAILURE;
2900 		}
2901 	}
2902 
2903 	if (entry_for_jit
2904 	    && cur_protection != VM_PROT_ALL) {
2905 		/*
2906 		 * Native macOS processes and all non-macOS processes are
2907 		 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2908 		 * the RWX requirement was not enforced, and thus, we must live
2909 		 * with our sins. We are now dealing with a JIT mapping without
2910 		 * RWX.
2911 		 *
2912 		 * We deal with these by letting the MAP_JIT stick in order
2913 		 * to avoid CS violations when these pages are mapped executable
2914 		 * down the line. In order to appease the page table monitor (you
2915 		 * know what I'm talking about), these pages will end up being
2916 		 * marked as XNU_USER_DEBUG, which will be allowed because we
2917 		 * don't enforce the code signing monitor on macOS systems. If
2918 		 * the user-space application ever changes permissions to RWX,
2919 		 * which they are allowed to since the mapping was originally
2920 		 * created with MAP_JIT, then they'll switch over to using the
2921 		 * XNU_USER_JIT type, and won't be allowed to downgrade any
2922 		 * more after that.
2923 		 *
2924 		 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2925 		 * strictly disallowed.
2926 		 */
2927 
2928 #if XNU_TARGET_OS_OSX
2929 		/*
2930 		 * Continue to allow non-RWX JIT
2931 		 */
2932 #else
2933 		/* non-macOS: reject JIT regions without RWX */
2934 		DTRACE_VM3(cs_wx,
2935 		    uint64_t, 0,
2936 		    uint64_t, 0,
2937 		    vm_prot_t, cur_protection);
2938 		printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2939 		    proc_selfpid(),
2940 		    (get_bsdtask_info(current_task())
2941 		    ? proc_name_address(get_bsdtask_info(current_task()))
2942 		    : "?"),
2943 		    __FUNCTION__,
2944 		    cur_protection);
2945 		return KERN_PROTECTION_FAILURE;
2946 #endif
2947 	}
2948 
2949 	/*
2950 	 * If the task has requested executable lockdown,
2951 	 * deny any new executable mapping.
2952 	 */
2953 	if (map->map_disallow_new_exec == TRUE) {
2954 		if (cur_protection & VM_PROT_EXECUTE) {
2955 			return KERN_PROTECTION_FAILURE;
2956 		}
2957 	}
2958 
2959 	if (resilient_codesign) {
2960 		assert(!is_submap);
2961 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2962 		if ((cur_protection | max_protection) & reject_prot) {
2963 			return KERN_PROTECTION_FAILURE;
2964 		}
2965 	}
2966 
2967 	if (resilient_media) {
2968 		assert(!is_submap);
2969 //		assert(!needs_copy);
2970 		if (object != VM_OBJECT_NULL &&
2971 		    !object->internal) {
2972 			/*
2973 			 * This mapping is directly backed by an external
2974 			 * memory manager (e.g. a vnode pager for a file):
2975 			 * we would not have any safe place to inject
2976 			 * a zero-filled page if an actual page is not
2977 			 * available, without possibly impacting the actual
2978 			 * contents of the mapped object (e.g. the file),
2979 			 * so we can't provide any media resiliency here.
2980 			 */
2981 			return KERN_INVALID_ARGUMENT;
2982 		}
2983 	}
2984 
2985 	if (entry_for_tpro) {
2986 		/*
2987 		 * TPRO overrides the effective permissions of the region
2988 		 * and explicitly maps as RW. Ensure we have been passed
2989 		 * the expected permissions. We accept `cur_protections`
2990 		 * RO as that will be handled on fault.
2991 		 */
2992 		if (!(max_protection & VM_PROT_READ) ||
2993 		    !(max_protection & VM_PROT_WRITE) ||
2994 		    !(cur_protection & VM_PROT_READ)) {
2995 			return KERN_PROTECTION_FAILURE;
2996 		}
2997 
2998 		/*
2999 		 * We can now downgrade the cur_protection to RO. This is a mild lie
3000 		 * to the VM layer. But TPRO will be responsible for toggling the
3001 		 * protections between RO/RW
3002 		 */
3003 		cur_protection = VM_PROT_READ;
3004 	}
3005 
3006 	if (is_submap) {
3007 		vm_map_t submap;
3008 		if (purgable) {
3009 			/* submaps can not be purgeable */
3010 			return KERN_INVALID_ARGUMENT;
3011 		}
3012 		if (object == VM_OBJECT_NULL) {
3013 			/* submaps can not be created lazily */
3014 			return KERN_INVALID_ARGUMENT;
3015 		}
3016 		submap = (vm_map_t) object;
3017 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3018 			/* page size mismatch */
3019 			return KERN_INVALID_ARGUMENT;
3020 		}
3021 	}
3022 	if (vmk_flags.vmkf_already) {
3023 		/*
3024 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3025 		 * is already present.  For it to be meaningul, the requested
3026 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3027 		 * we shouldn't try and remove what was mapped there first
3028 		 * (!VM_FLAGS_OVERWRITE).
3029 		 */
3030 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3031 			return KERN_INVALID_ARGUMENT;
3032 		}
3033 	}
3034 
3035 	if (size == 0 ||
3036 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3037 		*address = 0;
3038 		return KERN_INVALID_ARGUMENT;
3039 	}
3040 
3041 	if (map->pmap == kernel_pmap) {
3042 		user_alias = VM_KERN_MEMORY_NONE;
3043 	} else {
3044 		user_alias = alias;
3045 	}
3046 
3047 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3048 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3049 	}
3050 
3051 #define RETURN(value)   { result = value; goto BailOut; }
3052 
3053 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3054 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3055 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3056 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3057 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3058 	}
3059 
3060 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3061 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3062 		/*
3063 		 * In most cases, the caller rounds the size up to the
3064 		 * map's page size.
3065 		 * If we get a size that is explicitly not map-aligned here,
3066 		 * we'll have to respect the caller's wish and mark the
3067 		 * mapping as "not map-aligned" to avoid tripping the
3068 		 * map alignment checks later.
3069 		 */
3070 		clear_map_aligned = TRUE;
3071 	}
3072 	if (!anywhere &&
3073 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3074 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3075 		/*
3076 		 * We've been asked to map at a fixed address and that
3077 		 * address is not aligned to the map's specific alignment.
3078 		 * The caller should know what it's doing (i.e. most likely
3079 		 * mapping some fragmented copy map, transferring memory from
3080 		 * a VM map with a different alignment), so clear map_aligned
3081 		 * for this new VM map entry and proceed.
3082 		 */
3083 		clear_map_aligned = TRUE;
3084 	}
3085 
3086 	/*
3087 	 * Only zero-fill objects are allowed to be purgable.
3088 	 * LP64todo - limit purgable objects to 32-bits for now
3089 	 */
3090 	if (purgable &&
3091 	    (offset != 0 ||
3092 	    (object != VM_OBJECT_NULL &&
3093 	    (object->vo_size != size ||
3094 	    object->purgable == VM_PURGABLE_DENY))
3095 #if __LP64__
3096 	    || size > ANON_MAX_SIZE
3097 #endif
3098 	    )) {
3099 		return KERN_INVALID_ARGUMENT;
3100 	}
3101 
3102 	vm_map_lock(map);
3103 	map_locked = TRUE;
3104 
3105 	if (anywhere) {
3106 		result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3107 		    address, &entry);
3108 		start = *address;
3109 	} else {
3110 		start = *address;
3111 		result = vm_map_locate_space_fixed(map, start, size, mask,
3112 		    vmk_flags, &entry, &zap_old_list);
3113 	}
3114 
3115 	end = start + size;
3116 
3117 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3118 
3119 	/*
3120 	 * Check if what's already there is what we want.
3121 	 */
3122 	if (result == KERN_MEMORY_PRESENT) {
3123 		assert(!anywhere);
3124 		if (!(vmk_flags.vmkf_already)) {
3125 			RETURN(KERN_NO_SPACE);
3126 		}
3127 		tmp_start = start;
3128 		tmp_offset = offset;
3129 		if (entry->vme_start < start) {
3130 			tmp_start -= start - entry->vme_start;
3131 			tmp_offset -= start - entry->vme_start;
3132 		}
3133 		for (; entry->vme_start < end;
3134 		    entry = entry->vme_next) {
3135 			/*
3136 			 * Check if the mapping's attributes
3137 			 * match the existing map entry.
3138 			 */
3139 			if (entry == vm_map_to_entry(map) ||
3140 			    entry->vme_start != tmp_start ||
3141 			    entry->is_sub_map != is_submap ||
3142 			    VME_OFFSET(entry) != tmp_offset ||
3143 			    entry->needs_copy != needs_copy ||
3144 			    entry->protection != cur_protection ||
3145 			    entry->max_protection != max_protection ||
3146 			    entry->inheritance != inheritance ||
3147 			    entry->iokit_acct != iokit_acct ||
3148 			    VME_ALIAS(entry) != alias) {
3149 				/* not the same mapping ! */
3150 				RETURN(KERN_NO_SPACE);
3151 			}
3152 			/*
3153 			 * Check if the same object is being mapped.
3154 			 */
3155 			if (is_submap) {
3156 				if (VME_SUBMAP(entry) !=
3157 				    (vm_map_t) object) {
3158 					/* not the same submap */
3159 					RETURN(KERN_NO_SPACE);
3160 				}
3161 			} else {
3162 				if (VME_OBJECT(entry) != object) {
3163 					/* not the same VM object... */
3164 					vm_object_t obj2;
3165 
3166 					obj2 = VME_OBJECT(entry);
3167 					if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3168 					    (object == VM_OBJECT_NULL || object->internal)) {
3169 						/*
3170 						 * ... but both are
3171 						 * anonymous memory,
3172 						 * so equivalent.
3173 						 */
3174 					} else {
3175 						RETURN(KERN_NO_SPACE);
3176 					}
3177 				}
3178 			}
3179 
3180 			tmp_offset += entry->vme_end - entry->vme_start;
3181 			tmp_start += entry->vme_end - entry->vme_start;
3182 			if (entry->vme_end >= end) {
3183 				/* reached the end of our mapping */
3184 				break;
3185 			}
3186 		}
3187 		/* it all matches:  let's use what's already there ! */
3188 		RETURN(KERN_MEMORY_PRESENT);
3189 	}
3190 
3191 	if (result != KERN_SUCCESS) {
3192 		goto BailOut;
3193 	}
3194 
3195 
3196 	/*
3197 	 *	At this point,
3198 	 *		"start" and "end" should define the endpoints of the
3199 	 *			available new range, and
3200 	 *		"entry" should refer to the region before the new
3201 	 *			range, and
3202 	 *
3203 	 *		the map should be locked.
3204 	 */
3205 
3206 	/*
3207 	 *	See whether we can avoid creating a new entry (and object) by
3208 	 *	extending one of our neighbors.  [So far, we only attempt to
3209 	 *	extend from below.]  Note that we can never extend/join
3210 	 *	purgable objects because they need to remain distinct
3211 	 *	entities in order to implement their "volatile object"
3212 	 *	semantics.
3213 	 */
3214 
3215 	if (purgable ||
3216 	    entry_for_jit ||
3217 	    entry_for_tpro ||
3218 	    vm_memory_malloc_no_cow(user_alias)) {
3219 		if (superpage_size) {
3220 			/*
3221 			 * For "super page" allocations, we will allocate
3222 			 * special physically-contiguous VM objects later on,
3223 			 * so we should not have flags instructing us to create
3224 			 * a differently special VM object here.
3225 			 */
3226 			RETURN(KERN_INVALID_ARGUMENT);
3227 		}
3228 
3229 		if (object == VM_OBJECT_NULL) {
3230 			assert(!superpage_size);
3231 			object = vm_object_allocate(size);
3232 			vm_object_lock(object);
3233 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3234 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3235 			if (malloc_no_cow_except_fork &&
3236 			    !purgable &&
3237 			    !entry_for_jit &&
3238 			    !entry_for_tpro &&
3239 			    vm_memory_malloc_no_cow(user_alias)) {
3240 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3241 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3242 			}
3243 			if (entry_for_jit) {
3244 				object->vo_inherit_copy_none = true;
3245 			}
3246 			if (purgable) {
3247 				task_t owner;
3248 				VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3249 				if (map->pmap == kernel_pmap) {
3250 					/*
3251 					 * Purgeable mappings made in a kernel
3252 					 * map are "owned" by the kernel itself
3253 					 * rather than the current user task
3254 					 * because they're likely to be used by
3255 					 * more than this user task (see
3256 					 * execargs_purgeable_allocate(), for
3257 					 * example).
3258 					 */
3259 					owner = kernel_task;
3260 				} else {
3261 					owner = current_task();
3262 				}
3263 				assert(object->vo_owner == NULL);
3264 				assert(object->resident_page_count == 0);
3265 				assert(object->wired_page_count == 0);
3266 				vm_purgeable_nonvolatile_enqueue(object, owner);
3267 			}
3268 			vm_object_unlock(object);
3269 			offset = (vm_object_offset_t)0;
3270 		}
3271 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3272 		/* no coalescing if address space uses sub-pages */
3273 	} else if ((is_submap == FALSE) &&
3274 	    (object == VM_OBJECT_NULL) &&
3275 	    (entry != vm_map_to_entry(map)) &&
3276 	    (entry->vme_end == start) &&
3277 	    (!entry->is_shared) &&
3278 	    (!entry->is_sub_map) &&
3279 	    (!entry->in_transition) &&
3280 	    (!entry->needs_wakeup) &&
3281 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3282 	    (entry->protection == cur_protection) &&
3283 	    (entry->max_protection == max_protection) &&
3284 	    (entry->inheritance == inheritance) &&
3285 	    ((user_alias == VM_MEMORY_REALLOC) ||
3286 	    (VME_ALIAS(entry) == alias)) &&
3287 	    (entry->no_cache == no_cache) &&
3288 	    (entry->vme_permanent == permanent) &&
3289 	    /* no coalescing for immutable executable mappings */
3290 	    !((entry->protection & VM_PROT_EXECUTE) &&
3291 	    entry->vme_permanent) &&
3292 	    (!entry->superpage_size && !superpage_size) &&
3293 	    /*
3294 	     * No coalescing if not map-aligned, to avoid propagating
3295 	     * that condition any further than needed:
3296 	     */
3297 	    (!entry->map_aligned || !clear_map_aligned) &&
3298 	    (!entry->zero_wired_pages) &&
3299 	    (!entry->used_for_jit && !entry_for_jit) &&
3300 #if __arm64e__
3301 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3302 #endif
3303 	    (!entry->csm_associated) &&
3304 	    (entry->iokit_acct == iokit_acct) &&
3305 	    (!entry->vme_resilient_codesign) &&
3306 	    (!entry->vme_resilient_media) &&
3307 	    (!entry->vme_atomic) &&
3308 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3309 
3310 	    ((entry->vme_end - entry->vme_start) + size <=
3311 	    (user_alias == VM_MEMORY_REALLOC ?
3312 	    ANON_CHUNK_SIZE :
3313 	    NO_COALESCE_LIMIT)) &&
3314 
3315 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3316 		if (vm_object_coalesce(VME_OBJECT(entry),
3317 		    VM_OBJECT_NULL,
3318 		    VME_OFFSET(entry),
3319 		    (vm_object_offset_t) 0,
3320 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3321 		    (vm_map_size_t)(end - entry->vme_end))) {
3322 			/*
3323 			 *	Coalesced the two objects - can extend
3324 			 *	the previous map entry to include the
3325 			 *	new range.
3326 			 */
3327 			map->size += (end - entry->vme_end);
3328 			assert(entry->vme_start < end);
3329 			assert(VM_MAP_PAGE_ALIGNED(end,
3330 			    VM_MAP_PAGE_MASK(map)));
3331 			if (__improbable(vm_debug_events)) {
3332 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3333 			}
3334 			entry->vme_end = end;
3335 			if (map->holelistenabled) {
3336 				vm_map_store_update_first_free(map, entry, TRUE);
3337 			} else {
3338 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3339 			}
3340 			new_mapping_established = TRUE;
3341 			RETURN(KERN_SUCCESS);
3342 		}
3343 	}
3344 
3345 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3346 	new_entry = NULL;
3347 
3348 	if (vmk_flags.vmkf_submap_adjust) {
3349 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3350 		offset = start;
3351 	}
3352 
3353 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3354 		tmp2_end = tmp2_start + step;
3355 		/*
3356 		 *	Create a new entry
3357 		 *
3358 		 * XXX FBDP
3359 		 * The reserved "page zero" in each process's address space can
3360 		 * be arbitrarily large.  Splitting it into separate objects and
3361 		 * therefore different VM map entries serves no purpose and just
3362 		 * slows down operations on the VM map, so let's not split the
3363 		 * allocation into chunks if the max protection is NONE.  That
3364 		 * memory should never be accessible, so it will never get to the
3365 		 * default pager.
3366 		 */
3367 		tmp_start = tmp2_start;
3368 		if (!is_submap &&
3369 		    object == VM_OBJECT_NULL &&
3370 		    size > chunk_size &&
3371 		    max_protection != VM_PROT_NONE &&
3372 		    superpage_size == 0) {
3373 			tmp_end = tmp_start + chunk_size;
3374 		} else {
3375 			tmp_end = tmp2_end;
3376 		}
3377 		do {
3378 			if (!is_submap &&
3379 			    object != VM_OBJECT_NULL &&
3380 			    object->internal &&
3381 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3382 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3383 				DTRACE_VM5(vm_map_enter_overmap,
3384 				    vm_map_t, map,
3385 				    vm_map_address_t, tmp_start,
3386 				    vm_map_address_t, tmp_end,
3387 				    vm_object_offset_t, offset,
3388 				    vm_object_size_t, object->vo_size);
3389 			}
3390 			new_entry = vm_map_entry_insert(map,
3391 			    entry, tmp_start, tmp_end,
3392 			    object, offset, vmk_flags,
3393 			    needs_copy,
3394 			    cur_protection, max_protection,
3395 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3396 			    VM_INHERIT_NONE : inheritance),
3397 			    clear_map_aligned);
3398 
3399 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3400 
3401 			if (resilient_codesign) {
3402 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3403 				if (!((cur_protection | max_protection) & reject_prot)) {
3404 					new_entry->vme_resilient_codesign = TRUE;
3405 				}
3406 			}
3407 
3408 			if (resilient_media &&
3409 			    (object == VM_OBJECT_NULL ||
3410 			    object->internal)) {
3411 				new_entry->vme_resilient_media = TRUE;
3412 			}
3413 
3414 			assert(!new_entry->iokit_acct);
3415 			if (!is_submap &&
3416 			    object != VM_OBJECT_NULL &&
3417 			    object->internal &&
3418 			    (object->purgable != VM_PURGABLE_DENY ||
3419 			    object->vo_ledger_tag)) {
3420 				assert(new_entry->use_pmap);
3421 				assert(!new_entry->iokit_acct);
3422 				/*
3423 				 * Turn off pmap accounting since
3424 				 * purgeable (or tagged) objects have their
3425 				 * own ledgers.
3426 				 */
3427 				new_entry->use_pmap = FALSE;
3428 			} else if (!is_submap &&
3429 			    iokit_acct &&
3430 			    object != VM_OBJECT_NULL &&
3431 			    object->internal) {
3432 				/* alternate accounting */
3433 				assert(!new_entry->iokit_acct);
3434 				assert(new_entry->use_pmap);
3435 				new_entry->iokit_acct = TRUE;
3436 				new_entry->use_pmap = FALSE;
3437 				DTRACE_VM4(
3438 					vm_map_iokit_mapped_region,
3439 					vm_map_t, map,
3440 					vm_map_offset_t, new_entry->vme_start,
3441 					vm_map_offset_t, new_entry->vme_end,
3442 					int, VME_ALIAS(new_entry));
3443 				vm_map_iokit_mapped_region(
3444 					map,
3445 					(new_entry->vme_end -
3446 					new_entry->vme_start));
3447 			} else if (!is_submap) {
3448 				assert(!new_entry->iokit_acct);
3449 				assert(new_entry->use_pmap);
3450 			}
3451 
3452 			if (is_submap) {
3453 				vm_map_t        submap;
3454 				boolean_t       submap_is_64bit;
3455 				boolean_t       use_pmap;
3456 
3457 				assert(new_entry->is_sub_map);
3458 				assert(!new_entry->use_pmap);
3459 				assert(!new_entry->iokit_acct);
3460 				submap = (vm_map_t) object;
3461 				submap_is_64bit = vm_map_is_64bit(submap);
3462 				use_pmap = vmk_flags.vmkf_nested_pmap;
3463 #ifndef NO_NESTED_PMAP
3464 				if (use_pmap && submap->pmap == NULL) {
3465 					ledger_t ledger = map->pmap->ledger;
3466 					/* we need a sub pmap to nest... */
3467 					submap->pmap = pmap_create_options(ledger, 0,
3468 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3469 					if (submap->pmap == NULL) {
3470 						/* let's proceed without nesting... */
3471 					}
3472 #if defined(__arm64__)
3473 					else {
3474 						pmap_set_nested(submap->pmap);
3475 					}
3476 #endif
3477 				}
3478 				if (use_pmap && submap->pmap != NULL) {
3479 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3480 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3481 						kr = KERN_FAILURE;
3482 					} else {
3483 						kr = pmap_nest(map->pmap,
3484 						    submap->pmap,
3485 						    tmp_start,
3486 						    tmp_end - tmp_start);
3487 					}
3488 					if (kr != KERN_SUCCESS) {
3489 						printf("vm_map_enter: "
3490 						    "pmap_nest(0x%llx,0x%llx) "
3491 						    "error 0x%x\n",
3492 						    (long long)tmp_start,
3493 						    (long long)tmp_end,
3494 						    kr);
3495 					} else {
3496 						/* we're now nested ! */
3497 						new_entry->use_pmap = TRUE;
3498 						pmap_empty = FALSE;
3499 					}
3500 				}
3501 #endif /* NO_NESTED_PMAP */
3502 			}
3503 			entry = new_entry;
3504 
3505 			if (superpage_size) {
3506 				vm_page_t pages, m;
3507 				vm_object_t sp_object;
3508 				vm_object_offset_t sp_offset;
3509 
3510 				assert(object == VM_OBJECT_NULL);
3511 				VME_OFFSET_SET(entry, 0);
3512 
3513 				/* allocate one superpage */
3514 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3515 				if (kr != KERN_SUCCESS) {
3516 					/* deallocate whole range... */
3517 					new_mapping_established = TRUE;
3518 					/* ... but only up to "tmp_end" */
3519 					size -= end - tmp_end;
3520 					RETURN(kr);
3521 				}
3522 
3523 				/* create one vm_object per superpage */
3524 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3525 				vm_object_lock(sp_object);
3526 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3527 				VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3528 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3529 				VME_OBJECT_SET(entry, sp_object, false, 0);
3530 				assert(entry->use_pmap);
3531 
3532 				/* enter the base pages into the object */
3533 				for (sp_offset = 0;
3534 				    sp_offset < SUPERPAGE_SIZE;
3535 				    sp_offset += PAGE_SIZE) {
3536 					m = pages;
3537 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3538 					pages = NEXT_PAGE(m);
3539 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3540 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3541 				}
3542 				vm_object_unlock(sp_object);
3543 			}
3544 		} while (tmp_end != tmp2_end &&
3545 		    (tmp_start = tmp_end) &&
3546 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3547 		    tmp_end + chunk_size : tmp2_end));
3548 	}
3549 
3550 	new_mapping_established = TRUE;
3551 
3552 BailOut:
3553 	assert(map_locked == TRUE);
3554 
3555 	/*
3556 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3557 	 * If we have identified and possibly established the new mapping(s),
3558 	 * make sure we did not go beyond the address space limit.
3559 	 */
3560 	if (result == KERN_SUCCESS) {
3561 		if (map->size_limit != RLIM_INFINITY &&
3562 		    map->size > map->size_limit) {
3563 			/*
3564 			 * Establishing the requested mappings would exceed
3565 			 * the process's RLIMIT_AS limit: fail with
3566 			 * KERN_NO_SPACE.
3567 			 */
3568 			result = KERN_NO_SPACE;
3569 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3570 			    proc_selfpid(),
3571 			    (get_bsdtask_info(current_task())
3572 			    ? proc_name_address(get_bsdtask_info(current_task()))
3573 			    : "?"),
3574 			    __FUNCTION__,
3575 			    (uint64_t) map->size,
3576 			    (uint64_t) map->size_limit);
3577 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3578 			    vm_map_size_t, map->size,
3579 			    uint64_t, map->size_limit);
3580 			vm_map_enter_RLIMIT_AS_count++;
3581 		} else if (map->data_limit != RLIM_INFINITY &&
3582 		    map->size > map->data_limit) {
3583 			/*
3584 			 * Establishing the requested mappings would exceed
3585 			 * the process's RLIMIT_DATA limit: fail with
3586 			 * KERN_NO_SPACE.
3587 			 */
3588 			result = KERN_NO_SPACE;
3589 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3590 			    proc_selfpid(),
3591 			    (get_bsdtask_info(current_task())
3592 			    ? proc_name_address(get_bsdtask_info(current_task()))
3593 			    : "?"),
3594 			    __FUNCTION__,
3595 			    (uint64_t) map->size,
3596 			    (uint64_t) map->data_limit);
3597 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3598 			    vm_map_size_t, map->size,
3599 			    uint64_t, map->data_limit);
3600 			vm_map_enter_RLIMIT_DATA_count++;
3601 		}
3602 	}
3603 
3604 	if (result == KERN_SUCCESS) {
3605 		vm_prot_t pager_prot;
3606 		memory_object_t pager;
3607 
3608 #if DEBUG
3609 		if (pmap_empty &&
3610 		    !(vmk_flags.vmkf_no_pmap_check)) {
3611 			assert(pmap_is_empty(map->pmap,
3612 			    *address,
3613 			    *address + size));
3614 		}
3615 #endif /* DEBUG */
3616 
3617 		/*
3618 		 * For "named" VM objects, let the pager know that the
3619 		 * memory object is being mapped.  Some pagers need to keep
3620 		 * track of this, to know when they can reclaim the memory
3621 		 * object, for example.
3622 		 * VM calls memory_object_map() for each mapping (specifying
3623 		 * the protection of each mapping) and calls
3624 		 * memory_object_last_unmap() when all the mappings are gone.
3625 		 */
3626 		pager_prot = max_protection;
3627 		if (needs_copy) {
3628 			/*
3629 			 * Copy-On-Write mapping: won't modify
3630 			 * the memory object.
3631 			 */
3632 			pager_prot &= ~VM_PROT_WRITE;
3633 		}
3634 		if (!is_submap &&
3635 		    object != VM_OBJECT_NULL &&
3636 		    object->named &&
3637 		    object->pager != MEMORY_OBJECT_NULL) {
3638 			vm_object_lock(object);
3639 			pager = object->pager;
3640 			if (object->named &&
3641 			    pager != MEMORY_OBJECT_NULL) {
3642 				assert(object->pager_ready);
3643 				vm_object_mapping_wait(object, THREAD_UNINT);
3644 				/* object might have lost its pager while waiting */
3645 				pager = object->pager;
3646 				if (object->named && pager != MEMORY_OBJECT_NULL) {
3647 					vm_object_mapping_begin(object);
3648 					vm_object_unlock(object);
3649 
3650 					kr = memory_object_map(pager, pager_prot);
3651 					assert(kr == KERN_SUCCESS);
3652 
3653 					vm_object_lock(object);
3654 					vm_object_mapping_end(object);
3655 				}
3656 			}
3657 			vm_object_unlock(object);
3658 		}
3659 	}
3660 
3661 	assert(map_locked == TRUE);
3662 
3663 	if (new_mapping_established) {
3664 		/*
3665 		 * If we release the map lock for any reason below,
3666 		 * another thread could deallocate our new mapping,
3667 		 * releasing the caller's reference on "caller_object",
3668 		 * which was transferred to the mapping.
3669 		 * If this was the only reference, the object could be
3670 		 * destroyed.
3671 		 *
3672 		 * We need to take an extra reference on "caller_object"
3673 		 * to keep it alive if we need to return the caller's
3674 		 * reference to the caller in case of failure.
3675 		 */
3676 		if (is_submap) {
3677 			vm_map_reference((vm_map_t)caller_object);
3678 		} else {
3679 			vm_object_reference(caller_object);
3680 		}
3681 	}
3682 
3683 	if (!keep_map_locked) {
3684 		vm_map_unlock(map);
3685 		map_locked = FALSE;
3686 		entry = VM_MAP_ENTRY_NULL;
3687 		new_entry = VM_MAP_ENTRY_NULL;
3688 	}
3689 
3690 	/*
3691 	 * We can't hold the map lock if we enter this block.
3692 	 */
3693 
3694 	if (result == KERN_SUCCESS) {
3695 		/*	Wire down the new entry if the user
3696 		 *	requested all new map entries be wired.
3697 		 */
3698 		if ((map->wiring_required) || (superpage_size)) {
3699 			assert(!keep_map_locked);
3700 			pmap_empty = FALSE; /* pmap won't be empty */
3701 			kr = vm_map_wire_nested(map, start, end,
3702 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3703 			    TRUE, PMAP_NULL, 0, NULL);
3704 			result = kr;
3705 		}
3706 
3707 	}
3708 
3709 	if (result != KERN_SUCCESS) {
3710 		if (new_mapping_established) {
3711 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3712 
3713 			/*
3714 			 * We have to get rid of the new mappings since we
3715 			 * won't make them available to the user.
3716 			 * Try and do that atomically, to minimize the risk
3717 			 * that someone else create new mappings that range.
3718 			 */
3719 			if (!map_locked) {
3720 				vm_map_lock(map);
3721 				map_locked = TRUE;
3722 			}
3723 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3724 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3725 			if (permanent) {
3726 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3727 			}
3728 			(void) vm_map_delete(map,
3729 			    *address, *address + size,
3730 			    remove_flags,
3731 			    KMEM_GUARD_NONE, &zap_new_list);
3732 		}
3733 
3734 		if (vm_map_zap_first_entry(&zap_old_list)) {
3735 			vm_map_entry_t entry1, entry2;
3736 
3737 			/*
3738 			 * The new mapping failed.  Attempt to restore
3739 			 * the old mappings, saved in the "zap_old_map".
3740 			 */
3741 			if (!map_locked) {
3742 				vm_map_lock(map);
3743 				map_locked = TRUE;
3744 			}
3745 
3746 			/* first check if the coast is still clear */
3747 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3748 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3749 
3750 			if (vm_map_lookup_entry(map, start, &entry1) ||
3751 			    vm_map_lookup_entry(map, end, &entry2) ||
3752 			    entry1 != entry2) {
3753 				/*
3754 				 * Part of that range has already been
3755 				 * re-mapped:  we can't restore the old
3756 				 * mappings...
3757 				 */
3758 				vm_map_enter_restore_failures++;
3759 			} else {
3760 				/*
3761 				 * Transfer the saved map entries from
3762 				 * "zap_old_map" to the original "map",
3763 				 * inserting them all after "entry1".
3764 				 */
3765 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3766 					vm_map_size_t entry_size;
3767 
3768 					entry_size = (entry2->vme_end -
3769 					    entry2->vme_start);
3770 					vm_map_store_entry_link(map, entry1, entry2,
3771 					    VM_MAP_KERNEL_FLAGS_NONE);
3772 					map->size += entry_size;
3773 					entry1 = entry2;
3774 				}
3775 				if (map->wiring_required) {
3776 					/*
3777 					 * XXX TODO: we should rewire the
3778 					 * old pages here...
3779 					 */
3780 				}
3781 				vm_map_enter_restore_successes++;
3782 			}
3783 		}
3784 	}
3785 
3786 	/*
3787 	 * The caller is responsible for releasing the lock if it requested to
3788 	 * keep the map locked.
3789 	 */
3790 	if (map_locked && !keep_map_locked) {
3791 		vm_map_unlock(map);
3792 	}
3793 
3794 	vm_map_zap_dispose(&zap_old_list);
3795 	vm_map_zap_dispose(&zap_new_list);
3796 
3797 	if (new_mapping_established) {
3798 		/*
3799 		 * The caller had a reference on "caller_object" and we
3800 		 * transferred that reference to the mapping.
3801 		 * We also took an extra reference on "caller_object" to keep
3802 		 * it alive while the map was unlocked.
3803 		 */
3804 		if (result == KERN_SUCCESS) {
3805 			/*
3806 			 * On success, the caller's reference on the object gets
3807 			 * tranferred to the mapping.
3808 			 * Release our extra reference.
3809 			 */
3810 			if (is_submap) {
3811 				vm_map_deallocate((vm_map_t)caller_object);
3812 			} else {
3813 				vm_object_deallocate(caller_object);
3814 			}
3815 		} else {
3816 			/*
3817 			 * On error, the caller expects to still have a
3818 			 * reference on the object it gave us.
3819 			 * Let's use our extra reference for that.
3820 			 */
3821 		}
3822 	}
3823 
3824 	return result;
3825 
3826 #undef  RETURN
3827 }
3828 
/*
 * Counters for the prefault optimization.
 *
 * NOTE(review): based on the names, these appear to track the number of
 * pages successfully prefaulted and the number of times the prefault loop
 * bailed out early; the code that increments them is not in this section —
 * confirm against the prefault path in vm_map_enter_mem_object().
 */
int64_t vm_prefault_nb_pages = 0;   /* pages prefaulted (presumed) */
int64_t vm_prefault_nb_bailout = 0; /* prefault early bailouts (presumed) */
3834 
3835 static kern_return_t
vm_map_enter_adjust_offset(vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_offset_t quantity)3836 vm_map_enter_adjust_offset(
3837 	vm_object_offset_t *obj_offs,
3838 	vm_object_offset_t *obj_end,
3839 	vm_object_offset_t  quantity)
3840 {
3841 	if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3842 	    os_add_overflow(*obj_end, quantity, obj_end) ||
3843 	    vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3844 		return KERN_INVALID_ARGUMENT;
3845 	}
3846 
3847 	return KERN_SUCCESS;
3848 }
3849 
/*
 * Sanitize the caller-provided ("unsafe", *_u) parameters of
 * vm_map_enter_mem_object() into trusted values.
 *
 * Outputs (only valid on KERN_SUCCESS):
 *   map_addr/map_size/mask     - sanitized mapping address, size and
 *                                alignment mask in the target map.
 *   obj_offs/obj_end/obj_size  - sanitized offset range within the backing
 *                                memory object.
 *   cur_protection/max_protection/inheritance
 *                              - sanitized protection and inheritance.
 *
 * Returns KERN_SUCCESS, or the (unsanitized) error from the failing
 * vm_sanitize_* call; the caller converts it with vm_sanitize_get_kr().
 */
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vm_map_enter_mem_object_sanitize(
	vm_map_t                target_map,
	vm_map_offset_ut        address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_object_offset_ut     offset_u,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u,
	vm_map_kernel_flags_t   vmk_flags,
	ipc_port_t              port,
	vm_map_address_t       *map_addr,
	vm_map_size_t          *map_size,
	vm_map_offset_t        *mask,
	vm_object_offset_t     *obj_offs,
	vm_object_offset_t     *obj_end,
	vm_object_size_t       *obj_size,
	vm_prot_t              *cur_protection,
	vm_prot_t              *max_protection,
	vm_inherit_t           *inheritance)
{
	kern_return_t           result;

	/*
	 * Validate both protections together (cur must be a subset of max);
	 * VM_PROT_IS_MASK is tolerated here and stripped by the caller.
	 */
	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
	    VM_PROT_IS_MASK, cur_protection,
	    max_protection);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
	    inheritance);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	if (vmk_flags.vmf_fixed) {
		vm_map_address_t        map_end;

		/*
		 * Fixed mapping: the address matters, so validate the
		 * [address, address + size) range against the target map
		 * (zero size fails; start is re-aligned to the map's page
		 * boundary).
		 */
		result = vm_sanitize_addr_size(address_u, initial_size_u,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
		    target_map,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
		    map_addr, &map_end, map_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	} else {
		/*
		 * "Anywhere" mapping: the address is only a hint, so it is
		 * merely aligned; only the size needs full validation.
		 */
		*map_addr = vm_sanitize_addr(target_map, address_u);
		result = vm_sanitize_size(0, initial_size_u,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	}

	/*
	 * The object size is the mapping size rounded to the VM object page
	 * size; a result of 0 means the rounding overflowed.
	 */
	*obj_size = vm_object_round_page(*map_size);
	if (__improbable(*obj_size == 0)) {
		return KERN_INVALID_ARGUMENT;
	}

	if (IP_VALID(port)) {
		/*
		 * A backing port was supplied: validate the object offset
		 * range [offset, offset + obj_size) against PAGE_MASK,
		 * keeping the unaligned values (vm_map_enter() handles the
		 * "not map-aligned" case).
		 */
		result = vm_sanitize_addr_size(offset_u, *obj_size,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
		    PAGE_MASK,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
		    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
		    obj_offs, obj_end, obj_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	} else {
		/* No port: anonymous memory starts at offset 0. */
		*obj_offs = 0;
		*obj_end  = *obj_size;
	}

	return KERN_SUCCESS;
}
3937 
3938 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset_u,boolean_t copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,upl_page_list_ptr_t page_list,unsigned int page_list_count)3939 vm_map_enter_mem_object(
3940 	vm_map_t                target_map,
3941 	vm_map_offset_ut       *address_u,
3942 	vm_map_size_ut          initial_size_u,
3943 	vm_map_offset_ut        mask_u,
3944 	vm_map_kernel_flags_t   vmk_flags,
3945 	ipc_port_t              port,
3946 	vm_object_offset_ut     offset_u,
3947 	boolean_t               copy,
3948 	vm_prot_ut              cur_protection_u,
3949 	vm_prot_ut              max_protection_u,
3950 	vm_inherit_ut           inheritance_u,
3951 	upl_page_list_ptr_t     page_list,
3952 	unsigned int            page_list_count)
3953 {
3954 	vm_map_offset_t         mask;
3955 	vm_prot_t               cur_protection;
3956 	vm_prot_t               max_protection;
3957 	vm_inherit_t            inheritance;
3958 	vm_map_address_t        map_addr, map_mask;
3959 	vm_map_size_t           map_size;
3960 	vm_object_t             object = VM_OBJECT_NULL;
3961 	vm_object_offset_t      obj_offs, obj_end;
3962 	vm_object_size_t        obj_size;
3963 	kern_return_t           result;
3964 	boolean_t               mask_cur_protection, mask_max_protection;
3965 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
3966 	vm_map_offset_t         offset_in_mapping = 0;
3967 
3968 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3969 		/* XXX TODO4K prefaulting depends on page size... */
3970 		try_prefault = FALSE;
3971 	}
3972 
3973 	/*
3974 	 * Check arguments for validity
3975 	 */
3976 	if ((target_map == VM_MAP_NULL) ||
3977 	    (try_prefault && (copy || !page_list))) {
3978 		return KERN_INVALID_ARGUMENT;
3979 	}
3980 
3981 	map_mask = vm_map_page_mask(target_map);
3982 
3983 	/*
3984 	 * Sanitize any input parameters that are addr/size/prot/inherit
3985 	 */
3986 	result = vm_map_enter_mem_object_sanitize(
3987 		target_map,
3988 		*address_u,
3989 		initial_size_u,
3990 		mask_u,
3991 		offset_u,
3992 		cur_protection_u,
3993 		max_protection_u,
3994 		inheritance_u,
3995 		vmk_flags,
3996 		port,
3997 		&map_addr,
3998 		&map_size,
3999 		&mask,
4000 		&obj_offs,
4001 		&obj_end,
4002 		&obj_size,
4003 		&cur_protection,
4004 		&max_protection,
4005 		&inheritance);
4006 	if (__improbable(result != KERN_SUCCESS)) {
4007 		return vm_sanitize_get_kr(result);
4008 	}
4009 
4010 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4011 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4012 
4013 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4014 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4015 	cur_protection &= ~VM_PROT_IS_MASK;
4016 	max_protection &= ~VM_PROT_IS_MASK;
4017 
4018 #if __arm64__
4019 	if (cur_protection & VM_PROT_EXECUTE) {
4020 		cur_protection |= VM_PROT_READ;
4021 	}
4022 #endif /* __arm64__ */
4023 
4024 	/*
4025 	 * Find the vm object (if any) corresponding to this port.
4026 	 */
4027 	if (!IP_VALID(port)) {
4028 		object = VM_OBJECT_NULL;
4029 		copy = FALSE;
4030 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4031 		vm_named_entry_t        named_entry;
4032 		vm_object_size_t        initial_size;
4033 
4034 		named_entry = mach_memory_entry_from_port(port);
4035 
4036 		if (vmk_flags.vmf_return_data_addr ||
4037 		    vmk_flags.vmf_return_4k_data_addr) {
4038 			result = vm_map_enter_adjust_offset(&obj_offs,
4039 			    &obj_end, named_entry->data_offset);
4040 			if (__improbable(result)) {
4041 				return result;
4042 			}
4043 		}
4044 
4045 		/* a few checks to make sure user is obeying rules */
4046 		if (mask_max_protection) {
4047 			max_protection &= named_entry->protection;
4048 		}
4049 		if (mask_cur_protection) {
4050 			cur_protection &= named_entry->protection;
4051 		}
4052 		if ((named_entry->protection & max_protection) !=
4053 		    max_protection) {
4054 			return KERN_INVALID_RIGHT;
4055 		}
4056 		if ((named_entry->protection & cur_protection) !=
4057 		    cur_protection) {
4058 			return KERN_INVALID_RIGHT;
4059 		}
4060 
4061 		/*
4062 		 * unwrap is safe because we know obj_size is larger and doesn't
4063 		 * overflow
4064 		 */
4065 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4066 		if (named_entry->size < obj_offs + initial_size) {
4067 			return KERN_INVALID_ARGUMENT;
4068 		}
4069 
4070 		/* for a vm_map_copy, we can only map it whole */
4071 		if (named_entry->is_copy &&
4072 		    (obj_size != named_entry->size) &&
4073 		    (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4074 			/* XXX FBDP use the rounded size... */
4075 			obj_end += named_entry->size - obj_size;
4076 			obj_size = named_entry->size;
4077 		}
4078 
4079 		if (named_entry->offset) {
4080 			/*
4081 			 * the callers parameter offset is defined to be the
4082 			 * offset from beginning of named entry offset in object
4083 			 *
4084 			 * Because we checked above that
4085 			 *   obj_offs + obj_size < named_entry_size
4086 			 * these overflow checks should be redundant...
4087 			 */
4088 			result = vm_map_enter_adjust_offset(&obj_offs,
4089 			    &obj_end, named_entry->offset);
4090 			if (__improbable(result)) {
4091 				return result;
4092 			}
4093 		}
4094 
4095 		if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4096 			/*
4097 			 * Let's not map more than requested;
4098 			 * vm_map_enter() will handle this "not map-aligned"
4099 			 * case.
4100 			 */
4101 			map_size = obj_size;
4102 		}
4103 
4104 		named_entry_lock(named_entry);
4105 
4106 		// rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4107 		assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4108 
4109 		if (named_entry->is_sub_map) {
4110 			vm_map_t                submap;
4111 
4112 			assert(!named_entry->is_copy);
4113 			assert(!named_entry->is_object);
4114 
4115 			if (vmk_flags.vmf_return_data_addr ||
4116 			    vmk_flags.vmf_return_4k_data_addr) {
4117 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4118 			}
4119 
4120 			submap = named_entry->backing.map;
4121 			vm_map_reference(submap);
4122 			named_entry_unlock(named_entry);
4123 
4124 			vmk_flags.vmkf_submap = TRUE;
4125 			result = vm_map_enter(target_map,
4126 			    &map_addr,
4127 			    map_size,
4128 			    mask,
4129 			    vmk_flags,
4130 			    (vm_object_t)(uintptr_t) submap,
4131 			    obj_offs,
4132 			    copy,
4133 			    cur_protection,
4134 			    max_protection,
4135 			    inheritance);
4136 			if (result != KERN_SUCCESS) {
4137 				vm_map_deallocate(submap);
4138 				return result;
4139 			}
4140 			/*
4141 			 * No need to lock "submap" just to check its
4142 			 * "mapped" flag: that flag is never reset
4143 			 * once it's been set and if we race, we'll
4144 			 * just end up setting it twice, which is OK.
4145 			 */
4146 			if (submap->mapped_in_other_pmaps == FALSE &&
4147 			    vm_map_pmap(submap) != PMAP_NULL &&
4148 			    vm_map_pmap(submap) !=
4149 			    vm_map_pmap(target_map)) {
4150 				/*
4151 				 * This submap is being mapped in a map
4152 				 * that uses a different pmap.
4153 				 * Set its "mapped_in_other_pmaps" flag
4154 				 * to indicate that we now need to
4155 				 * remove mappings from all pmaps rather
4156 				 * than just the submap's pmap.
4157 				 */
4158 				vm_map_lock(submap);
4159 				submap->mapped_in_other_pmaps = TRUE;
4160 				vm_map_unlock(submap);
4161 			}
4162 			goto out;
4163 		}
4164 
4165 		if (named_entry->is_copy) {
4166 			kern_return_t   kr;
4167 			vm_map_copy_t   copy_map;
4168 			vm_map_entry_t  copy_entry;
4169 			vm_map_offset_t copy_addr;
4170 			vm_map_copy_t   target_copy_map;
4171 			vm_map_offset_t overmap_start, overmap_end;
4172 			vm_map_offset_t trimmed_start;
4173 			vm_map_size_t   target_size;
4174 
4175 			assert(!named_entry->is_object);
4176 			assert(!named_entry->is_sub_map);
4177 
4178 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4179 			    (VM_FLAGS_FIXED |
4180 			    VM_FLAGS_ANYWHERE |
4181 			    VM_FLAGS_OVERWRITE |
4182 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4183 			    VM_FLAGS_RETURN_DATA_ADDR))) {
4184 				named_entry_unlock(named_entry);
4185 				return KERN_INVALID_ARGUMENT;
4186 			}
4187 
4188 			copy_map = named_entry->backing.copy;
4189 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4190 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4191 				/* unsupported type; should not happen */
4192 				printf("vm_map_enter_mem_object: "
4193 				    "memory_entry->backing.copy "
4194 				    "unsupported type 0x%x\n",
4195 				    copy_map->type);
4196 				named_entry_unlock(named_entry);
4197 				return KERN_INVALID_ARGUMENT;
4198 			}
4199 
4200 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4201 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4202 			}
4203 
4204 			if (vmk_flags.vmf_return_data_addr ||
4205 			    vmk_flags.vmf_return_4k_data_addr) {
4206 				offset_in_mapping = obj_offs & map_mask;
4207 				if (vmk_flags.vmf_return_4k_data_addr) {
4208 					offset_in_mapping &= ~((signed)(0xFFF));
4209 				}
4210 			}
4211 
4212 			target_copy_map = VM_MAP_COPY_NULL;
4213 			target_size = copy_map->size;
4214 			overmap_start = 0;
4215 			overmap_end = 0;
4216 			trimmed_start = 0;
4217 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4218 				DEBUG4K_ADJUST("adjusting...\n");
4219 				kr = vm_map_copy_adjust_to_target(
4220 					copy_map,
4221 					obj_offs,
4222 					initial_size,
4223 					target_map,
4224 					copy,
4225 					&target_copy_map,
4226 					&overmap_start,
4227 					&overmap_end,
4228 					&trimmed_start);
4229 				if (kr != KERN_SUCCESS) {
4230 					named_entry_unlock(named_entry);
4231 					return kr;
4232 				}
4233 				target_size = target_copy_map->size;
4234 			} else {
4235 				/*
4236 				 * Assert that the vm_map_copy is coming from the right
4237 				 * zone and hasn't been forged
4238 				 */
4239 				vm_map_copy_require(copy_map);
4240 				target_copy_map = copy_map;
4241 			}
4242 
4243 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4244 
4245 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4246 			    (VM_FLAGS_FIXED |
4247 			    VM_FLAGS_ANYWHERE |
4248 			    VM_FLAGS_OVERWRITE |
4249 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4250 			    VM_FLAGS_RETURN_DATA_ADDR));
4251 
4252 			/* reserve a contiguous range */
4253 			kr = vm_map_enter(target_map,
4254 			    &map_addr,
4255 			    vm_map_round_page(target_size, map_mask),
4256 			    mask,
4257 			    rsv_flags,
4258 			    VM_OBJECT_NULL,
4259 			    0,
4260 			    FALSE,               /* copy */
4261 			    cur_protection,
4262 			    max_protection,
4263 			    inheritance);
4264 			if (kr != KERN_SUCCESS) {
4265 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4266 				if (target_copy_map != copy_map) {
4267 					vm_map_copy_discard(target_copy_map);
4268 					target_copy_map = VM_MAP_COPY_NULL;
4269 				}
4270 				named_entry_unlock(named_entry);
4271 				return kr;
4272 			}
4273 
4274 			copy_addr = map_addr;
4275 
4276 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4277 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4278 			    copy_entry = copy_entry->vme_next) {
4279 				vm_map_t                copy_submap = VM_MAP_NULL;
4280 				vm_object_t             copy_object = VM_OBJECT_NULL;
4281 				vm_map_size_t           copy_size;
4282 				vm_object_offset_t      copy_offset;
4283 				boolean_t               do_copy = false;
4284 
4285 				if (copy_entry->is_sub_map) {
4286 					copy_submap = VME_SUBMAP(copy_entry);
4287 					copy_object = (vm_object_t)copy_submap;
4288 				} else {
4289 					copy_object = VME_OBJECT(copy_entry);
4290 				}
4291 				copy_offset = VME_OFFSET(copy_entry);
4292 				copy_size = (copy_entry->vme_end -
4293 				    copy_entry->vme_start);
4294 
4295 				/* sanity check */
4296 				if ((copy_addr + copy_size) >
4297 				    (map_addr +
4298 				    overmap_start + overmap_end +
4299 				    named_entry->size /* XXX full size */)) {
4300 					/* over-mapping too much !? */
4301 					kr = KERN_INVALID_ARGUMENT;
4302 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4303 					/* abort */
4304 					break;
4305 				}
4306 
4307 				/* take a reference on the object */
4308 				if (copy_entry->is_sub_map) {
4309 					vm_map_reference(copy_submap);
4310 				} else {
4311 					if (!copy &&
4312 					    copy_object != VM_OBJECT_NULL &&
4313 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4314 						bool is_writable;
4315 
4316 						/*
4317 						 * We need to resolve our side of this
4318 						 * "symmetric" copy-on-write now; we
4319 						 * need a new object to map and share,
4320 						 * instead of the current one which
4321 						 * might still be shared with the
4322 						 * original mapping.
4323 						 *
4324 						 * Note: A "vm_map_copy_t" does not
4325 						 * have a lock but we're protected by
4326 						 * the named entry's lock here.
4327 						 */
4328 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4329 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4330 						assert(copy_object != VME_OBJECT(copy_entry));
4331 						is_writable = false;
4332 						if (copy_entry->protection & VM_PROT_WRITE) {
4333 							is_writable = true;
4334 #if __arm64e__
4335 						} else if (copy_entry->used_for_tpro) {
4336 							is_writable = true;
4337 #endif /* __arm64e__ */
4338 						}
4339 						if (!copy_entry->needs_copy && is_writable) {
4340 							vm_prot_t prot;
4341 
4342 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4343 							vm_object_pmap_protect(copy_object,
4344 							    copy_offset,
4345 							    copy_size,
4346 							    PMAP_NULL,
4347 							    PAGE_SIZE,
4348 							    0,
4349 							    prot);
4350 						}
4351 						copy_entry->needs_copy = FALSE;
4352 						copy_entry->is_shared = TRUE;
4353 						copy_object = VME_OBJECT(copy_entry);
4354 						copy_offset = VME_OFFSET(copy_entry);
4355 						vm_object_lock(copy_object);
4356 						/* we're about to make a shared mapping of this object */
4357 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4358 						VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4359 						vm_object_unlock(copy_object);
4360 					}
4361 
4362 					if (copy_object != VM_OBJECT_NULL &&
4363 					    copy_object->named &&
4364 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4365 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4366 						memory_object_t pager;
4367 						vm_prot_t       pager_prot;
4368 
4369 						/*
4370 						 * For "named" VM objects, let the pager know that the
4371 						 * memory object is being mapped.  Some pagers need to keep
4372 						 * track of this, to know when they can reclaim the memory
4373 						 * object, for example.
4374 						 * VM calls memory_object_map() for each mapping (specifying
4375 						 * the protection of each mapping) and calls
4376 						 * memory_object_last_unmap() when all the mappings are gone.
4377 						 */
4378 						pager_prot = max_protection;
4379 						if (copy) {
4380 							/*
4381 							 * Copy-On-Write mapping: won't modify the
4382 							 * memory object.
4383 							 */
4384 							pager_prot &= ~VM_PROT_WRITE;
4385 						}
4386 						vm_object_lock(copy_object);
4387 						pager = copy_object->pager;
4388 						if (copy_object->named &&
4389 						    pager != MEMORY_OBJECT_NULL &&
4390 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4391 							assert(copy_object->pager_ready);
4392 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4393 							/*
4394 							 * Object might have lost its pager
4395 							 * while waiting.
4396 							 */
4397 							pager = copy_object->pager;
4398 							if (copy_object->named &&
4399 							    pager != MEMORY_OBJECT_NULL) {
4400 								vm_object_mapping_begin(copy_object);
4401 								vm_object_unlock(copy_object);
4402 
4403 								kr = memory_object_map(pager, pager_prot);
4404 								assert(kr == KERN_SUCCESS);
4405 
4406 								vm_object_lock(copy_object);
4407 								vm_object_mapping_end(copy_object);
4408 							}
4409 						}
4410 						vm_object_unlock(copy_object);
4411 					}
4412 
4413 					/*
4414 					 *	Perform the copy if requested
4415 					 */
4416 
4417 					if (copy && copy_object != VM_OBJECT_NULL) {
4418 						vm_object_t             new_object;
4419 						vm_object_offset_t      new_offset;
4420 
4421 						result = vm_object_copy_strategically(copy_object, copy_offset,
4422 						    copy_size,
4423 						    false,                                   /* forking */
4424 						    &new_object, &new_offset,
4425 						    &do_copy);
4426 
4427 
4428 						if (result == KERN_MEMORY_RESTART_COPY) {
4429 							boolean_t success;
4430 							boolean_t src_needs_copy;
4431 
4432 							/*
4433 							 * XXX
4434 							 * We currently ignore src_needs_copy.
4435 							 * This really is the issue of how to make
4436 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4437 							 * non-kernel users to use. Solution forthcoming.
4438 							 * In the meantime, since we don't allow non-kernel
4439 							 * memory managers to specify symmetric copy,
4440 							 * we won't run into problems here.
4441 							 */
4442 							new_object = copy_object;
4443 							new_offset = copy_offset;
4444 							success = vm_object_copy_quickly(new_object,
4445 							    new_offset,
4446 							    copy_size,
4447 							    &src_needs_copy,
4448 							    &do_copy);
4449 							assert(success);
4450 							result = KERN_SUCCESS;
4451 						}
4452 						if (result != KERN_SUCCESS) {
4453 							kr = result;
4454 							break;
4455 						}
4456 
4457 						copy_object = new_object;
4458 						copy_offset = new_offset;
4459 						/*
4460 						 * No extra object reference for the mapping:
4461 						 * the mapping should be the only thing keeping
4462 						 * this new object alive.
4463 						 */
4464 					} else {
4465 						/*
4466 						 * We already have the right object
4467 						 * to map.
4468 						 */
4469 						copy_object = VME_OBJECT(copy_entry);
4470 						/* take an extra ref for the mapping below */
4471 						vm_object_reference(copy_object);
4472 					}
4473 				}
4474 
4475 				/*
4476 				 * If the caller does not want a specific
4477 				 * tag for this new mapping:  use
4478 				 * the tag of the original mapping.
4479 				 */
4480 				vm_map_kernel_flags_t vmk_remap_flags = {
4481 					.vmkf_submap = copy_entry->is_sub_map,
4482 				};
4483 
4484 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4485 				    vm_map_kernel_flags_vmflags(vmk_flags),
4486 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4487 
4488 				/* over-map the object into destination */
4489 				vmk_remap_flags.vmf_fixed = true;
4490 				vmk_remap_flags.vmf_overwrite = true;
4491 
4492 				if (!copy && !copy_entry->is_sub_map) {
4493 					/*
4494 					 * copy-on-write should have been
4495 					 * resolved at this point, or we would
4496 					 * end up sharing instead of copying.
4497 					 */
4498 					assert(!copy_entry->needs_copy);
4499 				}
4500 #if XNU_TARGET_OS_OSX
4501 				if (copy_entry->used_for_jit) {
4502 					vmk_remap_flags.vmkf_map_jit = TRUE;
4503 				}
4504 #endif /* XNU_TARGET_OS_OSX */
4505 
4506 				kr = vm_map_enter(target_map,
4507 				    &copy_addr,
4508 				    copy_size,
4509 				    (vm_map_offset_t) 0,
4510 				    vmk_remap_flags,
4511 				    copy_object,
4512 				    copy_offset,
4513 				    ((copy_object == NULL)
4514 				    ? FALSE
4515 				    : (copy || copy_entry->needs_copy)),
4516 				    cur_protection,
4517 				    max_protection,
4518 				    inheritance);
4519 				if (kr != KERN_SUCCESS) {
4520 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4521 					if (copy_entry->is_sub_map) {
4522 						vm_map_deallocate(copy_submap);
4523 					} else {
4524 						vm_object_deallocate(copy_object);
4525 					}
4526 					/* abort */
4527 					break;
4528 				}
4529 
4530 				/* next mapping */
4531 				copy_addr += copy_size;
4532 			}
4533 
4534 			named_entry_unlock(named_entry);
4535 			if (target_copy_map != copy_map) {
4536 				vm_map_copy_discard(target_copy_map);
4537 				target_copy_map = VM_MAP_COPY_NULL;
4538 			}
4539 
4540 			if (kr == KERN_SUCCESS) {
4541 				if (overmap_start) {
4542 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4543 				}
4544 				offset_in_mapping += overmap_start;
4545 			} else if (!vmk_flags.vmf_overwrite) {
4546 				/* deallocate the contiguous range */
4547 				vm_map_remove(target_map, map_addr,
4548 				    map_addr + map_size);
4549 			}
4550 			result = kr;
4551 			goto out;
4552 		}
4553 
4554 		if (named_entry->is_object) {
4555 			unsigned int    access;
4556 			unsigned int    wimg_mode;
4557 
4558 			assert(!named_entry->is_copy);
4559 			assert(!named_entry->is_sub_map);
4560 
4561 			/* we are mapping a VM object */
4562 
4563 			access = named_entry->access;
4564 
4565 			if (vmk_flags.vmf_return_data_addr ||
4566 			    vmk_flags.vmf_return_4k_data_addr) {
4567 				offset_in_mapping = obj_offs & map_mask;
4568 				if (vmk_flags.vmf_return_4k_data_addr) {
4569 					offset_in_mapping &= ~((signed)(0xFFF));
4570 				}
4571 				obj_offs -= offset_in_mapping;
4572 				map_size  = vm_map_round_page(initial_size +
4573 				    offset_in_mapping, map_mask);
4574 			}
4575 
4576 			object = vm_named_entry_to_vm_object(named_entry);
4577 			assert(object != VM_OBJECT_NULL);
4578 			vm_object_lock(object);
4579 			named_entry_unlock(named_entry);
4580 
4581 			vm_object_reference_locked(object);
4582 
4583 			wimg_mode = object->wimg_bits;
4584 			vm_prot_to_wimg(access, &wimg_mode);
4585 			if (object->wimg_bits != wimg_mode) {
4586 				vm_object_change_wimg_mode(object, wimg_mode);
4587 			}
4588 
4589 			vm_object_unlock(object);
4590 		} else {
4591 			panic("invalid VM named entry %p", named_entry);
4592 		}
4593 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4594 		/*
4595 		 * JMM - This is temporary until we unify named entries
4596 		 * and raw memory objects.
4597 		 *
4598 		 * Detected fake ip_kotype for a memory object.  In
4599 		 * this case, the port isn't really a port at all, but
4600 		 * instead is just a raw memory object.
4601 		 */
4602 		if (vmk_flags.vmf_return_data_addr ||
4603 		    vmk_flags.vmf_return_4k_data_addr) {
4604 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4605 		}
4606 
4607 		object = memory_object_to_vm_object((memory_object_t)port);
4608 		if (object == VM_OBJECT_NULL) {
4609 			return KERN_INVALID_OBJECT;
4610 		}
4611 		vm_object_reference(object);
4612 
4613 		/* wait for object (if any) to be ready */
4614 		if (object != VM_OBJECT_NULL) {
4615 			if (is_kernel_object(object)) {
4616 				printf("Warning: Attempt to map kernel object"
4617 				    " by a non-private kernel entity\n");
4618 				return KERN_INVALID_OBJECT;
4619 			}
4620 			if (!object->pager_ready) {
4621 				vm_object_lock(object);
4622 
4623 				while (!object->pager_ready) {
4624 					vm_object_sleep(object,
4625 					    VM_OBJECT_EVENT_PAGER_READY,
4626 					    THREAD_UNINT,
4627 					    LCK_SLEEP_EXCLUSIVE);
4628 				}
4629 				vm_object_unlock(object);
4630 			}
4631 		}
4632 	} else {
4633 		return KERN_INVALID_OBJECT;
4634 	}
4635 
4636 	if (object != VM_OBJECT_NULL &&
4637 	    object->named &&
4638 	    object->pager != MEMORY_OBJECT_NULL &&
4639 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4640 		memory_object_t pager;
4641 		vm_prot_t       pager_prot;
4642 		kern_return_t   kr;
4643 
4644 		/*
4645 		 * For "named" VM objects, let the pager know that the
4646 		 * memory object is being mapped.  Some pagers need to keep
4647 		 * track of this, to know when they can reclaim the memory
4648 		 * object, for example.
4649 		 * VM calls memory_object_map() for each mapping (specifying
4650 		 * the protection of each mapping) and calls
4651 		 * memory_object_last_unmap() when all the mappings are gone.
4652 		 */
4653 		pager_prot = max_protection;
4654 		if (copy) {
4655 			/*
4656 			 * Copy-On-Write mapping: won't modify the
4657 			 * memory object.
4658 			 */
4659 			pager_prot &= ~VM_PROT_WRITE;
4660 		}
4661 		vm_object_lock(object);
4662 		pager = object->pager;
4663 		if (object->named &&
4664 		    pager != MEMORY_OBJECT_NULL &&
4665 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4666 			assert(object->pager_ready);
4667 			vm_object_mapping_wait(object, THREAD_UNINT);
4668 			/* object might have lost its pager while waiting */
4669 			pager = object->pager;
4670 			if (object->named && pager != MEMORY_OBJECT_NULL) {
4671 				vm_object_mapping_begin(object);
4672 				vm_object_unlock(object);
4673 
4674 				kr = memory_object_map(pager, pager_prot);
4675 				assert(kr == KERN_SUCCESS);
4676 
4677 				vm_object_lock(object);
4678 				vm_object_mapping_end(object);
4679 			}
4680 		}
4681 		vm_object_unlock(object);
4682 	}
4683 
4684 	/*
4685 	 *	Perform the copy if requested
4686 	 */
4687 
4688 	if (copy) {
4689 		vm_object_t             new_object;
4690 		vm_object_offset_t      new_offset;
4691 
4692 		result = vm_object_copy_strategically(object,
4693 		    obj_offs,
4694 		    map_size,
4695 		    false,                                   /* forking */
4696 		    &new_object, &new_offset,
4697 		    &copy);
4698 
4699 
4700 		if (result == KERN_MEMORY_RESTART_COPY) {
4701 			boolean_t success;
4702 			boolean_t src_needs_copy;
4703 
4704 			/*
4705 			 * XXX
4706 			 * We currently ignore src_needs_copy.
4707 			 * This really is the issue of how to make
4708 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4709 			 * non-kernel users to use. Solution forthcoming.
4710 			 * In the meantime, since we don't allow non-kernel
4711 			 * memory managers to specify symmetric copy,
4712 			 * we won't run into problems here.
4713 			 */
4714 			new_object = object;
4715 			new_offset = obj_offs;
4716 			success = vm_object_copy_quickly(new_object,
4717 			    new_offset,
4718 			    map_size,
4719 			    &src_needs_copy,
4720 			    &copy);
4721 			assert(success);
4722 			result = KERN_SUCCESS;
4723 		}
4724 		/*
4725 		 *	Throw away the reference to the
4726 		 *	original object, as it won't be mapped.
4727 		 */
4728 
4729 		vm_object_deallocate(object);
4730 
4731 		if (result != KERN_SUCCESS) {
4732 			return result;
4733 		}
4734 
4735 		object   = new_object;
4736 		obj_offs = new_offset;
4737 	}
4738 
4739 	/*
4740 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4741 	 * needs to be atomic.
4742 	 */
4743 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4744 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4745 
4746 	result = vm_map_enter(target_map,
4747 	    &map_addr, map_size,
4748 	    (vm_map_offset_t)mask,
4749 	    vmk_flags,
4750 	    object, obj_offs,
4751 	    copy,
4752 	    cur_protection, max_protection,
4753 	    inheritance);
4754 	if (result != KERN_SUCCESS) {
4755 		vm_object_deallocate(object);
4756 	}
4757 
4758 	/*
4759 	 * Try to prefault, and do not forget to release the vm map lock.
4760 	 */
4761 	if (result == KERN_SUCCESS && try_prefault) {
4762 		mach_vm_address_t va = map_addr;
4763 		kern_return_t kr = KERN_SUCCESS;
4764 		unsigned int i = 0;
4765 		int pmap_options;
4766 
4767 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4768 		if (object->internal) {
4769 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4770 		}
4771 
4772 		for (i = 0; i < page_list_count; ++i) {
4773 			if (!UPL_VALID_PAGE(page_list, i)) {
4774 				if (kernel_prefault) {
4775 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4776 					result = KERN_MEMORY_ERROR;
4777 					break;
4778 				}
4779 			} else {
4780 				/*
4781 				 * If this function call failed, we should stop
4782 				 * trying to optimize, other calls are likely
4783 				 * going to fail too.
4784 				 *
4785 				 * We are not gonna report an error for such
4786 				 * failure though. That's an optimization, not
4787 				 * something critical.
4788 				 */
4789 				kr = pmap_enter_options(target_map->pmap,
4790 				    va, UPL_PHYS_PAGE(page_list, i),
4791 				    cur_protection, VM_PROT_NONE,
4792 				    0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4793 				if (kr != KERN_SUCCESS) {
4794 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4795 					if (kernel_prefault) {
4796 						result = kr;
4797 					}
4798 					break;
4799 				}
4800 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4801 			}
4802 
4803 			/* Next virtual address */
4804 			va += PAGE_SIZE;
4805 		}
4806 		if (vmk_flags.vmkf_keep_map_locked) {
4807 			vm_map_unlock(target_map);
4808 		}
4809 	}
4810 
4811 out:
4812 	if (result == KERN_SUCCESS) {
4813 #if KASAN
4814 		if (target_map->pmap == kernel_pmap) {
4815 			kasan_notify_address(map_addr, map_size);
4816 		}
4817 #endif
4818 		*address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4819 	}
4820 	return result;
4821 }
4822 
/*
 *	vm_map_enter_mem_object_prefault:
 *
 *	Convenience wrapper around vm_map_enter_mem_object() that also
 *	supplies a UPL page list ("page_list" / "page_list_count") so that
 *	already-resident pages can be prefaulted into the new mapping.
 *	The copy argument is passed FALSE and inheritance is always
 *	VM_INHERIT_DEFAULT.
 */
kern_return_t
vm_map_enter_mem_object_prefault(
	vm_map_t                target_map,
	vm_map_offset_ut       *address,
	vm_map_size_ut          initial_size,
	vm_map_offset_ut        mask,
	vm_map_kernel_flags_t   vmk_flags,
	ipc_port_t              port,
	vm_object_offset_ut     offset,
	vm_prot_ut              cur_protection,
	vm_prot_ut              max_protection,
	upl_page_list_ptr_t     page_list,
	unsigned int            page_list_count)
{
	/* range_id is set by vm_map_enter_mem_object */
	return vm_map_enter_mem_object(target_map,
	           address,
	           initial_size,
	           mask,
	           vmk_flags,
	           port,
	           offset,
	           FALSE,
	           cur_protection,
	           max_protection,
	           VM_INHERIT_DEFAULT,
	           page_list,
	           page_list_count);
}
4852 
/*
 *	vm_map_enter_mem_object_control_sanitize:	[ internal use only ]
 *
 *	Validate and unwrap the unsafe ("_u"/"_ut") caller-supplied
 *	parameters of vm_map_enter_mem_object_control() before any of them
 *	are used.  On success, all output parameters are populated and
 *	KERN_SUCCESS is returned; otherwise the sanitizer's error code is
 *	returned (callers convert it with vm_sanitize_get_kr()).
 */
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vm_map_enter_mem_object_control_sanitize(
	vm_map_t                target_map,
	vm_map_offset_ut        address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_object_offset_ut     offset_u,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_address_t       *map_addr,
	vm_map_size_t          *map_size,
	vm_map_offset_t        *mask,
	vm_object_offset_t     *obj_offs,
	vm_object_offset_t     *obj_end,
	vm_object_size_t       *obj_size,
	vm_prot_t              *cur_protection,
	vm_prot_t              *max_protection,
	vm_inherit_t           *inheritance)
{
	kern_return_t           kr;

	/* cur/max protections must be valid and consistent with each other */
	kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
	    cur_protection, max_protection);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
	    inheritance);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}
	/*
	 * Ensure arithmetic doesn't overflow in vm_object space (kernel
	 * pages).
	 * We keep unaligned values for now. The call we eventually make to
	 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
	 * target_map pages or kernel pages. But this isn't enough to guarantee
	 * kernel space alignment.
	 */
	kr = vm_sanitize_addr_size(offset_u, initial_size_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
	    obj_offs, obj_end, obj_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/*
	 * There is no vm_sanitize_addr_size variant that also adjusts for
	 * a separate offset. Rather than create one for this one-off issue,
	 * we sanitize map_addr and map_size individually, relying on
	 * vm_sanitize_size to incorporate the offset. Then, we perform the
	 * overflow check manually below.
	 */
	*map_addr = vm_sanitize_addr(target_map, address_u);
	kr = vm_sanitize_size(offset_u, initial_size_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/*
	 * Ensure arithmetic doesn't overflow in target_map space.
	 * The computation of map_size above accounts for the possibility that
	 * offset_u might be unaligned in target_map space.
	 */
	if (vmk_flags.vmf_fixed) {
		vm_map_address_t map_end;

		/* only a fixed-address request pins map_addr; check it then */
		if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	return KERN_SUCCESS;
}
4941 
/*
 *	vm_map_enter_mem_object_control:
 *
 *	Map the VM object backing "control" into "target_map".  Similar to
 *	vm_map_enter_mem_object(), but the backing object is named directly
 *	by a memory_object_control_t rather than by a port.  On success,
 *	*address_u is updated with the chosen map address plus the
 *	sub-page offset of "offset_u" (vmf_return_data_addr behavior).
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_ut       *address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_map_kernel_flags_t   vmk_flags,
	memory_object_control_t control,
	vm_object_offset_ut     offset_u,
	boolean_t               needs_copy,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u)
{
	vm_map_offset_t         mask;
	vm_prot_t               cur_protection;
	vm_prot_t               max_protection;
	vm_inherit_t            inheritance;
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_offset_t      obj_offs, obj_end;
	vm_object_size_t        obj_size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;

	/*
	 * Check arguments for validity
	 */
	if (target_map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * We only support vmf_return_data_addr-like behavior.
	 */
	vmk_flags.vmf_return_data_addr = true;

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_enter_mem_object_control_sanitize(target_map,
	    *address_u,
	    initial_size_u,
	    mask_u,
	    offset_u,
	    cur_protection_u,
	    max_protection_u,
	    inheritance_u,
	    vmk_flags,
	    &map_addr,
	    &map_size,
	    &mask,
	    &obj_offs,
	    &obj_end,
	    &obj_size,
	    &cur_protection,
	    &max_protection,
	    &inheritance);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	if (is_kernel_object(object)) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	/* take a reference on the object for the mapping we will create */
	vm_object_lock(object);
	os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);


	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped.  Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (needs_copy) {
		/* copy-on-write mapping: won't modify the memory object */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		vm_object_mapping_wait(object, THREAD_UNINT);
		/* object might have lost its pager while waiting */
		pager = object->pager;
		if (object->named && pager != MEMORY_OBJECT_NULL) {
			vm_object_mapping_begin(object);
			vm_object_unlock(object);

			/* drop the object lock across the pager upcall */
			kr = memory_object_map(pager, pager_prot);
			assert(kr == KERN_SUCCESS);

			vm_object_lock(object);
			vm_object_mapping_end(object);
		}
	}
	vm_object_unlock(object);

	/*
	 *	Perform the copy if requested
	 */

	if (needs_copy) {
		vm_object_t             new_object;
		vm_object_offset_t      new_offset;

		result = vm_object_copy_strategically(object, obj_offs, obj_size,
		    false,                                   /* forking */
		    &new_object, &new_offset,
		    &needs_copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = obj_offs;
			success = vm_object_copy_quickly(new_object,
			    new_offset, obj_size,
			    &src_needs_copy,
			    &needs_copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 *	Throw away the reference to the
		 *	original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		/* map the copy instead of the original */
		object   = new_object;
		obj_offs = new_offset;
	}

	result = vm_map_enter(target_map,
	    &map_addr, map_size,
	    (vm_map_offset_t)mask,
	    vmk_flags,
	    object,
	    obj_offs,
	    needs_copy,
	    cur_protection, max_protection,
	    inheritance);

	if (result == KERN_SUCCESS) {
		/* report the data address: add back the sub-page offset */
		*address_u = vm_sanitize_wrap_addr(
			map_addr + (obj_offs & vm_map_page_mask(target_map)));
	} else {
		/* mapping failed: release the reference taken above */
		vm_object_deallocate(object);
	}

	return result;
}
5128 
5129 
5130 /* Not used without nested pmaps */
5131 #ifndef NO_NESTED_PMAP
5132 /*
5133  * Clip and unnest a portion of a nested submap mapping.
5134  */
5135 
5136 
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	/* remember the caller's range for diagnostics below */
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	/* only a nested-submap entry (using a shared pmap) may be unnested */
	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent. This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* the (possibly adjusted) range must lie within the entry */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/* clip the front so "entry" begins exactly at start_unnest */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		/* keep the free-space bookkeeping consistent after the split */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	/* clip the back so "entry" ends exactly at end_unnest */
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* detach the clipped span from the nested pmap */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	entry->use_pmap = FALSE;
	/* retag: the entry no longer shares the nested pmap */
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5217 #endif  /* NO_NESTED_PMAP */
5218 
/*
 *	Panic helper for attempts to clip a VM map entry marked atomic
 *	(vme_atomic): such entries must never be split.
 */
__abortlike
static void
__vm_map_clip_atomic_entry_panic(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t where)
{
	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end,
	    (uint64_t)where);
}
5232 
5233 /*
5234  *	vm_map_clip_start:	[ internal use only ]
5235  *
5236  *	Asserts that the given entry begins at or after
5237  *	the specified address; if necessary,
5238  *	it splits the entry into two.
5239  */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip.  Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically contiguous object: drop all pmap
			 * mappings for the whole entry before splitting it
			 * (presumably they cannot survive a split --
			 * vm_map_clip_end does the same).
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* atomic entries must never be clipped */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* keep the free-space bookkeeping consistent after the split */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5292 
5293 
/*
 * Clip the start of "entry" in a vm_map_copy's entry list, but only
 * when "startaddr" actually falls inside the entry.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5299 
5300 /*
5301  *	This routine is called only when it is known that
5302  *	the entry must be split.
5303  */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Split off the front portion --
	 *	note that we must insert the new
	 *	entry BEFORE this one, so that
	 *	this entry has the specified starting
	 *	address.
	 */

	/* a map-aligned entry may only be split on an aligned boundary */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* the new (front) entry starts as a full copy of the original */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the original entry's object offset past the clipped part */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	if (__improbable(start >= entry->vme_end)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
	}
	assert(start < entry->vme_end);
	entry->vme_start = start;

#if VM_BTLOG_TAGS
	/* both halves now share the tag backtrace; take an extra retain */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* the new entry needs its own reference on the submap/object */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5351 
5352 
5353 /*
5354  *	vm_map_clip_end:	[ internal use only ]
5355  *
5356  *	Asserts that the given entry ends at or before
5357  *	the specified address; if necessary,
5358  *	it splits the entry into two.
5359  */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		/* round "endaddr" up to the platform's minimum nesting size */
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically contiguous object: drop all pmap
			 * mappings for the whole entry before splitting it
			 * (same treatment as in vm_map_clip_start).
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* atomic entries must never be clipped */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* keep the free-space bookkeeping consistent after the split */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5418 
5419 
/*
 * Clip the end of "entry" in a vm_map_copy's entry list, but only
 * when "endaddr" actually falls inside the entry.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5425 
5426 /*
5427  *	This routine is called only when it is known that
5428  *	the entry must be split.
5429  */
static void
_vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Create a new entry and insert it
	 *	AFTER the specified entry
	 */

	/* a map-aligned entry may only be split on an aligned boundary */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* the new (back) entry starts as a full copy of the original */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	if (__improbable(end <= entry->vme_start)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
	}
	assert(entry->vme_start < end);
	new_entry->vme_start = entry->vme_end = end;
	/* advance the new entry's object offset past the front portion */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

#if VM_BTLOG_TAGS
	/* both halves now share the tag backtrace; take an extra retain */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry, new_entry);

	/* the new entry needs its own reference on the submap/object */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5474 
5475 
5476 /*
5477  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5478  *
5479  *	Asserts that the starting and ending region
5480  *	addresses fall within the valid range of the map.
5481  */
/*
 * Note: unlike vm_map_range_check() below, this macro CLAMPS the
 * "start" and "end" lvalues in place (it never reports an error),
 * so both arguments must be modifiable lvalues.
 */
#define VM_MAP_RANGE_CHECK(map, start, end)     \
	MACRO_BEGIN                             \
	if (start < vm_map_min(map))            \
	        start = vm_map_min(map);        \
	if (end > vm_map_max(map))              \
	        end = vm_map_max(map);          \
	if (start > end)                        \
	        start = end;                    \
	MACRO_END
5491 
5492 /*
5493  *	vm_map_range_check:	[ internal use only ]
5494  *
5495  *	Check that the region defined by the specified start and
5496  *	end addresses are wholly contained within a single map
5497  *	entry or set of adjacent map entries of the spacified map,
5498  *	i.e. the specified region contains no unmapped space.
5499  *	If any or all of the region is unmapped, FALSE is returned.
5500  *	Otherwise, TRUE is returned and if the output argument 'entry'
5501  *	is not NULL it points to the map entry containing the start
5502  *	of the region.
5503  *
5504  *	The map is locked for reading on entry and is left locked.
5505  */
5506 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5507 vm_map_range_check(
5508 	vm_map_t                map,
5509 	vm_map_offset_t         start,
5510 	vm_map_offset_t         end,
5511 	vm_map_entry_t          *entry)
5512 {
5513 	vm_map_entry_t          cur;
5514 	vm_map_offset_t         prev;
5515 
5516 	/*
5517 	 *      Basic sanity checks first
5518 	 */
5519 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5520 		return FALSE;
5521 	}
5522 
5523 	/*
5524 	 *      Check first if the region starts within a valid
5525 	 *	mapping for the map.
5526 	 */
5527 	if (!vm_map_lookup_entry(map, start, &cur)) {
5528 		return FALSE;
5529 	}
5530 
5531 	/*
5532 	 *	Optimize for the case that the region is contained
5533 	 *	in a single map entry.
5534 	 */
5535 	if (entry != (vm_map_entry_t *) NULL) {
5536 		*entry = cur;
5537 	}
5538 	if (end <= cur->vme_end) {
5539 		return TRUE;
5540 	}
5541 
5542 	/*
5543 	 *      If the region is not wholly contained within a
5544 	 *      single entry, walk the entries looking for holes.
5545 	 */
5546 	prev = cur->vme_end;
5547 	cur = cur->vme_next;
5548 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5549 		if (end <= cur->vme_end) {
5550 			return TRUE;
5551 		}
5552 		prev = cur->vme_end;
5553 		cur = cur->vme_next;
5554 	}
5555 	return FALSE;
5556 }
5557 
5558 static __attribute__((always_inline, warn_unused_result))
5559 kern_return_t
vm_map_protect_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut new_prot_u,vm_map_offset_t * start,vm_map_offset_t * end,vm_prot_t * new_prot)5560 vm_map_protect_sanitize(
5561 	vm_map_t                map,
5562 	vm_map_offset_ut        start_u,
5563 	vm_map_offset_ut        end_u,
5564 	vm_prot_ut              new_prot_u,
5565 	vm_map_offset_t        *start,
5566 	vm_map_offset_t        *end,
5567 	vm_prot_t              *new_prot)
5568 {
5569 	kern_return_t           kr;
5570 	vm_map_size_t           size;
5571 
5572 	kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5573 	    map, VM_PROT_COPY, new_prot);
5574 	if (__improbable(kr != KERN_SUCCESS)) {
5575 		return kr;
5576 	}
5577 
5578 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5579 	    map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
5580 	if (__improbable(kr != KERN_SUCCESS)) {
5581 		return kr;
5582 	}
5583 
5584 	return KERN_SUCCESS;
5585 }
5586 
5587 /*
5588  *	vm_map_protect:
5589  *
5590  *	Sets the protection of the specified address
5591  *	region in the target map.  If "set_max" is
5592  *	specified, the maximum protection is to be set;
5593  *	otherwise, only the current protection is affected.
5594  */
kern_return_t
vm_map_protect(
	vm_map_t                map,
	vm_map_offset_ut        start_u,
	vm_map_offset_ut        end_u,
	boolean_t               set_max,
	vm_prot_ut              new_prot_u)
{
	vm_map_entry_t                  current;
	vm_map_offset_t                 prev;
	vm_map_entry_t                  entry;
	vm_prot_t                       new_prot;
	vm_prot_t                       new_max;
	int                             pmap_options = 0;
	kern_return_t                   kr;
	vm_map_offset_t                 start, original_start;
	vm_map_offset_t                 end;

	/* Validate/copy-in the unsafe arguments before touching the map. */
	kr = vm_map_protect_sanitize(map,
	    start_u,
	    end_u,
	    new_prot_u,
	    &start,
	    &end,
	    &new_prot);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}
	original_start = start;

	/*
	 * VM_PROT_COPY requests a copy-on-write remapping of the range
	 * before the protection change is applied.
	 */
	if (new_prot & VM_PROT_COPY) {
		vm_map_offset_t         new_start;
		vm_prot_t               cur_prot, max_prot;
		vm_map_kernel_flags_t   kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * Refuse a W+X request up front when code-signing
		 * enforcement applies and policy says to fail (rather
		 * than silently strip the exec bits later).
		 */
		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)start,
			    (uint64_t)end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
		kflags.vmkf_remap_prot_copy = true;
		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
		new_start = start;
		kr = vm_map_remap(map,
		    vm_sanitize_wrap_addr_ref(&new_start),
		    end - start,
		    0, /* mask */
		    kflags,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
		    vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);
restart_after_unlock:

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 *      Lookup the entry.  If it doesn't start in a valid
		 *	entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	/* Superpage entries can only be protected as a whole. */
	if (entry->superpage_size) {
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 *	Make a first pass to check for protection and address
	 *	violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#elif CODE_SIGNING_MONITOR
		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
			new_max |= VM_PROT_EXECUTE;
		}
#endif
		/* The requested protection must not exceed the maximum. */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

#if __arm64e__
		/* Disallow protecting hw assisted TPRO mappings */
		if (current->used_for_tpro) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}
#endif /* __arm64e__ */


		/*
		 * W+X on a non-JIT mapping under code-signing
		 * enforcement: log it, strip the exec bits, and fail
		 * outright if policy demands it.
		 */
		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
		    !(current->used_for_jit)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)current->vme_start,
			    (uint64_t)current->vme_end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			new_prot &= ~VM_PROT_ALLEXEC;
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* The walk above must have covered the range all the way to "end". */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 *	Go back and fix up protections.
	 *	Clip to start here if the range starts within
	 *	the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t       old_prot;

		if (current->in_transition) {
			wait_result_t wait_result;
			vm_map_offset_t current_start;

			/*
			 * Another thread is wiring/unwiring this entry.
			 * Let the other thread know we are waiting.
			 */
			current_start = current->vme_start;
			current->needs_wakeup = true;
			/* wait for the other thread to be done */
			wait_result = vm_map_entry_wait(map, TH_UNINT);
			/*
			 * We unlocked the map, so anything could have changed in the
			 * range and we need to re-check from "current_start" to "end".
			 * Our entries might no longer be valid.
			 */
			current = NULL;
			entry = NULL;
			/*
			 * Re-lookup and re-clip "current_start".
			 * If it's no longer mapped, "current" will be the
			 * next entry (or the map header).
			 */
			vm_map_lookup_entry_or_next(map, current_start, &current);
			if (current != vm_map_to_entry(map)) {
				vm_map_clip_start(map, current, current_start);
			}
			/* restart from this point */
			start = current_start;
			goto restart_after_unlock;
		}

		vm_map_clip_end(map, current, end);

#if DEVELOPMENT || DEBUG
		if (current->csm_associated && vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    (uint64_t)start,
			    (uint64_t)end,
			    new_prot,
			    map, current,
			    current->vme_start,
			    current->vme_end,
			    current->protection,
			    current->max_protection);
		}
#endif /* DEVELOPMENT || DEBUG */

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

#if CODE_SIGNING_MONITOR
		if (!current->vme_xnu_user_debug &&
		    /* a !csm_associated mapping becoming executable */
		    ((!current->csm_associated &&
		    !(old_prot & VM_PROT_EXECUTE) &&
		    (current->protection & VM_PROT_EXECUTE))
		    ||
		    /* a csm_associated mapping becoming writable */
		    (current->csm_associated &&
		    !(old_prot & VM_PROT_WRITE) &&
		    (current->protection & VM_PROT_WRITE)))) {
			/*
			 * This mapping has not already been marked as
			 * "user_debug" and it is either:
			 * 1. not code-signing-monitored and becoming executable
			 * 2. code-signing-monitored and becoming writable,
			 * so inform the CodeSigningMonitor and mark the
			 * mapping as "user_debug" if appropriate.
			 */
			vm_map_kernel_flags_t vmk_flags;
			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
			/* pretend it's a vm_protect(VM_PROT_COPY)... */
			vmk_flags.vmkf_remap_prot_copy = true;
			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
#if DEVELOPMENT || DEBUG
			if (vm_log_xnu_user_debug) {
				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
				    __FUNCTION__, __LINE__,
				    map, current,
				    current->vme_start, current->vme_end,
				    old_prot, current->protection,
				    kr, current->vme_xnu_user_debug);
			}
#endif /* DEVELOPMENT || DEBUG */
		}
#endif /* CODE_SIGNING_MONITOR */

		/*
		 *	Update physical map if necessary.
		 *	If the request is to turn off write protection,
		 *	we won't do it for real (in pmap). This is because
		 *	it would cause copy-on-write to fail.  We've already
		 *	set the new protection in the map, so if a
		 *	write-protect fault occurred, it will be fixed up
		 *	properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in; we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
				if (prot & VM_PROT_WRITE) {
					/*
					 * For write requests on the
					 * compressor, we will ask the
					 * pmap layer to prevent us from
					 * taking a write fault when we
					 * attempt to access the mapping
					 * next.
					 */
					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
				}
			}

			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect.  Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	if (entry == VM_MAP_ENTRY_NULL) {
		/*
		 * Re-lookup the original start of our range.
		 * If it's no longer mapped, start with the next mapping.
		 */
		vm_map_lookup_entry_or_next(map, original_start, &entry);
	}
	/* Coalesce entries that the clipping above may have split. */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6090 
6091 static __attribute__((always_inline, warn_unused_result))
6092 kern_return_t
vm_map_inherit_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_inherit_ut new_inheritance_u,vm_map_offset_t * start,vm_map_offset_t * end,vm_inherit_t * new_inheritance)6093 vm_map_inherit_sanitize(
6094 	vm_map_t                        map,
6095 	vm_map_offset_ut                start_u,
6096 	vm_map_offset_ut                end_u,
6097 	vm_inherit_ut                   new_inheritance_u,
6098 	vm_map_offset_t                *start,
6099 	vm_map_offset_t                *end,
6100 	vm_inherit_t                   *new_inheritance)
6101 {
6102 	kern_return_t   kr;
6103 	vm_map_size_t   size;
6104 
6105 	kr = vm_sanitize_inherit(new_inheritance_u,
6106 	    VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6107 	if (__improbable(kr != KERN_SUCCESS)) {
6108 		return kr;
6109 	}
6110 
6111 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6112 	    map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
6113 	if (__improbable(kr != KERN_SUCCESS)) {
6114 		return kr;
6115 	}
6116 
6117 	return KERN_SUCCESS;
6118 }
6119 
6120 /*
6121  *	vm_map_inherit:
6122  *
6123  *	Sets the inheritance of the specified address
6124  *	range in the target map.  Inheritance
6125  *	affects how the map will be shared with
6126  *	child maps at the time of vm_map_fork.
6127  */
kern_return_t
vm_map_inherit(
	vm_map_t                        map,
	vm_map_offset_ut                start_u,
	vm_map_offset_ut                end_u,
	vm_inherit_ut                   new_inheritance_u)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;
	kern_return_t   kr;
	vm_map_offset_t start;
	vm_map_offset_t end;
	vm_inherit_t    new_inheritance;

	/* Validate/copy-in the unsafe arguments before taking the lock. */
	kr = vm_map_inherit_sanitize(map,
	    start_u,
	    end_u,
	    new_inheritance_u,
	    &start,
	    &end,
	    &new_inheritance);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	vm_map_lock(map);

	VM_MAP_RANGE_CHECK(map, start, end);

	if (vm_map_lookup_entry(map, start, &temp_entry)) {
		entry = temp_entry;
	} else {
		/*
		 * "start" itself is not mapped: on a miss the lookup
		 * leaves temp_entry at the preceding entry, so advance
		 * to the first entry above "start".
		 */
		temp_entry = temp_entry->vme_next;
		entry = temp_entry;
	}

	/* first check entire range for submaps which can't support the */
	/* given inheritance. */
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->is_sub_map) {
			if (new_inheritance == VM_INHERIT_COPY) {
				vm_map_unlock(map);
				return KERN_INVALID_ARGUMENT;
			}
		}

		entry = entry->vme_next;
	}

	/* Second pass: clip to the range and apply the new inheritance. */
	entry = temp_entry;
	if (entry != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, entry, start);
	}

	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		vm_map_clip_end(map, entry, end);
		if (entry->is_sub_map) {
			/* clip did unnest if needed */
			assert(!entry->use_pmap);
		}

		entry->inheritance = new_inheritance;

		entry = entry->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6198 
6199 /*
6200  * Update the accounting for the amount of wired memory in this map.  If the user has
6201  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6202  */
6203 
static kern_return_t
add_wire_counts(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       user_wire)
{
	vm_map_size_t   size;

	/* Remember whether this entry had no wires at all on entry. */
	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;

	if (user_wire) {
		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;

		/*
		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
		 * this map entry.
		 */

		if (entry->user_wired_count == 0) {
			size = entry->vme_end - entry->vme_start;

			/*
			 * Since this is the first time the user is wiring this map entry, check to see if we're
			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
			 * limit, then we fail.
			 */

			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
				/* Attribute the failure to whichever limit was hit, for statistics. */
				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
				} else {
					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
				}
				return KERN_RESOURCE_SHORTAGE;
			}

			/*
			 * The first time the user wires an entry, we also increment the wired_count and add this to
			 * the total that has been wired in the map.
			 */

			if (entry->wired_count >= MAX_WIRE_COUNT) {
				return KERN_FAILURE;
			}

			entry->wired_count++;
			map->user_wire_size += size;
		}

		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
			return KERN_FAILURE;
		}

		entry->user_wired_count++;
	} else {
		/*
		 * The kernel's wiring the memory.  Just bump the count and continue.
		 */

		if (entry->wired_count >= MAX_WIRE_COUNT) {
			panic("vm_map_wire: too many wirings");
		}

		entry->wired_count++;
	}

	/* Record a wiring backtrace when this is the entry's first wire. */
	if (first_wire) {
		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
	}

	return KERN_SUCCESS;
}
6289 
6290 /*
6291  * Update the memory wiring accounting now that the given map entry is being unwired.
6292  */
6293 
6294 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6295 subtract_wire_counts(
6296 	vm_map_t        map,
6297 	vm_map_entry_t  entry,
6298 	boolean_t       user_wire)
6299 {
6300 	if (user_wire) {
6301 		/*
6302 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6303 		 */
6304 
6305 		if (entry->user_wired_count == 1) {
6306 			/*
6307 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6308 			 * user wired memory for this map.
6309 			 */
6310 
6311 			assert(entry->wired_count >= 1);
6312 			entry->wired_count--;
6313 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6314 		}
6315 
6316 		assert(entry->user_wired_count >= 1);
6317 		entry->user_wired_count--;
6318 	} else {
6319 		/*
6320 		 * The kernel is unwiring the memory.   Just update the count.
6321 		 */
6322 
6323 		assert(entry->wired_count >= 1);
6324 		entry->wired_count--;
6325 	}
6326 
6327 	vme_btref_consider_and_put(entry);
6328 }
6329 
6330 int cs_executable_wire = 0;
6331 
6332 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6333 vm_map_wire_nested(
6334 	vm_map_t                map,
6335 	vm_map_offset_t         start,
6336 	vm_map_offset_t         end,
6337 	vm_prot_t               caller_prot,
6338 	vm_tag_t                tag,
6339 	boolean_t               user_wire,
6340 	pmap_t                  map_pmap,
6341 	vm_map_offset_t         pmap_addr,
6342 	ppnum_t                *physpage_p)
6343 {
6344 	vm_map_entry_t          entry;
6345 	vm_prot_t               access_type;
6346 	struct vm_map_entry     *first_entry, tmp_entry;
6347 	vm_map_t                real_map;
6348 	vm_map_offset_t         s, e;
6349 	kern_return_t           rc;
6350 	boolean_t               need_wakeup;
6351 	boolean_t               main_map = FALSE;
6352 	wait_interrupt_t        interruptible_state;
6353 	thread_t                cur_thread;
6354 	unsigned int            last_timestamp;
6355 	vm_map_size_t           size;
6356 	boolean_t               wire_and_extract;
6357 	vm_prot_t               extra_prots;
6358 
6359 	extra_prots = VM_PROT_COPY;
6360 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6361 #if XNU_TARGET_OS_OSX
6362 	if (map->pmap == kernel_pmap ||
6363 	    !vm_map_cs_enforcement(map)) {
6364 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6365 	}
6366 #endif /* XNU_TARGET_OS_OSX */
6367 #if CODE_SIGNING_MONITOR
6368 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6369 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6370 	}
6371 #endif /* CODE_SIGNING_MONITOR */
6372 
6373 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6374 
6375 	wire_and_extract = FALSE;
6376 	if (physpage_p != NULL) {
6377 		/*
6378 		 * The caller wants the physical page number of the
6379 		 * wired page.  We return only one physical page number
6380 		 * so this works for only one page at a time.
6381 		 *
6382 		 * The only caller (vm_map_wire_and_extract)
6383 		 * guarantees it.
6384 		 */
6385 		assert(end - start == VM_MAP_PAGE_SIZE(map));
6386 		wire_and_extract = TRUE;
6387 		*physpage_p = 0;
6388 	}
6389 
6390 	VM_MAP_RANGE_CHECK(map, start, end);
6391 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6392 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6393 	if (start == end) {
6394 		/* We wired what the caller asked for, zero pages */
6395 		return KERN_SUCCESS;
6396 	}
6397 
6398 	vm_map_lock(map);
6399 	if (map_pmap == NULL) {
6400 		main_map = TRUE;
6401 	}
6402 	last_timestamp = map->timestamp;
6403 
6404 	need_wakeup = FALSE;
6405 	cur_thread = current_thread();
6406 
6407 	s = start;
6408 	rc = KERN_SUCCESS;
6409 
6410 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6411 		entry = first_entry;
6412 		/*
6413 		 * vm_map_clip_start will be done later.
6414 		 * We don't want to unnest any nested submaps here !
6415 		 */
6416 	} else {
6417 		/* Start address is not in map */
6418 		rc = KERN_INVALID_ADDRESS;
6419 		goto done;
6420 	}
6421 
6422 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6423 		/*
6424 		 * At this point, we have wired from "start" to "s".
6425 		 * We still need to wire from "s" to "end".
6426 		 *
6427 		 * "entry" hasn't been clipped, so it could start before "s"
6428 		 * and/or end after "end".
6429 		 */
6430 
6431 		/* "e" is how far we want to wire in this entry */
6432 		e = entry->vme_end;
6433 		if (e > end) {
6434 			e = end;
6435 		}
6436 
6437 		/*
6438 		 * If another thread is wiring/unwiring this entry then
6439 		 * block after informing other thread to wake us up.
6440 		 */
6441 		if (entry->in_transition) {
6442 			wait_result_t wait_result;
6443 
6444 			/*
6445 			 * We have not clipped the entry.  Make sure that
6446 			 * the start address is in range so that the lookup
6447 			 * below will succeed.
6448 			 * "s" is the current starting point: we've already
6449 			 * wired from "start" to "s" and we still have
6450 			 * to wire from "s" to "end".
6451 			 */
6452 
6453 			entry->needs_wakeup = TRUE;
6454 
6455 			/*
6456 			 * wake up anybody waiting on entries that we have
6457 			 * already wired.
6458 			 */
6459 			if (need_wakeup) {
6460 				vm_map_entry_wakeup(map);
6461 				need_wakeup = FALSE;
6462 			}
6463 			/*
6464 			 * User wiring is interruptible
6465 			 */
6466 			wait_result = vm_map_entry_wait(map,
6467 			    (user_wire) ? THREAD_ABORTSAFE :
6468 			    THREAD_UNINT);
6469 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6470 				/*
6471 				 * undo the wirings we have done so far
6472 				 * We do not clear the needs_wakeup flag,
6473 				 * because we cannot tell if we were the
6474 				 * only one waiting.
6475 				 */
6476 				rc = KERN_FAILURE;
6477 				goto done;
6478 			}
6479 
6480 			/*
6481 			 * Cannot avoid a lookup here. reset timestamp.
6482 			 */
6483 			last_timestamp = map->timestamp;
6484 
6485 			/*
6486 			 * The entry could have been clipped, look it up again.
6487 			 * Worse that can happen is, it may not exist anymore.
6488 			 */
6489 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6490 				/*
6491 				 * User: undo everything upto the previous
6492 				 * entry.  let vm_map_unwire worry about
6493 				 * checking the validity of the range.
6494 				 */
6495 				rc = KERN_FAILURE;
6496 				goto done;
6497 			}
6498 			entry = first_entry;
6499 			continue;
6500 		}
6501 
6502 		if (entry->is_sub_map) {
6503 			vm_map_offset_t sub_start;
6504 			vm_map_offset_t sub_end;
6505 			vm_map_offset_t local_start;
6506 			vm_map_offset_t local_end;
6507 			pmap_t          pmap;
6508 
6509 			if (wire_and_extract) {
6510 				/*
6511 				 * Wiring would result in copy-on-write
6512 				 * which would not be compatible with
6513 				 * the sharing we have with the original
6514 				 * provider of this memory.
6515 				 */
6516 				rc = KERN_INVALID_ARGUMENT;
6517 				goto done;
6518 			}
6519 
6520 			vm_map_clip_start(map, entry, s);
6521 			vm_map_clip_end(map, entry, end);
6522 
6523 			sub_start = VME_OFFSET(entry);
6524 			sub_end = entry->vme_end;
6525 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6526 
6527 			local_end = entry->vme_end;
6528 			if (map_pmap == NULL) {
6529 				vm_object_t             object;
6530 				vm_object_offset_t      offset;
6531 				vm_prot_t               prot;
6532 				boolean_t               wired;
6533 				vm_map_entry_t          local_entry;
6534 				vm_map_version_t         version;
6535 				vm_map_t                lookup_map;
6536 
6537 				if (entry->use_pmap) {
6538 					pmap = VME_SUBMAP(entry)->pmap;
6539 					/* ppc implementation requires that */
6540 					/* submaps pmap address ranges line */
6541 					/* up with parent map */
6542 #ifdef notdef
6543 					pmap_addr = sub_start;
6544 #endif
6545 					pmap_addr = s;
6546 				} else {
6547 					pmap = map->pmap;
6548 					pmap_addr = s;
6549 				}
6550 
6551 				if (entry->wired_count) {
6552 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6553 						goto done;
6554 					}
6555 
6556 					/*
6557 					 * The map was not unlocked:
6558 					 * no need to goto re-lookup.
6559 					 * Just go directly to next entry.
6560 					 */
6561 					entry = entry->vme_next;
6562 					s = entry->vme_start;
6563 					continue;
6564 				}
6565 
6566 				/* call vm_map_lookup_and_lock_object to */
6567 				/* cause any needs copy to be   */
6568 				/* evaluated */
6569 				local_start = entry->vme_start;
6570 				lookup_map = map;
6571 				vm_map_lock_write_to_read(map);
6572 				rc = vm_map_lookup_and_lock_object(
6573 					&lookup_map, local_start,
6574 					(access_type | extra_prots),
6575 					OBJECT_LOCK_EXCLUSIVE,
6576 					&version, &object,
6577 					&offset, &prot, &wired,
6578 					NULL,
6579 					&real_map, NULL);
6580 				if (rc != KERN_SUCCESS) {
6581 					vm_map_unlock_read(lookup_map);
6582 					assert(map_pmap == NULL);
6583 					vm_map_unwire_nested(map, start,
6584 					    s, user_wire, PMAP_NULL, 0);
6585 					return rc;
6586 				}
6587 				vm_object_unlock(object);
6588 				if (real_map != lookup_map) {
6589 					vm_map_unlock(real_map);
6590 				}
6591 				vm_map_unlock_read(lookup_map);
6592 				vm_map_lock(map);
6593 
6594 				/* we unlocked, so must re-lookup */
6595 				if (!vm_map_lookup_entry(map,
6596 				    local_start,
6597 				    &local_entry)) {
6598 					rc = KERN_FAILURE;
6599 					goto done;
6600 				}
6601 
6602 				/*
6603 				 * entry could have been "simplified",
6604 				 * so re-clip
6605 				 */
6606 				entry = local_entry;
6607 				assert(s == local_start);
6608 				vm_map_clip_start(map, entry, s);
6609 				vm_map_clip_end(map, entry, end);
6610 				/* re-compute "e" */
6611 				e = entry->vme_end;
6612 				if (e > end) {
6613 					e = end;
6614 				}
6615 
6616 				/* did we have a change of type? */
6617 				if (!entry->is_sub_map) {
6618 					last_timestamp = map->timestamp;
6619 					continue;
6620 				}
6621 			} else {
6622 				local_start = entry->vme_start;
6623 				pmap = map_pmap;
6624 			}
6625 
6626 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6627 				goto done;
6628 			}
6629 
6630 			entry->in_transition = TRUE;
6631 
6632 			vm_map_unlock(map);
6633 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6634 			    sub_start, sub_end,
6635 			    caller_prot, tag,
6636 			    user_wire, pmap, pmap_addr,
6637 			    NULL);
6638 			vm_map_lock(map);
6639 
6640 			/*
6641 			 * Find the entry again.  It could have been clipped
6642 			 * after we unlocked the map.
6643 			 */
6644 			if (!vm_map_lookup_entry(map, local_start,
6645 			    &first_entry)) {
6646 				panic("vm_map_wire: re-lookup failed");
6647 			}
6648 			entry = first_entry;
6649 
6650 			assert(local_start == s);
6651 			/* re-compute "e" */
6652 			e = entry->vme_end;
6653 			if (e > end) {
6654 				e = end;
6655 			}
6656 
6657 			last_timestamp = map->timestamp;
6658 			while ((entry != vm_map_to_entry(map)) &&
6659 			    (entry->vme_start < e)) {
6660 				assert(entry->in_transition);
6661 				entry->in_transition = FALSE;
6662 				if (entry->needs_wakeup) {
6663 					entry->needs_wakeup = FALSE;
6664 					need_wakeup = TRUE;
6665 				}
6666 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6667 					subtract_wire_counts(map, entry, user_wire);
6668 				}
6669 				entry = entry->vme_next;
6670 			}
6671 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6672 				goto done;
6673 			}
6674 
6675 			/* no need to relookup again */
6676 			s = entry->vme_start;
6677 			continue;
6678 		}
6679 
6680 		/*
6681 		 * If this entry is already wired then increment
6682 		 * the appropriate wire reference count.
6683 		 */
6684 		if (entry->wired_count) {
6685 			if ((entry->protection & access_type) != access_type) {
6686 				/* found a protection problem */
6687 
6688 				/*
6689 				 * XXX FBDP
6690 				 * We should always return an error
6691 				 * in this case but since we didn't
6692 				 * enforce it before, let's do
6693 				 * it only for the new "wire_and_extract"
6694 				 * code path for now...
6695 				 */
6696 				if (wire_and_extract) {
6697 					rc = KERN_PROTECTION_FAILURE;
6698 					goto done;
6699 				}
6700 			}
6701 
6702 			/*
6703 			 * entry is already wired down, get our reference
6704 			 * after clipping to our range.
6705 			 */
6706 			vm_map_clip_start(map, entry, s);
6707 			vm_map_clip_end(map, entry, end);
6708 
6709 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6710 				goto done;
6711 			}
6712 
6713 			if (wire_and_extract) {
6714 				vm_object_t             object;
6715 				vm_object_offset_t      offset;
6716 				vm_page_t               m;
6717 
6718 				/*
6719 				 * We don't have to "wire" the page again
6720 				 * bit we still have to "extract" its
6721 				 * physical page number, after some sanity
6722 				 * checks.
6723 				 */
6724 				assert((entry->vme_end - entry->vme_start)
6725 				    == PAGE_SIZE);
6726 				assert(!entry->needs_copy);
6727 				assert(!entry->is_sub_map);
6728 				assert(VME_OBJECT(entry));
6729 				if (((entry->vme_end - entry->vme_start)
6730 				    != PAGE_SIZE) ||
6731 				    entry->needs_copy ||
6732 				    entry->is_sub_map ||
6733 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6734 					rc = KERN_INVALID_ARGUMENT;
6735 					goto done;
6736 				}
6737 
6738 				object = VME_OBJECT(entry);
6739 				offset = VME_OFFSET(entry);
6740 				/* need exclusive lock to update m->dirty */
6741 				if (entry->protection & VM_PROT_WRITE) {
6742 					vm_object_lock(object);
6743 				} else {
6744 					vm_object_lock_shared(object);
6745 				}
6746 				m = vm_page_lookup(object, offset);
6747 				assert(m != VM_PAGE_NULL);
6748 				assert(VM_PAGE_WIRED(m));
6749 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6750 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6751 					if (entry->protection & VM_PROT_WRITE) {
6752 						vm_object_lock_assert_exclusive(
6753 							object);
6754 						m->vmp_dirty = TRUE;
6755 					}
6756 				} else {
6757 					/* not already wired !? */
6758 					*physpage_p = 0;
6759 				}
6760 				vm_object_unlock(object);
6761 			}
6762 
6763 			/* map was not unlocked: no need to relookup */
6764 			entry = entry->vme_next;
6765 			s = entry->vme_start;
6766 			continue;
6767 		}
6768 
6769 		/*
6770 		 * Unwired entry or wire request transmitted via submap
6771 		 */
6772 
6773 		/*
6774 		 * Wiring would copy the pages to the shadow object.
6775 		 * The shadow object would not be code-signed so
6776 		 * attempting to execute code from these copied pages
6777 		 * would trigger a code-signing violation.
6778 		 */
6779 
6780 		if ((entry->protection & VM_PROT_EXECUTE)
6781 #if XNU_TARGET_OS_OSX
6782 		    &&
6783 		    map->pmap != kernel_pmap &&
6784 		    (vm_map_cs_enforcement(map)
6785 #if __arm64__
6786 		    || !VM_MAP_IS_EXOTIC(map)
6787 #endif /* __arm64__ */
6788 		    )
6789 #endif /* XNU_TARGET_OS_OSX */
6790 #if CODE_SIGNING_MONITOR
6791 		    &&
6792 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6793 #endif
6794 		    ) {
6795 #if MACH_ASSERT
6796 			printf("pid %d[%s] wiring executable range from "
6797 			    "0x%llx to 0x%llx: rejected to preserve "
6798 			    "code-signing\n",
6799 			    proc_selfpid(),
6800 			    (get_bsdtask_info(current_task())
6801 			    ? proc_name_address(get_bsdtask_info(current_task()))
6802 			    : "?"),
6803 			    (uint64_t) entry->vme_start,
6804 			    (uint64_t) entry->vme_end);
6805 #endif /* MACH_ASSERT */
6806 			DTRACE_VM2(cs_executable_wire,
6807 			    uint64_t, (uint64_t)entry->vme_start,
6808 			    uint64_t, (uint64_t)entry->vme_end);
6809 			cs_executable_wire++;
6810 			rc = KERN_PROTECTION_FAILURE;
6811 			goto done;
6812 		}
6813 
6814 		/*
6815 		 * Perform actions of vm_map_lookup that need the write
6816 		 * lock on the map: create a shadow object for a
6817 		 * copy-on-write region, or an object for a zero-fill
6818 		 * region.
6819 		 */
6820 		size = entry->vme_end - entry->vme_start;
6821 		/*
6822 		 * If wiring a copy-on-write page, we need to copy it now
6823 		 * even if we're only (currently) requesting read access.
6824 		 * This is aggressive, but once it's wired we can't move it.
6825 		 */
6826 		if (entry->needs_copy) {
6827 			if (wire_and_extract) {
6828 				/*
6829 				 * We're supposed to share with the original
6830 				 * provider so should not be "needs_copy"
6831 				 */
6832 				rc = KERN_INVALID_ARGUMENT;
6833 				goto done;
6834 			}
6835 
6836 			VME_OBJECT_SHADOW(entry, size,
6837 			    vm_map_always_shadow(map));
6838 			entry->needs_copy = FALSE;
6839 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6840 			if (wire_and_extract) {
6841 				/*
6842 				 * We're supposed to share with the original
6843 				 * provider so should already have an object.
6844 				 */
6845 				rc = KERN_INVALID_ARGUMENT;
6846 				goto done;
6847 			}
6848 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6849 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6850 			assert(entry->use_pmap);
6851 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6852 			if (wire_and_extract) {
6853 				/*
6854 				 * We're supposed to share with the original
6855 				 * provider so should not be COPY_SYMMETRIC.
6856 				 */
6857 				rc = KERN_INVALID_ARGUMENT;
6858 				goto done;
6859 			}
6860 			/*
6861 			 * Force an unrequested "copy-on-write" but only for
6862 			 * the range we're wiring.
6863 			 */
6864 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6865 			vm_map_clip_start(map, entry, s);
6866 			vm_map_clip_end(map, entry, end);
6867 			/* recompute "size" */
6868 			size = entry->vme_end - entry->vme_start;
6869 			/* make a shadow object */
6870 			vm_object_t orig_object;
6871 			vm_object_offset_t orig_offset;
6872 			orig_object = VME_OBJECT(entry);
6873 			orig_offset = VME_OFFSET(entry);
6874 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6875 			if (VME_OBJECT(entry) != orig_object) {
6876 				/*
6877 				 * This mapping has not been shared (or it would be
6878 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6879 				 * not been copied-on-write (or it would be marked
6880 				 * as "needs_copy" and would have been handled above
6881 				 * and also already write-protected).
6882 				 * We still need to write-protect here to prevent
6883 				 * other threads from modifying these pages while
6884 				 * we're in the process of copying and wiring
6885 				 * the copied pages.
6886 				 * Since the mapping is neither shared nor COWed,
6887 				 * we only need to write-protect the PTEs for this
6888 				 * mapping.
6889 				 */
6890 				vm_object_pmap_protect(orig_object,
6891 				    orig_offset,
6892 				    size,
6893 				    map->pmap,
6894 				    VM_MAP_PAGE_SIZE(map),
6895 				    entry->vme_start,
6896 				    entry->protection & ~VM_PROT_WRITE);
6897 			}
6898 		}
6899 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6900 			/*
6901 			 * Make the object COPY_DELAY to get a stable object
6902 			 * to wire.
6903 			 * That should avoid creating long shadow chains while
6904 			 * wiring/unwiring the same range repeatedly.
6905 			 * That also prevents part of the object from being
6906 			 * wired while another part is "needs_copy", which
6907 			 * could result in conflicting rules wrt copy-on-write.
6908 			 */
6909 			vm_object_t object;
6910 
6911 			object = VME_OBJECT(entry);
6912 			vm_object_lock(object);
6913 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6914 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
6915 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
6916 				    object, (uint64_t)object->vo_size,
6917 				    entry,
6918 				    (uint64_t)entry->vme_start,
6919 				    (uint64_t)entry->vme_end,
6920 				    (uint64_t)VME_OFFSET(entry),
6921 				    (uint64_t)size);
6922 				assertf(os_ref_get_count_raw(&object->ref_count) == 1,
6923 				    "object %p ref_count %d\n",
6924 				    object, os_ref_get_count_raw(&object->ref_count));
6925 				assertf(!entry->needs_copy,
6926 				    "entry %p\n", entry);
6927 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6928 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
6929 			}
6930 			vm_object_unlock(object);
6931 		}
6932 
6933 		vm_map_clip_start(map, entry, s);
6934 		vm_map_clip_end(map, entry, end);
6935 
6936 		/* re-compute "e" */
6937 		e = entry->vme_end;
6938 		if (e > end) {
6939 			e = end;
6940 		}
6941 
6942 		/*
6943 		 * Check for holes and protection mismatch.
6944 		 * Holes: Next entry should be contiguous unless this
6945 		 *	  is the end of the region.
6946 		 * Protection: Access requested must be allowed, unless
6947 		 *	wiring is by protection class
6948 		 */
6949 		if ((entry->vme_end < end) &&
6950 		    ((entry->vme_next == vm_map_to_entry(map)) ||
6951 		    (entry->vme_next->vme_start > entry->vme_end))) {
6952 			/* found a hole */
6953 			rc = KERN_INVALID_ADDRESS;
6954 			goto done;
6955 		}
6956 		if ((entry->protection & access_type) != access_type) {
6957 			/* found a protection problem */
6958 			rc = KERN_PROTECTION_FAILURE;
6959 			goto done;
6960 		}
6961 
6962 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6963 
6964 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6965 			goto done;
6966 		}
6967 
6968 		entry->in_transition = TRUE;
6969 
6970 		/*
6971 		 * This entry might get split once we unlock the map.
6972 		 * In vm_fault_wire(), we need the current range as
6973 		 * defined by this entry.  In order for this to work
6974 		 * along with a simultaneous clip operation, we make a
6975 		 * temporary copy of this entry and use that for the
6976 		 * wiring.  Note that the underlying objects do not
6977 		 * change during a clip.
6978 		 */
6979 		tmp_entry = *entry;
6980 
6981 		/*
6982 		 * The in_transition state guarentees that the entry
6983 		 * (or entries for this range, if split occured) will be
6984 		 * there when the map lock is acquired for the second time.
6985 		 */
6986 		vm_map_unlock(map);
6987 
6988 		if (!user_wire && cur_thread != THREAD_NULL) {
6989 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
6990 		} else {
6991 			interruptible_state = THREAD_UNINT;
6992 		}
6993 
6994 		if (map_pmap) {
6995 			rc = vm_fault_wire(map,
6996 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
6997 			    physpage_p);
6998 		} else {
6999 			rc = vm_fault_wire(map,
7000 			    &tmp_entry, caller_prot, tag, map->pmap,
7001 			    tmp_entry.vme_start,
7002 			    physpage_p);
7003 		}
7004 
7005 		if (!user_wire && cur_thread != THREAD_NULL) {
7006 			thread_interrupt_level(interruptible_state);
7007 		}
7008 
7009 		vm_map_lock(map);
7010 
7011 		if (last_timestamp + 1 != map->timestamp) {
7012 			/*
7013 			 * Find the entry again.  It could have been clipped
7014 			 * after we unlocked the map.
7015 			 */
7016 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7017 			    &first_entry)) {
7018 				panic("vm_map_wire: re-lookup failed");
7019 			}
7020 
7021 			entry = first_entry;
7022 		}
7023 
7024 		last_timestamp = map->timestamp;
7025 
7026 		while ((entry != vm_map_to_entry(map)) &&
7027 		    (entry->vme_start < tmp_entry.vme_end)) {
7028 			assert(entry->in_transition);
7029 			entry->in_transition = FALSE;
7030 			if (entry->needs_wakeup) {
7031 				entry->needs_wakeup = FALSE;
7032 				need_wakeup = TRUE;
7033 			}
7034 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7035 				subtract_wire_counts(map, entry, user_wire);
7036 			}
7037 			entry = entry->vme_next;
7038 		}
7039 
7040 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7041 			goto done;
7042 		}
7043 
7044 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7045 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7046 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7047 			/* found a "new" hole */
7048 			s = tmp_entry.vme_end;
7049 			rc = KERN_INVALID_ADDRESS;
7050 			goto done;
7051 		}
7052 
7053 		s = entry->vme_start;
7054 	} /* end while loop through map entries */
7055 
7056 done:
7057 	if (rc == KERN_SUCCESS) {
7058 		/* repair any damage we may have made to the VM map */
7059 		vm_map_simplify_range(map, start, end);
7060 	}
7061 
7062 	vm_map_unlock(map);
7063 
7064 	/*
7065 	 * wake up anybody waiting on entries we wired.
7066 	 */
7067 	if (need_wakeup) {
7068 		vm_map_entry_wakeup(map);
7069 	}
7070 
7071 	if (rc != KERN_SUCCESS) {
7072 		/* undo what has been wired so far */
7073 		vm_map_unwire_nested(map, start, s, user_wire,
7074 		    map_pmap, pmap_addr);
7075 		if (physpage_p) {
7076 			*physpage_p = 0;
7077 		}
7078 	}
7079 
7080 	return rc;
7081 }
7082 
7083 static __attribute__((always_inline, warn_unused_result))
7084 kern_return_t
vm_map_wire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size,vm_prot_t * prot)7085 vm_map_wire_sanitize(
7086 	vm_map_t                map,
7087 	vm_map_offset_ut        start_u,
7088 	vm_map_offset_ut        end_u,
7089 	vm_prot_ut              prot_u,
7090 	vm_sanitize_caller_t    vm_sanitize_caller,
7091 	vm_map_offset_t        *start,
7092 	vm_map_offset_t        *end,
7093 	vm_map_size_t          *size,
7094 	vm_prot_t              *prot)
7095 {
7096 	kern_return_t   kr;
7097 
7098 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7099 	    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7100 	    size);
7101 	if (__improbable(kr != KERN_SUCCESS)) {
7102 		return kr;
7103 	}
7104 
7105 	kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7106 	if (__improbable(kr != KERN_SUCCESS)) {
7107 		return kr;
7108 	}
7109 
7110 	return KERN_SUCCESS;
7111 }
7112 
/*
 * Sanitizing front end for vm_map_wire_nested(): validates the untrusted
 * address range and protection before performing the actual wiring.
 */
7116 kern_return_t
vm_map_wire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p,vm_sanitize_caller_t vm_sanitize_caller)7117 vm_map_wire_impl(
7118 	vm_map_t                map,
7119 	vm_map_offset_ut        start_u,
7120 	vm_map_offset_ut        end_u,
7121 	vm_prot_ut              prot_u,
7122 	vm_tag_t                tag,
7123 	boolean_t               user_wire,
7124 	ppnum_t                *physpage_p,
7125 	vm_sanitize_caller_t    vm_sanitize_caller)
7126 {
7127 	vm_map_offset_t start, end;
7128 	vm_map_size_t   size;
7129 	vm_prot_t       prot;
7130 	kern_return_t   kr;
7131 
7132 	/*
7133 	 * Sanitize any input parameters that are addr/size/prot/inherit
7134 	 */
7135 	kr = vm_map_wire_sanitize(map,
7136 	    start_u,
7137 	    end_u,
7138 	    prot_u,
7139 	    vm_sanitize_caller,
7140 	    &start,
7141 	    &end,
7142 	    &size,
7143 	    &prot);
7144 	if (__improbable(kr != KERN_SUCCESS)) {
7145 		if (physpage_p) {
7146 			*physpage_p = 0;
7147 		}
7148 		return vm_sanitize_get_kr(kr);
7149 	}
7150 
7151 	return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7152 	           PMAP_NULL, 0, physpage_p);
7153 }
7154 
7155 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,boolean_t user_wire)7156 vm_map_wire_external(
7157 	vm_map_t                map,
7158 	vm_map_offset_ut        start_u,
7159 	vm_map_offset_ut        end_u,
7160 	vm_prot_ut              prot_u,
7161 	boolean_t               user_wire)
7162 {
7163 	vm_tag_t tag = vm_tag_bt();
7164 
7165 	return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7166 }
7167 
7168 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire)7169 vm_map_wire_kernel(
7170 	vm_map_t                map,
7171 	vm_map_offset_ut        start_u,
7172 	vm_map_offset_ut        end_u,
7173 	vm_prot_ut              prot_u,
7174 	vm_tag_t                tag,
7175 	boolean_t               user_wire)
7176 {
7177 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7178 	           user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7179 }
7180 
7181 #if XNU_PLATFORM_MacOSX
7182 
7183 kern_return_t
vm_map_wire_and_extract(vm_map_t map,vm_map_offset_ut start_u,vm_prot_ut prot_u,boolean_t user_wire,ppnum_t * physpage_p)7184 vm_map_wire_and_extract(
7185 	vm_map_t                map,
7186 	vm_map_offset_ut        start_u,
7187 	vm_prot_ut              prot_u,
7188 	boolean_t               user_wire,
7189 	ppnum_t                *physpage_p)
7190 {
7191 	vm_tag_t         tag    = vm_tag_bt();
7192 	vm_map_size_ut   size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7193 	vm_map_offset_ut end_u  = vm_sanitize_compute_ut_end(start_u, size_u);
7194 
7195 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7196 	           user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7197 }
7198 
7199 #endif /* XNU_PLATFORM_MacOSX */
7200 
/*
 * vm_map_unwire_nested:
 *
 * Unwire the address range [start, end) in "map", descending into any
 * submap entries encountered.  "user_wire" selects which wire count is
 * being released (user vs. kernel wiring).  When "map_pmap" is non-NULL,
 * physical unwiring is performed in that pmap starting at "pmap_addr"
 * (used when a parent map wired memory through a submap).
 *
 * For user-initiated unwiring, inconsistencies (unmapped start, already
 * unwired entries, holes in the range) return/skip gracefully; the same
 * conditions on a kernel-initiated unwire indicate a kernel bug and
 * panic.  Returns KERN_SUCCESS, or KERN_INVALID_ADDRESS when the start
 * address is not mapped (user case) or lands on a superpage entry.
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr)
{
	vm_map_entry_t          entry;
	struct vm_map_entry     *first_entry, tmp_entry;
	boolean_t               need_wakeup;
	/*
	 * NOTE(review): "main_map" is only ever written in this function,
	 * never read — confirm it is vestigial before removing.
	 */
	boolean_t               main_map = FALSE;
	unsigned int            last_timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		return KERN_SUCCESS;
	}

	vm_map_lock(map);
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	/*
	 * Snapshot the map timestamp so that, after any unlock/relock below,
	 * we can tell whether the map changed and a re-lookup is required.
	 */
	last_timestamp = map->timestamp;

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/*	Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen:  there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* translate the clipped range into submap offsets */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
/*
 *                                       entry = entry->vme_next;
 *                                       continue;
 */
				}

				subtract_wire_counts(map, entry, user_wire);

				/*
				 * Still wired (e.g. by another user wiring):
				 * nothing more to do for this entry.
				 */
				if (entry->wired_count != 0) {
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existance of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/*
				 * Caller supplied the pmap (we were entered
				 * through a parent map's submap entry): recurse
				 * with that same pmap/address.
				 */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 *	  this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		/* entry is still wired by someone else: leave pages alone */
		if (entry->wired_count != 0) {
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existance of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
			    pmap_addr, tmp_entry.vme_end);
		} else {
			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start, tmp_entry.vme_end);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7539 
7540 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire)7541 vm_map_unwire(
7542 	vm_map_t                map,
7543 	vm_map_offset_ut        start_u,
7544 	vm_map_offset_ut        end_u,
7545 	boolean_t               user_wire)
7546 {
7547 	return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7548 	           VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7549 }
7550 
7551 static __attribute__((always_inline, warn_unused_result))
7552 kern_return_t
vm_map_unwire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size)7553 vm_map_unwire_sanitize(
7554 	vm_map_t                map,
7555 	vm_map_offset_ut        start_u,
7556 	vm_map_offset_ut        end_u,
7557 	vm_sanitize_caller_t    vm_sanitize_caller,
7558 	vm_map_offset_t        *start,
7559 	vm_map_offset_t        *end,
7560 	vm_map_size_t          *size)
7561 {
7562 	return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7563 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7564 	           size);
7565 }
7566 
7567 kern_return_t
vm_map_unwire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire,vm_sanitize_caller_t vm_sanitize_caller)7568 vm_map_unwire_impl(
7569 	vm_map_t                map,
7570 	vm_map_offset_ut        start_u,
7571 	vm_map_offset_ut        end_u,
7572 	boolean_t               user_wire,
7573 	vm_sanitize_caller_t    vm_sanitize_caller)
7574 {
7575 	vm_map_offset_t start, end;
7576 	vm_map_size_t   size;
7577 	kern_return_t   kr;
7578 
7579 	/*
7580 	 * Sanitize any input parameters that are addr/size/prot/inherit
7581 	 */
7582 	kr = vm_map_unwire_sanitize(
7583 		map,
7584 		start_u,
7585 		end_u,
7586 		vm_sanitize_caller,
7587 		&start,
7588 		&end,
7589 		&size);
7590 	if (__improbable(kr != KERN_SUCCESS)) {
7591 		return vm_sanitize_get_kr(kr);
7592 	}
7593 
7594 	return vm_map_unwire_nested(map, start, end,
7595 	           user_wire, (pmap_t)NULL, 0);
7596 }
7597 
7598 
7599 /*
7600  *	vm_map_entry_zap:	[ internal use only ]
7601  *
7602  *	Remove the entry from the target map
7603  *	and put it on a zap list.
7604  */
7605 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7606 vm_map_entry_zap(
7607 	vm_map_t                map,
7608 	vm_map_entry_t          entry,
7609 	vm_map_zap_t            zap)
7610 {
7611 	vm_map_offset_t s, e;
7612 
7613 	s = entry->vme_start;
7614 	e = entry->vme_end;
7615 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7616 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7617 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7618 		assert(page_aligned(s));
7619 		assert(page_aligned(e));
7620 	}
7621 	if (entry->map_aligned == TRUE) {
7622 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7623 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7624 	}
7625 	assert(entry->wired_count == 0);
7626 	assert(entry->user_wired_count == 0);
7627 	assert(!entry->vme_permanent);
7628 
7629 	vm_map_store_entry_unlink(map, entry, false);
7630 	map->size -= e - s;
7631 
7632 	vm_map_zap_append(zap, entry);
7633 }
7634 
/*
 *	vm_map_submap_pmap_clean:
 *
 *	Tear down the physical (pmap) mappings backing the range
 *	[start, end) of "map", where that range is backed by "sub_map"
 *	beginning at "offset" within the submap.  Recurses into nested
 *	submaps.  Only pmap state is removed; the submap's VM map
 *	entries themselves are untouched.
 *
 *	NOTE(review): caller is presumably holding "map" locked while
 *	dismantling a submap entry — confirm at call sites.  "sub_map"
 *	is taken read-locked here.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* translate [start, end) in "map" to the submap's address space */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/*
		 * First entry: it may straddle both ends of the range,
		 * so clip the size to the overlapping portion only.
		 */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse with the clipped range */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * "map" may be mapped into other pmaps as
				 * well: remove the pages through the backing
				 * object (PMAP_NULL) so every pmap sees it,
				 * rather than just removing from map->pmap.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * NOTE(review): when the lookup above fails, "entry" was set to
	 * the entry preceding "offset" (or the map header), so advancing
	 * to vme_next is still the correct start for the scan below.
	 */
	entry = entry->vme_next;

	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		/* subsequent entries begin in-range; clip only the tail */
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/* same multi-pmap consideration as above */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7734 
/*
 *     virt_memory_guard_ast:
 *
 *     Handle the AST callout for a virtual memory guard:
 *     raise an EXC_GUARD exception and terminate the task
 *     if configured to do so.
 */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/*
	 * If only once, make sure we're that once: atomically clear the
	 * DELIVER bit.  Losing the CAS race means another thread changed
	 * task_exc_guard concurrently, so re-read it and bail if delivery
	 * has been disabled in the meantime.
	 */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);

	if (fatal) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */


		int flags = PX_DEBUG_NO_HONOR;
		exception_info_t info = {
			.os_reason = OS_REASON_GUARD,
			.exception_type = EXC_GUARD,
			.mx_code = code,
			.mx_subcode = subcode
		};

		/* handler claimed it: plain signal, no corpse notification */
		if (sync_exception_result == KERN_SUCCESS) {
			flags |= PX_PSIGNAL;
		}
		exit_with_mach_exception(current_proc(), info, flags);
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7808 
7809 /*
7810  *     vm_map_guard_exception:
7811  *
7812  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7813  *
7814  *     Right now, we do this when we find nothing mapped, or a
7815  *     gap in the mapping when a user address space deallocate
7816  *     was requested. We report the address of the first gap found.
7817  */
7818 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7819 vm_map_guard_exception(
7820 	vm_map_offset_t gap_start,
7821 	unsigned reason)
7822 {
7823 	mach_exception_code_t code = 0;
7824 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7825 	unsigned int target = 0; /* should we pass in pid associated with map? */
7826 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7827 	boolean_t fatal = FALSE;
7828 
7829 	task_t task = current_task_early();
7830 
7831 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7832 	if (task == NULL || task == kernel_task) {
7833 		return;
7834 	}
7835 
7836 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7837 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7838 	EXC_GUARD_ENCODE_TARGET(code, target);
7839 
7840 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7841 		fatal = TRUE;
7842 	}
7843 	thread_guard_violation(current_thread(), code, subcode, fatal);
7844 }
7845 
7846 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7847 vm_map_delete_submap_recurse(
7848 	vm_map_t submap,
7849 	vm_map_offset_t submap_start,
7850 	vm_map_offset_t submap_end)
7851 {
7852 	vm_map_entry_t submap_entry;
7853 
7854 	/*
7855 	 * Verify that the submap does not contain any "permanent" entries
7856 	 * within the specified range. We permit TPRO ranges to be overwritten
7857 	 * as we only reach this path if TPRO const protection is disabled for a
7858 	 * given map.
7859 	 *
7860 	 * We do not care about gaps.
7861 	 */
7862 
7863 	vm_map_lock(submap);
7864 
7865 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7866 		submap_entry = submap_entry->vme_next;
7867 	}
7868 
7869 	for (;
7870 	    submap_entry != vm_map_to_entry(submap) &&
7871 	    submap_entry->vme_start < submap_end;
7872 	    submap_entry = submap_entry->vme_next) {
7873 		if (submap_entry->vme_permanent
7874 #ifdef __arm64e__
7875 		    /* allow TPRO submap entries to be overwritten */
7876 		    && !submap_entry->used_for_tpro
7877 #endif
7878 		    ) {
7879 			/* "permanent" entry -> fail */
7880 			vm_map_unlock(submap);
7881 			return KERN_PROTECTION_FAILURE;
7882 		}
7883 	}
7884 	/* no "permanent" entries in the range -> success */
7885 	vm_map_unlock(submap);
7886 	return KERN_SUCCESS;
7887 }
7888 
/* Cold, out-of-line panic: vm_map_delete() given a misaligned start. */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
7899 
/* Cold, out-of-line panic: vm_map_delete() failed where it must not. */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	kern_return_t           kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
7911 
/*
 * Cold, out-of-line panic: a gap (no map entry) was found at "where"
 * while deleting [start, end) from a kernel-pmap map.
 */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t                map,
	vm_map_offset_t         where,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7923 
/*
 * Cold, out-of-line panic: attempted removal of a "permanent" entry
 * from a kernel-pmap map (never allowed there).
 */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7938 
/* Internal state flags carried through the vm_map_delete() main loop. */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE               = 0x0000,

	VMDS_FOUND_GAP          = 0x0001,  /* a hole was found in [start, end) */
	VMDS_GAPS_OK            = 0x0002,  /* map terminated/unreferenced: tolerate gaps */

	VMDS_KERNEL_PMAP        = 0x0004,  /* map's pmap is the kernel pmap: strict checks */
	VMDS_NEEDS_LOOKUP       = 0x0008,  /* map lock was dropped: re-lookup the entry */
	VMDS_NEEDS_WAKEUP       = 0x0010,  /* wake threads waiting on entries we touched */
	VMDS_KERNEL_KMEMPTR     = 0x0020   /* range is a kmem pointer range (slot-validated) */
});
7950 
/*
 * vm_map_clamp_to_pmap(map, start, end)
 *
 * Modify *start and *end so they fall within the bounds of map->pmap.
 */
#if MACH_ASSERT
static void
vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
{
	vm_map_address_t lo;
	vm_map_address_t hi;

#if __x86_64__
	/* x86_64 struct pmap does not have min and max fields */
	if (map->pmap == kernel_pmap) {
		lo = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
		hi = VM_MAX_KERNEL_ADDRESS;
	} else {
		lo = VM_MAP_MIN_ADDRESS;
		hi = VM_MAP_MAX_ADDRESS;
	}
#else
	lo = map->pmap->min;
	hi = map->pmap->max;
#endif

	/* clamp each bound into [lo, hi] */
	*start = (*start < lo) ? lo : ((*start > hi) ? hi : *start);
	*end = (*end < lo) ? lo : ((*end > hi) ? hi : *end);
}
#endif
7989 
/*
 * NOTE(review): debug flag — presumably enables logging when deleting
 * "permanent" VM_PROT_NONE map entries; its consumers are elsewhere in
 * this file, confirm exact semantics there.
 */
int vm_log_map_delete_permanent_prot_none = 0;
7991 /*
7992  *	vm_map_delete:	[ internal use only ]
7993  *
7994  *	Deallocates the given address range from the target map.
7995  *	Removes all user wirings. Unwires one kernel wiring if
7996  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
7997  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
7998  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7999  *
8000  *
8001  *	When the map is a kernel map, then any error in removing mappings
8002  *	will lead to a panic so that clients do not have to repeat the panic
8003  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
8004  *	is also passed, then KERN_ABORTED will not lead to a panic.
8005  *
8006  *	This routine is called with map locked and leaves map locked.
8007  */
8008 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)8009 vm_map_delete(
8010 	vm_map_t                map,
8011 	vm_map_offset_t         start,
8012 	vm_map_offset_t         end,
8013 	vmr_flags_t             flags,
8014 	kmem_guard_t            guard,
8015 	vm_map_zap_t            zap_list)
8016 {
8017 	vm_map_entry_t          entry, next;
8018 	int                     interruptible;
8019 	vm_map_offset_t         gap_start = 0;
8020 	vm_map_offset_t         clear_in_transition_end = 0;
8021 	__unused vm_map_offset_t save_start = start;
8022 	__unused vm_map_offset_t save_end = end;
8023 	vm_map_delete_state_t   state = VMDS_NONE;
8024 	kmem_return_t           ret = { };
8025 	vm_map_range_id_t       range_id = 0;
8026 	struct kmem_page_meta  *meta = NULL;
8027 	uint32_t                size_idx, slot_idx;
8028 	struct mach_vm_range    slot;
8029 
8030 	if (vm_map_pmap(map) == kernel_pmap) {
8031 		state |= VMDS_KERNEL_PMAP;
8032 		range_id = kmem_addr_get_range(start, end - start);
8033 		if (kmem_is_ptr_range(range_id)) {
8034 			state |= VMDS_KERNEL_KMEMPTR;
8035 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8036 			    &size_idx, &slot);
8037 		}
8038 	}
8039 
8040 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8041 		state |= VMDS_GAPS_OK;
8042 	}
8043 
8044 	if (map->corpse_source &&
8045 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8046 	    !map->terminated) {
8047 		/*
8048 		 * The map is being used for corpses related diagnostics.
8049 		 * So skip any entry removal to avoid perturbing the map state.
8050 		 * The cleanup will happen in task_terminate_internal after the
8051 		 * call to task_port_no_senders.
8052 		 */
8053 		goto out;
8054 	}
8055 
8056 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8057 	    THREAD_ABORTSAFE : THREAD_UNINT;
8058 
8059 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8060 	    (start & VM_MAP_PAGE_MASK(map))) {
8061 		__vm_map_delete_misaligned_panic(map, start, end);
8062 	}
8063 
8064 	if ((state & VMDS_GAPS_OK) == 0) {
8065 		/*
8066 		 * If the map isn't terminated then all deletions must have
8067 		 * no gaps, and be within the [min, max) of the map.
8068 		 *
8069 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8070 		 * and hence must validate bounds manually.
8071 		 *
8072 		 * It is worth noting that because vm_deallocate() will
8073 		 * round_page() the deallocation size, it's possible for "end"
8074 		 * to be 0 here due to overflow. We hence must treat it as being
8075 		 * beyond vm_map_max(map).
8076 		 *
8077 		 * Similarly, end < start means some wrap around happend,
8078 		 * which should cause an error or panic.
8079 		 */
8080 		if (end == 0 || end > vm_map_max(map)) {
8081 			state |= VMDS_FOUND_GAP;
8082 			gap_start = vm_map_max(map);
8083 			if (state & VMDS_KERNEL_PMAP) {
8084 				__vm_map_delete_gap_panic(map,
8085 				    gap_start, start, end);
8086 			}
8087 			goto out;
8088 		}
8089 
8090 		if (end < start) {
8091 			if (state & VMDS_KERNEL_PMAP) {
8092 				__vm_map_delete_gap_panic(map,
8093 				    vm_map_max(map), start, end);
8094 			}
8095 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8096 			goto out;
8097 		}
8098 
8099 		if (start < vm_map_min(map)) {
8100 			state |= VMDS_FOUND_GAP;
8101 			gap_start = start;
8102 			if (state & VMDS_KERNEL_PMAP) {
8103 				__vm_map_delete_gap_panic(map,
8104 				    gap_start, start, end);
8105 			}
8106 			goto out;
8107 		}
8108 	} else {
8109 		/*
8110 		 * If the map is terminated, we must accept start/end
8111 		 * being beyond the boundaries of the map as this is
8112 		 * how some of the mappings like commpage mappings
8113 		 * can be destroyed (they're outside of those bounds).
8114 		 *
8115 		 * end < start is still something we can't cope with,
8116 		 * so just bail.
8117 		 */
8118 		if (end < start) {
8119 			goto out;
8120 		}
8121 	}
8122 
8123 
8124 	/*
8125 	 *	Find the start of the region.
8126 	 *
8127 	 *	If in a superpage, extend the range
8128 	 *	to include the start of the mapping.
8129 	 */
8130 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8131 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8132 			start = SUPERPAGE_ROUND_DOWN(start);
8133 		} else {
8134 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8135 			break;
8136 		}
8137 	}
8138 
8139 	if (entry->superpage_size) {
8140 		end = SUPERPAGE_ROUND_UP(end);
8141 	}
8142 
8143 	/*
8144 	 *	Step through all entries in this region
8145 	 */
8146 	for (vm_map_offset_t s = start; s < end;) {
8147 		/*
8148 		 * At this point, we have deleted all the memory entries
8149 		 * in [start, s) and are proceeding with the [s, end) range.
8150 		 *
8151 		 * This loop might drop the map lock, and it is possible that
8152 		 * some memory was already reallocated within [start, s)
8153 		 * and we don't want to mess with those entries.
8154 		 *
8155 		 * Some of those entries could even have been re-assembled
8156 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8157 		 * we may have to vm_map_clip_start() again.
8158 		 *
8159 		 * When clear_in_transition_end is set, the we had marked
8160 		 * [start, clear_in_transition_end) as "in_transition"
8161 		 * during a previous iteration and we need to clear it.
8162 		 */
8163 
8164 		/*
8165 		 * Step 1: If needed (because we dropped locks),
8166 		 *         lookup the entry again.
8167 		 *
8168 		 *         If we're coming back from unwiring (Step 5),
8169 		 *         we also need to mark the entries as no longer
8170 		 *         in transition after that.
8171 		 */
8172 
8173 		if (state & VMDS_NEEDS_LOOKUP) {
8174 			state &= ~VMDS_NEEDS_LOOKUP;
8175 
8176 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8177 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8178 			}
8179 
8180 			if (state & VMDS_KERNEL_KMEMPTR) {
8181 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8182 			}
8183 		}
8184 
8185 		if (clear_in_transition_end) {
8186 			for (vm_map_entry_t it = entry;
8187 			    it != vm_map_to_entry(map) &&
8188 			    it->vme_start < clear_in_transition_end;
8189 			    it = it->vme_next) {
8190 				assert(it->in_transition);
8191 				it->in_transition = FALSE;
8192 				if (it->needs_wakeup) {
8193 					it->needs_wakeup = FALSE;
8194 					state |= VMDS_NEEDS_WAKEUP;
8195 				}
8196 			}
8197 
8198 			clear_in_transition_end = 0;
8199 		}
8200 
8201 
8202 		/*
8203 		 * Step 2: Perform various policy checks
8204 		 *         before we do _anything_ to this entry.
8205 		 */
8206 
8207 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8208 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8209 				/*
8210 				 * Either we found a gap already,
8211 				 * or we are tearing down a map,
8212 				 * keep going.
8213 				 */
8214 			} else if (state & VMDS_KERNEL_PMAP) {
8215 				__vm_map_delete_gap_panic(map, s, start, end);
8216 			} else if (s < end) {
8217 				state |= VMDS_FOUND_GAP;
8218 				gap_start = s;
8219 			}
8220 
8221 			if (entry == vm_map_to_entry(map) ||
8222 			    end <= entry->vme_start) {
8223 				break;
8224 			}
8225 
8226 			s = entry->vme_start;
8227 		}
8228 
8229 		if (state & VMDS_KERNEL_PMAP) {
8230 			/*
8231 			 * In the kernel map and its submaps,
8232 			 * permanent entries never die, even
8233 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8234 			 */
8235 			if (entry->vme_permanent) {
8236 				__vm_map_delete_permanent_panic(map, start, end, entry);
8237 			}
8238 
8239 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8240 				end = entry->vme_end;
8241 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8242 			}
8243 
8244 			/*
8245 			 * In the kernel map and its submaps,
8246 			 * the removal of an atomic/guarded entry is strict.
8247 			 *
8248 			 * An atomic entry is processed only if it was
8249 			 * specifically targeted.
8250 			 *
8251 			 * We might have deleted non-atomic entries before
8252 			 * we reach this this point however...
8253 			 */
8254 			kmem_entry_validate_guard(map, entry,
8255 			    start, end - start, guard);
8256 		}
8257 
8258 		/*
8259 		 * Step 2.1: handle "permanent" and "submap" entries
8260 		 * *before* clipping to avoid triggering some unnecessary
8261 		 * un-nesting of the shared region.
8262 		 */
8263 		if (entry->vme_permanent && entry->is_sub_map) {
8264 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8265 			/*
8266 			 * Un-mapping a "permanent" mapping of a user-space
8267 			 * submap is not allowed unless...
8268 			 */
8269 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8270 				/*
8271 				 * a. explicitly requested by the kernel caller.
8272 				 */
8273 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8274 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8275 			    developer_mode_state()) {
8276 				/*
8277 				 * b. we're in "developer" mode (for
8278 				 *    breakpoints, dtrace probes, ...).
8279 				 */
8280 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8281 			} else if (map->terminated) {
8282 				/*
8283 				 * c. this is the final address space cleanup.
8284 				 */
8285 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8286 			} else {
8287 				vm_map_offset_t submap_start, submap_end;
8288 				kern_return_t submap_kr;
8289 
8290 				/*
8291 				 * Check if there are any "permanent" mappings
8292 				 * in this range in the submap.
8293 				 */
8294 				if (entry->in_transition) {
8295 					/* can that even happen ? */
8296 					goto in_transition;
8297 				}
8298 				/* compute the clipped range in the submap */
8299 				submap_start = s - entry->vme_start;
8300 				submap_start += VME_OFFSET(entry);
8301 				submap_end = end - entry->vme_start;
8302 				submap_end += VME_OFFSET(entry);
8303 				submap_kr = vm_map_delete_submap_recurse(
8304 					VME_SUBMAP(entry),
8305 					submap_start,
8306 					submap_end);
8307 				if (submap_kr != KERN_SUCCESS) {
8308 					/*
8309 					 * There are some "permanent" mappings
8310 					 * in the submap: we are not allowed
8311 					 * to remove this range.
8312 					 */
8313 					printf("%d[%s] removing permanent submap entry "
8314 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8315 					    proc_selfpid(),
8316 					    (get_bsdtask_info(current_task())
8317 					    ? proc_name_address(get_bsdtask_info(current_task()))
8318 					    : "?"), entry,
8319 					    (uint64_t)entry->vme_start,
8320 					    (uint64_t)entry->vme_end,
8321 					    entry->protection,
8322 					    entry->max_protection);
8323 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8324 					    vm_map_entry_t, entry,
8325 					    vm_map_offset_t, entry->vme_start,
8326 					    vm_map_offset_t, entry->vme_end,
8327 					    vm_prot_t, entry->protection,
8328 					    vm_prot_t, entry->max_protection,
8329 					    int, VME_ALIAS(entry));
8330 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8331 					goto out;
8332 				}
8333 				/* no permanent mappings: proceed */
8334 			}
8335 		}
8336 
8337 		/*
8338 		 * Step 3: Perform any clipping needed.
8339 		 *
8340 		 *         After this, "entry" starts at "s", ends before "end"
8341 		 */
8342 
8343 		if (entry->vme_start < s) {
8344 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8345 			    entry->map_aligned &&
8346 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8347 				/*
8348 				 * The entry will no longer be map-aligned
8349 				 * after clipping and the caller said it's OK.
8350 				 */
8351 				entry->map_aligned = FALSE;
8352 			}
8353 			vm_map_clip_start(map, entry, s);
8354 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8355 		}
8356 
8357 		if (end < entry->vme_end) {
8358 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8359 			    entry->map_aligned &&
8360 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8361 				/*
8362 				 * The entry will no longer be map-aligned
8363 				 * after clipping and the caller said it's OK.
8364 				 */
8365 				entry->map_aligned = FALSE;
8366 			}
8367 			vm_map_clip_end(map, entry, end);
8368 		}
8369 
8370 		if (entry->vme_permanent && entry->is_sub_map) {
8371 			/*
8372 			 * We already went through step 2.1 which did not deny
8373 			 * the removal of this "permanent" and "is_sub_map"
8374 			 * entry.
8375 			 * Now that we've clipped what we actually want to
8376 			 * delete, undo the "permanent" part to allow the
8377 			 * removal to proceed.
8378 			 */
8379 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8380 			    vm_map_entry_t, entry,
8381 			    vm_map_offset_t, entry->vme_start,
8382 			    vm_map_offset_t, entry->vme_end,
8383 			    vm_prot_t, entry->protection,
8384 			    vm_prot_t, entry->max_protection,
8385 			    int, VME_ALIAS(entry));
8386 			entry->vme_permanent = false;
8387 		}
8388 
8389 		assert(s == entry->vme_start);
8390 		assert(entry->vme_end <= end);
8391 
8392 
8393 		/*
8394 		 * Step 4: If the entry is in flux, wait for this to resolve.
8395 		 */
8396 
8397 		if (entry->in_transition) {
8398 			wait_result_t wait_result;
8399 
8400 in_transition:
8401 			/*
8402 			 * Another thread is wiring/unwiring this entry.
8403 			 * Let the other thread know we are waiting.
8404 			 */
8405 
8406 			entry->needs_wakeup = TRUE;
8407 
8408 			/*
8409 			 * wake up anybody waiting on entries that we have
8410 			 * already unwired/deleted.
8411 			 */
8412 			if (state & VMDS_NEEDS_WAKEUP) {
8413 				vm_map_entry_wakeup(map);
8414 				state &= ~VMDS_NEEDS_WAKEUP;
8415 			}
8416 
8417 			wait_result = vm_map_entry_wait(map, interruptible);
8418 
8419 			if (interruptible &&
8420 			    wait_result == THREAD_INTERRUPTED) {
8421 				/*
8422 				 * We do not clear the needs_wakeup flag,
8423 				 * since we cannot tell if we were the only one.
8424 				 */
8425 				ret.kmr_return = KERN_ABORTED;
8426 				return ret;
8427 			}
8428 
8429 			/*
8430 			 * The entry could have been clipped or it
8431 			 * may not exist anymore.  Look it up again.
8432 			 */
8433 			state |= VMDS_NEEDS_LOOKUP;
8434 			continue;
8435 		}
8436 
8437 
8438 		/*
8439 		 * Step 5: Handle wiring
8440 		 */
8441 
8442 		if (entry->wired_count) {
8443 			struct vm_map_entry tmp_entry;
8444 			boolean_t           user_wire;
8445 			unsigned int        last_timestamp;
8446 
8447 			user_wire = entry->user_wired_count > 0;
8448 
8449 			/*
8450 			 *      Remove a kernel wiring if requested
8451 			 */
8452 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8453 				entry->wired_count--;
8454 				vme_btref_consider_and_put(entry);
8455 			}
8456 
8457 			/*
8458 			 *	Remove all user wirings for proper accounting
8459 			 */
8460 			while (entry->user_wired_count) {
8461 				subtract_wire_counts(map, entry, user_wire);
8462 			}
8463 
8464 			/*
8465 			 * All our DMA I/O operations in IOKit are currently
8466 			 * done by wiring through the map entries of the task
8467 			 * requesting the I/O.
8468 			 *
8469 			 * Because of this, we must always wait for kernel wirings
8470 			 * to go away on the entries before deleting them.
8471 			 *
8472 			 * Any caller who wants to actually remove a kernel wiring
8473 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8474 			 * properly remove one wiring instead of blasting through
8475 			 * them all.
8476 			 */
8477 			if (entry->wired_count != 0) {
8478 				assert(map != kernel_map);
8479 				/*
8480 				 * Cannot continue.  Typical case is when
8481 				 * a user thread has physical io pending on
8482 				 * on this page.  Either wait for the
8483 				 * kernel wiring to go away or return an
8484 				 * error.
8485 				 */
8486 				wait_result_t wait_result;
8487 
8488 				entry->needs_wakeup = TRUE;
8489 				wait_result = vm_map_entry_wait(map,
8490 				    interruptible);
8491 
8492 				if (interruptible &&
8493 				    wait_result == THREAD_INTERRUPTED) {
8494 					/*
8495 					 * We do not clear the
8496 					 * needs_wakeup flag, since we
8497 					 * cannot tell if we were the
8498 					 * only one.
8499 					 */
8500 					ret.kmr_return = KERN_ABORTED;
8501 					return ret;
8502 				}
8503 
8504 
8505 				/*
8506 				 * The entry could have been clipped or
8507 				 * it may not exist anymore.  Look it
8508 				 * up again.
8509 				 */
8510 				state |= VMDS_NEEDS_LOOKUP;
8511 				continue;
8512 			}
8513 
8514 			/*
8515 			 * We can unlock the map now.
8516 			 *
8517 			 * The entry might be split once we unlock the map,
8518 			 * but we need the range as defined by this entry
8519 			 * to be stable. So we must make a local copy.
8520 			 *
8521 			 * The underlying objects do not change during clips,
			 * and the in_transition state guarantees existence
8523 			 * of the entry.
8524 			 */
8525 			last_timestamp = map->timestamp;
8526 			entry->in_transition = TRUE;
8527 			tmp_entry = *entry;
8528 			vm_map_unlock(map);
8529 
8530 			if (tmp_entry.is_sub_map) {
8531 				vm_map_t sub_map;
8532 				vm_map_offset_t sub_start, sub_end;
8533 				pmap_t pmap;
8534 				vm_map_offset_t pmap_addr;
8535 
8536 
8537 				sub_map = VME_SUBMAP(&tmp_entry);
8538 				sub_start = VME_OFFSET(&tmp_entry);
8539 				sub_end = sub_start + (tmp_entry.vme_end -
8540 				    tmp_entry.vme_start);
8541 				if (tmp_entry.use_pmap) {
8542 					pmap = sub_map->pmap;
8543 					pmap_addr = tmp_entry.vme_start;
8544 				} else {
8545 					pmap = map->pmap;
8546 					pmap_addr = tmp_entry.vme_start;
8547 				}
8548 				(void) vm_map_unwire_nested(sub_map,
8549 				    sub_start, sub_end,
8550 				    user_wire,
8551 				    pmap, pmap_addr);
8552 			} else {
8553 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8554 				vm_map_offset_t max_end;
8555 
8556 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8557 					max_end = end - VM_MAP_PAGE_SIZE(map);
8558 					if (entry_end > max_end) {
8559 						entry_end = max_end;
8560 					}
8561 				}
8562 
8563 				if (tmp_entry.vme_kernel_object) {
8564 					pmap_protect_options(
8565 						map->pmap,
8566 						tmp_entry.vme_start,
8567 						entry_end,
8568 						VM_PROT_NONE,
8569 						PMAP_OPTIONS_REMOVE,
8570 						NULL);
8571 				}
8572 				vm_fault_unwire(map, &tmp_entry,
8573 				    tmp_entry.vme_kernel_object, map->pmap,
8574 				    tmp_entry.vme_start, entry_end);
8575 			}
8576 
8577 			vm_map_lock(map);
8578 
8579 			/*
8580 			 * Unwiring happened, we can now go back to deleting
8581 			 * them (after we clear the in_transition bit for the range).
8582 			 */
8583 			if (last_timestamp + 1 != map->timestamp) {
8584 				state |= VMDS_NEEDS_LOOKUP;
8585 			}
8586 			clear_in_transition_end = tmp_entry.vme_end;
8587 			continue;
8588 		}
8589 
8590 		assert(entry->wired_count == 0);
8591 		assert(entry->user_wired_count == 0);
8592 
8593 
8594 		/*
8595 		 * Step 6: Entry is unwired and ready for us to delete !
8596 		 */
8597 
8598 		if (!entry->vme_permanent) {
8599 			/*
8600 			 * Typical case: the entry really shouldn't be permanent
8601 			 */
8602 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8603 		    (entry->protection & VM_PROT_EXECUTE) &&
8604 		    developer_mode_state()) {
8605 			/*
8606 			 * Allow debuggers to undo executable mappings
8607 			 * when developer mode is on.
8608 			 */
8609 #if 0
8610 			printf("FBDP %d[%s] removing permanent executable entry "
8611 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8612 			    proc_selfpid(),
8613 			    (current_task()->bsd_info
8614 			    ? proc_name_address(current_task()->bsd_info)
8615 			    : "?"), entry,
8616 			    (uint64_t)entry->vme_start,
8617 			    (uint64_t)entry->vme_end,
8618 			    entry->protection,
8619 			    entry->max_protection);
8620 #endif
8621 			entry->vme_permanent = FALSE;
8622 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8623 #if 0
8624 			printf("FBDP %d[%s] removing permanent entry "
8625 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8626 			    proc_selfpid(),
8627 			    (current_task()->bsd_info
8628 			    ? proc_name_address(current_task()->bsd_info)
8629 			    : "?"), entry,
8630 			    (uint64_t)entry->vme_start,
8631 			    (uint64_t)entry->vme_end,
8632 			    entry->protection,
8633 			    entry->max_protection);
8634 #endif
8635 			entry->vme_permanent = FALSE;
8636 #if CODE_SIGNING_MONITOR
8637 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8638 			entry->vme_permanent = FALSE;
8639 
8640 			printf("%d[%s] %s(0x%llx,0x%llx): "
8641 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8642 			    "prot 0x%x/0x%x\n",
8643 			    proc_selfpid(),
8644 			    (get_bsdtask_info(current_task())
8645 			    ? proc_name_address(get_bsdtask_info(current_task()))
8646 			    : "?"),
8647 			    __FUNCTION__,
8648 			    (uint64_t)start,
8649 			    (uint64_t)end,
8650 			    (uint64_t)entry->vme_start,
8651 			    (uint64_t)entry->vme_end,
8652 			    entry->protection,
8653 			    entry->max_protection);
8654 #endif
8655 		} else {
8656 			DTRACE_VM6(vm_map_delete_permanent,
8657 			    vm_map_entry_t, entry,
8658 			    vm_map_offset_t, entry->vme_start,
8659 			    vm_map_offset_t, entry->vme_end,
8660 			    vm_prot_t, entry->protection,
8661 			    vm_prot_t, entry->max_protection,
8662 			    int, VME_ALIAS(entry));
8663 		}
8664 
8665 		if (entry->is_sub_map) {
8666 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8667 			    "map %p (%d) entry %p submap %p (%d)\n",
8668 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8669 			    VME_SUBMAP(entry),
8670 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8671 			if (entry->use_pmap) {
8672 #ifndef NO_NESTED_PMAP
8673 				int pmap_flags;
8674 
8675 				if (map->terminated) {
8676 					/*
8677 					 * This is the final cleanup of the
8678 					 * address space being terminated.
8679 					 * No new mappings are expected and
8680 					 * we don't really need to unnest the
8681 					 * shared region (and lose the "global"
8682 					 * pmap mappings, if applicable).
8683 					 *
8684 					 * Tell the pmap layer that we're
8685 					 * "clean" wrt nesting.
8686 					 */
8687 					pmap_flags = PMAP_UNNEST_CLEAN;
8688 				} else {
8689 					/*
8690 					 * We're unmapping part of the nested
8691 					 * shared region, so we can't keep the
8692 					 * nested pmap.
8693 					 */
8694 					pmap_flags = 0;
8695 				}
8696 				pmap_unnest_options(
8697 					map->pmap,
8698 					(addr64_t)entry->vme_start,
8699 					entry->vme_end - entry->vme_start,
8700 					pmap_flags);
8701 #endif  /* NO_NESTED_PMAP */
8702 				if (map->mapped_in_other_pmaps &&
8703 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8704 					/* clean up parent map/maps */
8705 					vm_map_submap_pmap_clean(
8706 						map, entry->vme_start,
8707 						entry->vme_end,
8708 						VME_SUBMAP(entry),
8709 						VME_OFFSET(entry));
8710 				}
8711 			} else {
8712 				vm_map_submap_pmap_clean(
8713 					map, entry->vme_start, entry->vme_end,
8714 					VME_SUBMAP(entry),
8715 					VME_OFFSET(entry));
8716 			}
8717 		} else if (entry->vme_kernel_object ||
8718 		    VME_OBJECT(entry) == compressor_object) {
8719 			/*
8720 			 * nothing to do
8721 			 */
8722 		} else if (map->mapped_in_other_pmaps &&
8723 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8724 			vm_object_pmap_protect_options(
8725 				VME_OBJECT(entry), VME_OFFSET(entry),
8726 				entry->vme_end - entry->vme_start,
8727 				PMAP_NULL,
8728 				PAGE_SIZE,
8729 				entry->vme_start,
8730 				VM_PROT_NONE,
8731 				PMAP_OPTIONS_REMOVE);
8732 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8733 		    (state & VMDS_KERNEL_PMAP)) {
8734 			/* Remove translations associated
8735 			 * with this range unless the entry
8736 			 * does not have an object, or
8737 			 * it's the kernel map or a descendant
8738 			 * since the platform could potentially
8739 			 * create "backdoor" mappings invisible
8740 			 * to the VM. It is expected that
8741 			 * objectless, non-kernel ranges
8742 			 * do not have such VM invisible
8743 			 * translations.
8744 			 */
8745 			vm_map_address_t remove_start = entry->vme_start;
8746 			vm_map_address_t remove_end = entry->vme_end;
8747 #if MACH_ASSERT
8748 			/*
8749 			 * Prevent panics in pmap_remove() from some vm test code
8750 			 * which uses virtual address ranges that pmap disallows.
8751 			 */
8752 			if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8753 				vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8754 			}
8755 #endif /* MACH_ASSERT */
8756 			pmap_remove(map->pmap, remove_start, remove_end);
8757 		}
8758 
8759 #if DEBUG
8760 		/*
8761 		 * All pmap mappings for this map entry must have been
8762 		 * cleared by now.
8763 		 */
8764 		assert(pmap_is_empty(map->pmap,
8765 		    entry->vme_start,
8766 		    entry->vme_end));
8767 #endif /* DEBUG */
8768 
8769 		if (entry->iokit_acct) {
8770 			/* alternate accounting */
8771 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8772 			    vm_map_t, map,
8773 			    vm_map_offset_t, entry->vme_start,
8774 			    vm_map_offset_t, entry->vme_end,
8775 			    int, VME_ALIAS(entry));
8776 			vm_map_iokit_unmapped_region(map,
8777 			    (entry->vme_end -
8778 			    entry->vme_start));
8779 			entry->iokit_acct = FALSE;
8780 			entry->use_pmap = FALSE;
8781 		}
8782 
8783 		/* move "s" forward */
8784 		s    = entry->vme_end;
8785 		next = entry->vme_next;
8786 		if (!entry->map_aligned) {
8787 			vm_map_offset_t rounded_s;
8788 
8789 			/*
8790 			 * Skip artificial gap due to mis-aligned entry
8791 			 * on devices with a page size smaller than the
8792 			 * map's page size (i.e. 16k task on a 4k device).
8793 			 */
8794 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8795 			if (next == vm_map_to_entry(map)) {
8796 				s = rounded_s;
8797 			} else if (s < rounded_s) {
8798 				s = MIN(rounded_s, next->vme_start);
8799 			}
8800 		}
8801 		ret.kmr_size += s - entry->vme_start;
8802 
8803 		if (entry->vme_permanent) {
8804 			/*
8805 			 * A permanent entry can not be removed, so leave it
8806 			 * in place but remove all access permissions.
8807 			 */
8808 			if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8809 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8810 				    __FUNCTION__, __LINE__,
8811 				    proc_selfpid(),
8812 				    (get_bsdtask_info(current_task())
8813 				    ? proc_name_address(get_bsdtask_info(current_task()))
8814 				    : "?"),
8815 				    map,
8816 				    entry,
8817 				    (uint64_t)entry->vme_start,
8818 				    (uint64_t)entry->vme_end,
8819 				    entry->is_sub_map,
8820 				    entry->protection,
8821 				    entry->max_protection);
8822 			}
8823 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8824 			    vm_map_entry_t, entry,
8825 			    vm_map_offset_t, entry->vme_start,
8826 			    vm_map_offset_t, entry->vme_end,
8827 			    vm_prot_t, entry->protection,
8828 			    vm_prot_t, entry->max_protection,
8829 			    int, VME_ALIAS(entry));
8830 			entry->protection = VM_PROT_NONE;
8831 			entry->max_protection = VM_PROT_NONE;
8832 #ifdef __arm64e__
8833 			entry->used_for_tpro = FALSE;
8834 #endif
8835 		} else {
8836 			vm_map_entry_zap(map, entry, zap_list);
8837 		}
8838 
8839 		entry = next;
8840 		next  = VM_MAP_ENTRY_NULL;
8841 
8842 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8843 			unsigned int last_timestamp = map->timestamp++;
8844 
8845 			if (lck_rw_lock_yield_exclusive(&map->lock,
8846 			    LCK_RW_YIELD_ANY_WAITER)) {
8847 				if (last_timestamp != map->timestamp + 1) {
8848 					state |= VMDS_NEEDS_LOOKUP;
8849 				}
8850 			} else {
8851 				/* we didn't yield, undo our change */
8852 				map->timestamp--;
8853 			}
8854 		}
8855 	}
8856 
8857 	if (map->wait_for_space) {
8858 		thread_wakeup((event_t) map);
8859 	}
8860 
8861 	if (state & VMDS_NEEDS_WAKEUP) {
8862 		vm_map_entry_wakeup(map);
8863 	}
8864 
8865 out:
8866 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8867 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8868 	}
8869 
8870 	if (state & VMDS_KERNEL_KMEMPTR) {
8871 		kmem_free_space(start, end, range_id, &slot);
8872 	}
8873 
8874 	if (state & VMDS_FOUND_GAP) {
8875 		DTRACE_VM3(kern_vm_deallocate_gap,
8876 		    vm_map_offset_t, gap_start,
8877 		    vm_map_offset_t, save_start,
8878 		    vm_map_offset_t, save_end);
8879 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8880 			ret.kmr_return = KERN_INVALID_VALUE;
8881 		} else {
8882 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8883 		}
8884 	}
8885 
8886 	return ret;
8887 }
8888 
8889 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8890 vm_map_remove_and_unlock(
8891 	vm_map_t        map,
8892 	vm_map_offset_t start,
8893 	vm_map_offset_t end,
8894 	vmr_flags_t     flags,
8895 	kmem_guard_t    guard)
8896 {
8897 	kmem_return_t ret;
8898 	VM_MAP_ZAP_DECLARE(zap);
8899 
8900 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8901 	vm_map_unlock(map);
8902 
8903 	vm_map_zap_dispose(&zap);
8904 
8905 	return ret;
8906 }
8907 
8908 /*
8909  *	vm_map_remove_guard:
8910  *
8911  *	Remove the given address range from the target map.
8912  *	This is the exported form of vm_map_delete.
8913  */
8914 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8915 vm_map_remove_guard(
8916 	vm_map_t        map,
8917 	vm_map_offset_t start,
8918 	vm_map_offset_t end,
8919 	vmr_flags_t     flags,
8920 	kmem_guard_t    guard)
8921 {
8922 	vm_map_lock(map);
8923 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8924 }
8925 
8926 /*
8927  *	vm_map_terminate:
8928  *
8929  *	Clean out a task's map.
8930  */
8931 kern_return_t
vm_map_terminate(vm_map_t map)8932 vm_map_terminate(
8933 	vm_map_t        map)
8934 {
8935 	vm_map_lock(map);
8936 	map->terminated = TRUE;
8937 	vm_map_disable_hole_optimization(map);
8938 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8939 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8940 	return KERN_SUCCESS;
8941 }
8942 
8943 /*
8944  *	Routine:	vm_map_copy_allocate
8945  *
8946  *	Description:
8947  *		Allocates and initializes a map copy object.
8948  */
8949 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)8950 vm_map_copy_allocate(uint16_t type)
8951 {
8952 	vm_map_copy_t new_copy;
8953 
8954 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8955 	new_copy->type = type;
8956 	if (type == VM_MAP_COPY_ENTRY_LIST) {
8957 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8958 		vm_map_store_init(&new_copy->cpy_hdr);
8959 	}
8960 	return new_copy;
8961 }
8962 
8963 /*
8964  *	Routine:	vm_map_copy_discard
8965  *
8966  *	Description:
8967  *		Dispose of a map copy object (returned by
8968  *		vm_map_copyin).
8969  */
8970 void
vm_map_copy_discard(vm_map_copy_t copy)8971 vm_map_copy_discard(
8972 	vm_map_copy_t   copy)
8973 {
8974 	if (copy == VM_MAP_COPY_NULL) {
8975 		return;
8976 	}
8977 
8978 	/*
8979 	 * Assert that the vm_map_copy is coming from the right
8980 	 * zone and hasn't been forged
8981 	 */
8982 	vm_map_copy_require(copy);
8983 
8984 	switch (copy->type) {
8985 	case VM_MAP_COPY_ENTRY_LIST:
8986 		while (vm_map_copy_first_entry(copy) !=
8987 		    vm_map_copy_to_entry(copy)) {
8988 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
8989 
8990 			vm_map_copy_entry_unlink(copy, entry);
8991 			if (entry->is_sub_map) {
8992 				vm_map_deallocate(VME_SUBMAP(entry));
8993 			} else {
8994 				vm_object_deallocate(VME_OBJECT(entry));
8995 			}
8996 			vm_map_copy_entry_dispose(entry);
8997 		}
8998 		break;
8999 	case VM_MAP_COPY_KERNEL_BUFFER:
9000 
9001 		/*
9002 		 * The vm_map_copy_t and possibly the data buffer were
9003 		 * allocated by a single call to kalloc_data(), i.e. the
9004 		 * vm_map_copy_t was not allocated out of the zone.
9005 		 */
9006 		if (copy->size > msg_ool_size_small || copy->offset) {
9007 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9008 			    (long long)copy->size, (long long)copy->offset);
9009 		}
9010 		kfree_data(copy->cpy_kdata, copy->size);
9011 	}
9012 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9013 }
9014 
9015 #if XNU_PLATFORM_MacOSX
9016 
9017 __exported
9018 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9019 
9020 /*
9021  *	Routine:	vm_map_copy_copy
9022  *
9023  *	Description:
9024  *			Move the information in a map copy object to
9025  *			a new map copy object, leaving the old one
9026  *			empty.
9027  *
9028  *			This is used by kernel routines that need
9029  *			to look at out-of-line data (in copyin form)
9030  *			before deciding whether to return SUCCESS.
9031  *			If the routine returns FAILURE, the original
9032  *			copy object will be deallocated; therefore,
9033  *			these routines must make a copy of the copy
9034  *			object and leave the original empty so that
9035  *			deallocation will not fail.
9036  */
9037 vm_map_copy_t
vm_map_copy_copy(vm_map_copy_t copy)9038 vm_map_copy_copy(
9039 	vm_map_copy_t   copy)
9040 {
9041 	vm_map_copy_t   new_copy;
9042 
9043 	if (copy == VM_MAP_COPY_NULL) {
9044 		return VM_MAP_COPY_NULL;
9045 	}
9046 
9047 	/*
9048 	 * Assert that the vm_map_copy is coming from the right
9049 	 * zone and hasn't been forged
9050 	 */
9051 	vm_map_copy_require(copy);
9052 
9053 	/*
9054 	 * Allocate a new copy object, and copy the information
9055 	 * from the old one into it.
9056 	 */
9057 
9058 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9059 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9060 #if __has_feature(ptrauth_calls)
9061 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9062 		new_copy->cpy_kdata = copy->cpy_kdata;
9063 	}
9064 #endif
9065 
9066 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9067 		/*
9068 		 * The links in the entry chain must be
9069 		 * changed to point to the new copy object.
9070 		 */
9071 		vm_map_copy_first_entry(copy)->vme_prev
9072 		        = vm_map_copy_to_entry(new_copy);
9073 		vm_map_copy_last_entry(copy)->vme_next
9074 		        = vm_map_copy_to_entry(new_copy);
9075 	}
9076 
9077 	/*
9078 	 * Change the old copy object into one that contains
9079 	 * nothing to be deallocated.
9080 	 */
9081 	bzero(copy, sizeof(struct vm_map_copy));
9082 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9083 
9084 	/*
9085 	 * Return the new object.
9086 	 */
9087 	return new_copy;
9088 }
9089 
9090 #endif /* XNU_PLATFORM_MacOSX */
9091 
9092 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9093 vm_map_entry_is_overwritable(
9094 	vm_map_t        dst_map __unused,
9095 	vm_map_entry_t  entry)
9096 {
9097 	if (!(entry->protection & VM_PROT_WRITE)) {
9098 		/* can't overwrite if not writable */
9099 		return FALSE;
9100 	}
9101 #if !__x86_64__
9102 	if (entry->used_for_jit &&
9103 	    vm_map_cs_enforcement(dst_map) &&
9104 	    !dst_map->cs_debugged) {
9105 		/*
9106 		 * Can't overwrite a JIT region while cs_enforced
9107 		 * and not cs_debugged.
9108 		 */
9109 		return FALSE;
9110 	}
9111 
9112 #if __arm64e__
9113 	/* Do not allow overwrite HW assisted TPRO entries */
9114 	if (entry->used_for_tpro) {
9115 		return FALSE;
9116 	}
9117 #endif /* __arm64e__ */
9118 
9119 	if (entry->vme_permanent) {
9120 		if (entry->is_sub_map) {
9121 			/*
9122 			 * We can't tell if the submap contains "permanent"
9123 			 * entries within the range targeted by the caller.
9124 			 * The caller will have to check for that with
9125 			 * vm_map_overwrite_submap_recurse() for example.
9126 			 */
9127 		} else {
9128 			/*
9129 			 * Do not allow overwriting of a "permanent"
9130 			 * entry.
9131 			 */
9132 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9133 			    vm_map_entry_t, entry,
9134 			    vm_map_offset_t, entry->vme_start,
9135 			    vm_map_offset_t, entry->vme_end,
9136 			    vm_prot_t, entry->protection,
9137 			    vm_prot_t, entry->max_protection,
9138 			    int, VME_ALIAS(entry));
9139 			return FALSE;
9140 		}
9141 	}
9142 #endif /* !__x86_64__ */
9143 
9144 	if (entry->is_sub_map) {
9145 		/* remember not to assume every entry has a VM object... */
9146 	}
9147 
9148 	return TRUE;
9149 }
9150 
/*
 *	Routine:	vm_map_overwrite_submap_recurse
 *
 *	Description:
 *		Verify that the range [dst_addr, dst_addr + dst_size)
 *		of "dst_map" can be overwritten: every entry must be
 *		writable and overwritable, and the range must be fully
 *		mapped (contiguous, no holes).  Submaps encountered in
 *		the range are checked recursively.
 *
 *		"dst_map" must be unlocked on entry; it is locked and
 *		unlocked internally and returned unlocked.
 *
 *		Returns KERN_SUCCESS when the whole range checks out,
 *		KERN_INVALID_ADDRESS on a lookup failure or hole,
 *		KERN_PROTECTION_FAILURE for non-overwritable entries,
 *		or KERN_FAILURE (see the permanent-object check below).
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 *	Verify that the destination is all writeable
	 *	initially.  We have to trunc the destination
	 *	address and round the copy size or we'll end up
	 *	splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/*
			 * Translate the portion of [dst_addr, dst_end)
			 * covered by this entry into submap offsets
			 * [sub_start, sub_end) for the recursive check.
			 */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			/* remember where to resume after the map is unlocked */
			local_end = entry->vme_end;
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			/* dst_map is unlocked here on every return path */
			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): "entry" is dereferenced here after
			 * dst_map was unlocked above; the continue path
			 * below re-looks it up via local_end instead.
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			vm_map_lock(dst_map);
			/* the map changed while unlocked: look up again */
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}
		assert(!entry->is_sub_map);

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

/*
 *		our range is contained completely within this map entry
 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
/*
 *		check that range specified is contiguous region
 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 *	Check for permanent objects in the destination.
		 *
		 *	An external (non-internal) or truly-shared object
		 *	only fails the check when a submap was encountered
		 *	earlier in the walk.
		 */
		assert(!entry->is_sub_map);
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9305 
9306 /*
9307  *	Routine:	vm_map_copy_overwrite
9308  *
9309  *	Description:
9310  *		Copy the memory described by the map copy
9311  *		object (copy; returned by vm_map_copyin) onto
9312  *		the specified destination region (dst_map, dst_addr).
9313  *		The destination must be writeable.
9314  *
9315  *		Unlike vm_map_copyout, this routine actually
9316  *		writes over previously-mapped memory.  If the
9317  *		previous mapping was to a permanent (user-supplied)
9318  *		memory object, it is preserved.
9319  *
9320  *		The attributes (protection and inheritance) of the
9321  *		destination region are preserved.
9322  *
9323  *		If successful, consumes the copy object.
9324  *		Otherwise, the caller is responsible for it.
9325  *
9326  *	Implementation notes:
9327  *		To overwrite aligned temporary virtual memory, it is
9328  *		sufficient to remove the previous mapping and insert
9329  *		the new copy.  This replacement is done either on
9330  *		the whole region (if no permanent virtual memory
9331  *		objects are embedded in the destination region) or
9332  *		in individual map entries.
9333  *
 *		To overwrite permanent virtual memory, it is necessary
9335  *		to copy each page, as the external memory management
9336  *		interface currently does not provide any optimizations.
9337  *
9338  *		Unaligned memory also has to be copied.  It is possible
9339  *		to use 'vm_trickery' to copy the aligned data.  This is
9340  *		not done but not hard to implement.
9341  *
9342  *		Once a page of permanent memory has been overwritten,
9343  *		it is impossible to interrupt this function; otherwise,
9344  *		the call would be neither atomic nor location-independent.
9345  *		The kernel-state portion of a user thread must be
9346  *		interruptible.
9347  *
9348  *		It may be expensive to forward all requests that might
9349  *		overwrite permanent memory (vm_write, vm_copy) to
9350  *		uninterruptible kernel threads.  This routine may be
9351  *		called by interruptible threads; however, success is
9352  *		not guaranteed -- if the request cannot be performed
9353  *		atomically and interruptibly, an error indication is
9354  *		returned.
9355  *
9356  *		Callers of this function must call vm_map_copy_require on
9357  *		previously created vm_map_copy_t or pass a newly created
9358  *		one to ensure that it hasn't been forged.
9359  */
9360 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9361 vm_map_copy_overwrite_nested(
9362 	vm_map_t                dst_map,
9363 	vm_map_address_t        dst_addr,
9364 	vm_map_copy_t           copy,
9365 	boolean_t               interruptible,
9366 	pmap_t                  pmap,
9367 	boolean_t               discard_on_success)
9368 {
9369 	vm_map_offset_t         dst_end;
9370 	vm_map_entry_t          tmp_entry;
9371 	vm_map_entry_t          entry;
9372 	kern_return_t           kr;
9373 	boolean_t               aligned = TRUE;
9374 	boolean_t               contains_permanent_objects = FALSE;
9375 	boolean_t               encountered_sub_map = FALSE;
9376 	vm_map_offset_t         base_addr;
9377 	vm_map_size_t           copy_size;
9378 	vm_map_size_t           total_size;
9379 	uint16_t                copy_page_shift;
9380 
9381 	/*
9382 	 *	Check for special kernel buffer allocated
9383 	 *	by new_ipc_kmsg_copyin.
9384 	 */
9385 
9386 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9387 		kr = vm_map_copyout_kernel_buffer(
9388 			dst_map, &dst_addr,
9389 			copy, copy->size, TRUE, discard_on_success);
9390 		return kr;
9391 	}
9392 
9393 	/*
9394 	 *      Only works for entry lists at the moment.  Will
9395 	 *	support page lists later.
9396 	 */
9397 
9398 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9399 
9400 	if (copy->size == 0) {
9401 		if (discard_on_success) {
9402 			vm_map_copy_discard(copy);
9403 		}
9404 		return KERN_SUCCESS;
9405 	}
9406 
9407 	copy_page_shift = copy->cpy_hdr.page_shift;
9408 
9409 	/*
9410 	 *	Verify that the destination is all writeable
9411 	 *	initially.  We have to trunc the destination
9412 	 *	address and round the copy size or we'll end up
9413 	 *	splitting entries in strange ways.
9414 	 */
9415 
9416 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9417 	    VM_MAP_PAGE_MASK(dst_map)) ||
9418 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9419 	    VM_MAP_PAGE_MASK(dst_map)) ||
9420 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9421 	    VM_MAP_PAGE_MASK(dst_map)) ||
9422 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9423 		aligned = FALSE;
9424 		dst_end = vm_map_round_page(dst_addr + copy->size,
9425 		    VM_MAP_PAGE_MASK(dst_map));
9426 	} else {
9427 		dst_end = dst_addr + copy->size;
9428 	}
9429 
9430 	vm_map_lock(dst_map);
9431 
9432 	/* LP64todo - remove this check when vm_map_commpage64()
9433 	 * no longer has to stuff in a map_entry for the commpage
9434 	 * above the map's max_offset.
9435 	 */
9436 	if (dst_addr >= dst_map->max_offset) {
9437 		vm_map_unlock(dst_map);
9438 		return KERN_INVALID_ADDRESS;
9439 	}
9440 
9441 start_pass_1:
9442 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9443 		vm_map_unlock(dst_map);
9444 		return KERN_INVALID_ADDRESS;
9445 	}
9446 	vm_map_clip_start(dst_map,
9447 	    tmp_entry,
9448 	    vm_map_trunc_page(dst_addr,
9449 	    VM_MAP_PAGE_MASK(dst_map)));
9450 	for (entry = tmp_entry;;) {
9451 		vm_map_entry_t  next = entry->vme_next;
9452 
9453 		while (entry->is_sub_map) {
9454 			vm_map_offset_t sub_start;
9455 			vm_map_offset_t sub_end;
9456 			vm_map_offset_t local_end;
9457 
9458 			if (entry->in_transition) {
9459 				/*
9460 				 * Say that we are waiting, and wait for entry.
9461 				 */
9462 				entry->needs_wakeup = TRUE;
9463 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9464 
9465 				goto start_pass_1;
9466 			}
9467 
9468 			local_end = entry->vme_end;
9469 			if (!(entry->needs_copy)) {
9470 				/* if needs_copy we are a COW submap */
9471 				/* in such a case we just replace so */
9472 				/* there is no need for the follow-  */
9473 				/* ing check.                        */
9474 				encountered_sub_map = TRUE;
9475 				sub_start = VME_OFFSET(entry);
9476 
9477 				if (entry->vme_end < dst_end) {
9478 					sub_end = entry->vme_end;
9479 				} else {
9480 					sub_end = dst_end;
9481 				}
9482 				sub_end -= entry->vme_start;
9483 				sub_end += VME_OFFSET(entry);
9484 				vm_map_unlock(dst_map);
9485 
9486 				kr = vm_map_overwrite_submap_recurse(
9487 					VME_SUBMAP(entry),
9488 					sub_start,
9489 					sub_end - sub_start);
9490 				if (kr != KERN_SUCCESS) {
9491 					return kr;
9492 				}
9493 				vm_map_lock(dst_map);
9494 			}
9495 
9496 			if (dst_end <= entry->vme_end) {
9497 				goto start_overwrite;
9498 			}
9499 			if (!vm_map_lookup_entry(dst_map, local_end,
9500 			    &entry)) {
9501 				vm_map_unlock(dst_map);
9502 				return KERN_INVALID_ADDRESS;
9503 			}
9504 			next = entry->vme_next;
9505 		}
9506 		assert(!entry->is_sub_map);
9507 
9508 		if (!(entry->protection & VM_PROT_WRITE)) {
9509 			vm_map_unlock(dst_map);
9510 			return KERN_PROTECTION_FAILURE;
9511 		}
9512 
9513 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9514 			vm_map_unlock(dst_map);
9515 			return KERN_PROTECTION_FAILURE;
9516 		}
9517 
9518 		/*
9519 		 *	If the entry is in transition, we must wait
9520 		 *	for it to exit that state.  Anything could happen
9521 		 *	when we unlock the map, so start over.
9522 		 */
9523 		if (entry->in_transition) {
9524 			/*
9525 			 * Say that we are waiting, and wait for entry.
9526 			 */
9527 			entry->needs_wakeup = TRUE;
9528 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9529 
9530 			goto start_pass_1;
9531 		}
9532 
9533 /*
9534  *		our range is contained completely within this map entry
9535  */
9536 		if (dst_end <= entry->vme_end) {
9537 			break;
9538 		}
9539 /*
9540  *		check that range specified is contiguous region
9541  */
9542 		if ((next == vm_map_to_entry(dst_map)) ||
9543 		    (next->vme_start != entry->vme_end)) {
9544 			vm_map_unlock(dst_map);
9545 			return KERN_INVALID_ADDRESS;
9546 		}
9547 
9548 
9549 		/*
9550 		 *	Check for permanent objects in the destination.
9551 		 */
9552 		assert(!entry->is_sub_map);
9553 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9554 		    ((!VME_OBJECT(entry)->internal) ||
9555 		    (VME_OBJECT(entry)->true_share))) {
9556 			contains_permanent_objects = TRUE;
9557 		}
9558 
9559 		entry = next;
9560 	}/* for */
9561 
9562 start_overwrite:
9563 	/*
9564 	 *	If there are permanent objects in the destination, then
9565 	 *	the copy cannot be interrupted.
9566 	 */
9567 
9568 	if (interruptible && contains_permanent_objects) {
9569 		vm_map_unlock(dst_map);
9570 		return KERN_FAILURE;   /* XXX */
9571 	}
9572 
9573 	/*
9574 	 *
9575 	 *	Make a second pass, overwriting the data
9576 	 *	At the beginning of each loop iteration,
9577 	 *	the next entry to be overwritten is "tmp_entry"
9578 	 *	(initially, the value returned from the lookup above),
9579 	 *	and the starting address expected in that entry
9580 	 *	is "start".
9581 	 */
9582 
9583 	total_size = copy->size;
9584 	if (encountered_sub_map) {
9585 		copy_size = 0;
9586 		/* re-calculate tmp_entry since we've had the map */
9587 		/* unlocked */
9588 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9589 			vm_map_unlock(dst_map);
9590 			return KERN_INVALID_ADDRESS;
9591 		}
9592 	} else {
9593 		copy_size = copy->size;
9594 	}
9595 
9596 	base_addr = dst_addr;
9597 	while (TRUE) {
9598 		/* deconstruct the copy object and do in parts */
9599 		/* only in sub_map, interruptable case */
9600 		vm_map_entry_t  copy_entry;
9601 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9602 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9603 		int             nentries;
9604 		int             remaining_entries = 0;
9605 		vm_map_offset_t new_offset = 0;
9606 
9607 		for (entry = tmp_entry; copy_size == 0;) {
9608 			vm_map_entry_t  next;
9609 
9610 			next = entry->vme_next;
9611 
9612 			/* tmp_entry and base address are moved along */
9613 			/* each time we encounter a sub-map.  Otherwise */
9614 			/* entry can outpase tmp_entry, and the copy_size */
9615 			/* may reflect the distance between them */
9616 			/* if the current entry is found to be in transition */
9617 			/* we will start over at the beginning or the last */
9618 			/* encounter of a submap as dictated by base_addr */
9619 			/* we will zero copy_size accordingly. */
9620 			if (entry->in_transition) {
9621 				/*
9622 				 * Say that we are waiting, and wait for entry.
9623 				 */
9624 				entry->needs_wakeup = TRUE;
9625 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9626 
9627 				if (!vm_map_lookup_entry(dst_map, base_addr,
9628 				    &tmp_entry)) {
9629 					vm_map_unlock(dst_map);
9630 					return KERN_INVALID_ADDRESS;
9631 				}
9632 				copy_size = 0;
9633 				entry = tmp_entry;
9634 				continue;
9635 			}
9636 			if (entry->is_sub_map) {
9637 				vm_map_offset_t sub_start;
9638 				vm_map_offset_t sub_end;
9639 				vm_map_offset_t local_end;
9640 
9641 				if (entry->needs_copy) {
9642 					/* if this is a COW submap */
9643 					/* just back the range with a */
9644 					/* anonymous entry */
9645 					assert(!entry->vme_permanent);
9646 					if (entry->vme_end < dst_end) {
9647 						sub_end = entry->vme_end;
9648 					} else {
9649 						sub_end = dst_end;
9650 					}
9651 					if (entry->vme_start < base_addr) {
9652 						sub_start = base_addr;
9653 					} else {
9654 						sub_start = entry->vme_start;
9655 					}
9656 					vm_map_clip_end(
9657 						dst_map, entry, sub_end);
9658 					vm_map_clip_start(
9659 						dst_map, entry, sub_start);
9660 					assert(!entry->use_pmap);
9661 					assert(!entry->iokit_acct);
9662 					entry->use_pmap = TRUE;
9663 					vm_map_deallocate(VME_SUBMAP(entry));
9664 					assert(!entry->vme_permanent);
9665 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9666 					VME_OFFSET_SET(entry, 0);
9667 					entry->is_shared = FALSE;
9668 					entry->needs_copy = FALSE;
9669 					entry->protection = VM_PROT_DEFAULT;
9670 					entry->max_protection = VM_PROT_ALL;
9671 					entry->wired_count = 0;
9672 					entry->user_wired_count = 0;
9673 					if (entry->inheritance
9674 					    == VM_INHERIT_SHARE) {
9675 						entry->inheritance = VM_INHERIT_COPY;
9676 					}
9677 					continue;
9678 				}
9679 				/* first take care of any non-sub_map */
9680 				/* entries to send */
9681 				if (base_addr < entry->vme_start) {
9682 					/* stuff to send */
9683 					copy_size =
9684 					    entry->vme_start - base_addr;
9685 					break;
9686 				}
9687 				sub_start = VME_OFFSET(entry);
9688 
9689 				if (entry->vme_end < dst_end) {
9690 					sub_end = entry->vme_end;
9691 				} else {
9692 					sub_end = dst_end;
9693 				}
9694 				sub_end -= entry->vme_start;
9695 				sub_end += VME_OFFSET(entry);
9696 				local_end = entry->vme_end;
9697 				vm_map_unlock(dst_map);
9698 				copy_size = sub_end - sub_start;
9699 
9700 				/* adjust the copy object */
9701 				if (total_size > copy_size) {
9702 					vm_map_size_t   local_size = 0;
9703 					vm_map_size_t   entry_size;
9704 
9705 					nentries = 1;
9706 					new_offset = copy->offset;
9707 					copy_entry = vm_map_copy_first_entry(copy);
9708 					while (copy_entry !=
9709 					    vm_map_copy_to_entry(copy)) {
9710 						entry_size = copy_entry->vme_end -
9711 						    copy_entry->vme_start;
9712 						if ((local_size < copy_size) &&
9713 						    ((local_size + entry_size)
9714 						    >= copy_size)) {
9715 							vm_map_copy_clip_end(copy,
9716 							    copy_entry,
9717 							    copy_entry->vme_start +
9718 							    (copy_size - local_size));
9719 							entry_size = copy_entry->vme_end -
9720 							    copy_entry->vme_start;
9721 							local_size += entry_size;
9722 							new_offset += entry_size;
9723 						}
9724 						if (local_size >= copy_size) {
9725 							next_copy = copy_entry->vme_next;
9726 							copy_entry->vme_next =
9727 							    vm_map_copy_to_entry(copy);
9728 							previous_prev =
9729 							    copy->cpy_hdr.links.prev;
9730 							copy->cpy_hdr.links.prev = copy_entry;
9731 							copy->size = copy_size;
9732 							remaining_entries =
9733 							    copy->cpy_hdr.nentries;
9734 							remaining_entries -= nentries;
9735 							copy->cpy_hdr.nentries = nentries;
9736 							break;
9737 						} else {
9738 							local_size += entry_size;
9739 							new_offset += entry_size;
9740 							nentries++;
9741 						}
9742 						copy_entry = copy_entry->vme_next;
9743 					}
9744 				}
9745 
9746 				if ((entry->use_pmap) && (pmap == NULL)) {
9747 					kr = vm_map_copy_overwrite_nested(
9748 						VME_SUBMAP(entry),
9749 						sub_start,
9750 						copy,
9751 						interruptible,
9752 						VME_SUBMAP(entry)->pmap,
9753 						TRUE);
9754 				} else if (pmap != NULL) {
9755 					kr = vm_map_copy_overwrite_nested(
9756 						VME_SUBMAP(entry),
9757 						sub_start,
9758 						copy,
9759 						interruptible, pmap,
9760 						TRUE);
9761 				} else {
9762 					kr = vm_map_copy_overwrite_nested(
9763 						VME_SUBMAP(entry),
9764 						sub_start,
9765 						copy,
9766 						interruptible,
9767 						dst_map->pmap,
9768 						TRUE);
9769 				}
9770 				if (kr != KERN_SUCCESS) {
9771 					if (next_copy != NULL) {
9772 						copy->cpy_hdr.nentries +=
9773 						    remaining_entries;
9774 						copy->cpy_hdr.links.prev->vme_next =
9775 						    next_copy;
9776 						copy->cpy_hdr.links.prev
9777 						        = previous_prev;
9778 						copy->size = total_size;
9779 					}
9780 					return kr;
9781 				}
9782 				if (dst_end <= local_end) {
9783 					return KERN_SUCCESS;
9784 				}
9785 				/* otherwise copy no longer exists, it was */
9786 				/* destroyed after successful copy_overwrite */
9787 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9788 				copy->offset = new_offset;
9789 				copy->cpy_hdr.page_shift = copy_page_shift;
9790 
9791 				total_size -= copy_size;
9792 				copy_size = 0;
9793 				/* put back remainder of copy in container */
9794 				if (next_copy != NULL) {
9795 					copy->cpy_hdr.nentries = remaining_entries;
9796 					copy->cpy_hdr.links.next = next_copy;
9797 					copy->cpy_hdr.links.prev = previous_prev;
9798 					copy->size = total_size;
9799 					next_copy->vme_prev =
9800 					    vm_map_copy_to_entry(copy);
9801 					next_copy = NULL;
9802 				}
9803 				base_addr = local_end;
9804 				vm_map_lock(dst_map);
9805 				if (!vm_map_lookup_entry(dst_map,
9806 				    local_end, &tmp_entry)) {
9807 					vm_map_unlock(dst_map);
9808 					return KERN_INVALID_ADDRESS;
9809 				}
9810 				entry = tmp_entry;
9811 				continue;
9812 			}
9813 			assert(!entry->is_sub_map);
9814 
9815 			if (dst_end <= entry->vme_end) {
9816 				copy_size = dst_end - base_addr;
9817 				break;
9818 			}
9819 
9820 			if ((next == vm_map_to_entry(dst_map)) ||
9821 			    (next->vme_start != entry->vme_end)) {
9822 				vm_map_unlock(dst_map);
9823 				return KERN_INVALID_ADDRESS;
9824 			}
9825 
9826 			entry = next;
9827 		}/* for */
9828 
9829 		next_copy = NULL;
9830 		nentries = 1;
9831 
9832 		/* adjust the copy object */
9833 		if (total_size > copy_size) {
9834 			vm_map_size_t   local_size = 0;
9835 			vm_map_size_t   entry_size;
9836 
9837 			new_offset = copy->offset;
9838 			copy_entry = vm_map_copy_first_entry(copy);
9839 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9840 				entry_size = copy_entry->vme_end -
9841 				    copy_entry->vme_start;
9842 				if ((local_size < copy_size) &&
9843 				    ((local_size + entry_size)
9844 				    >= copy_size)) {
9845 					vm_map_copy_clip_end(copy, copy_entry,
9846 					    copy_entry->vme_start +
9847 					    (copy_size - local_size));
9848 					entry_size = copy_entry->vme_end -
9849 					    copy_entry->vme_start;
9850 					local_size += entry_size;
9851 					new_offset += entry_size;
9852 				}
9853 				if (local_size >= copy_size) {
9854 					next_copy = copy_entry->vme_next;
9855 					copy_entry->vme_next =
9856 					    vm_map_copy_to_entry(copy);
9857 					previous_prev =
9858 					    copy->cpy_hdr.links.prev;
9859 					copy->cpy_hdr.links.prev = copy_entry;
9860 					copy->size = copy_size;
9861 					remaining_entries =
9862 					    copy->cpy_hdr.nentries;
9863 					remaining_entries -= nentries;
9864 					copy->cpy_hdr.nentries = nentries;
9865 					break;
9866 				} else {
9867 					local_size += entry_size;
9868 					new_offset += entry_size;
9869 					nentries++;
9870 				}
9871 				copy_entry = copy_entry->vme_next;
9872 			}
9873 		}
9874 
9875 		if (aligned) {
9876 			pmap_t  local_pmap;
9877 
9878 			if (pmap) {
9879 				local_pmap = pmap;
9880 			} else {
9881 				local_pmap = dst_map->pmap;
9882 			}
9883 
9884 			if ((kr =  vm_map_copy_overwrite_aligned(
9885 				    dst_map, tmp_entry, copy,
9886 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9887 				if (next_copy != NULL) {
9888 					copy->cpy_hdr.nentries +=
9889 					    remaining_entries;
9890 					copy->cpy_hdr.links.prev->vme_next =
9891 					    next_copy;
9892 					copy->cpy_hdr.links.prev =
9893 					    previous_prev;
9894 					copy->size += copy_size;
9895 				}
9896 				return kr;
9897 			}
9898 			vm_map_unlock(dst_map);
9899 		} else {
9900 			/*
9901 			 * Performance gain:
9902 			 *
9903 			 * if the copy and dst address are misaligned but the same
9904 			 * offset within the page we can copy_not_aligned the
9905 			 * misaligned parts and copy aligned the rest.  If they are
9906 			 * aligned but len is unaligned we simply need to copy
9907 			 * the end bit unaligned.  We'll need to split the misaligned
9908 			 * bits of the region in this case !
9909 			 */
9910 			/* ALWAYS UNLOCKS THE dst_map MAP */
9911 			kr = vm_map_copy_overwrite_unaligned(
9912 				dst_map,
9913 				tmp_entry,
9914 				copy,
9915 				base_addr,
9916 				discard_on_success);
9917 			if (kr != KERN_SUCCESS) {
9918 				if (next_copy != NULL) {
9919 					copy->cpy_hdr.nentries +=
9920 					    remaining_entries;
9921 					copy->cpy_hdr.links.prev->vme_next =
9922 					    next_copy;
9923 					copy->cpy_hdr.links.prev =
9924 					    previous_prev;
9925 					copy->size += copy_size;
9926 				}
9927 				return kr;
9928 			}
9929 		}
9930 		total_size -= copy_size;
9931 		if (total_size == 0) {
9932 			break;
9933 		}
9934 		base_addr += copy_size;
9935 		copy_size = 0;
9936 		copy->offset = new_offset;
9937 		if (next_copy != NULL) {
9938 			copy->cpy_hdr.nentries = remaining_entries;
9939 			copy->cpy_hdr.links.next = next_copy;
9940 			copy->cpy_hdr.links.prev = previous_prev;
9941 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9942 			copy->size = total_size;
9943 		}
9944 		vm_map_lock(dst_map);
9945 		while (TRUE) {
9946 			if (!vm_map_lookup_entry(dst_map,
9947 			    base_addr, &tmp_entry)) {
9948 				vm_map_unlock(dst_map);
9949 				return KERN_INVALID_ADDRESS;
9950 			}
9951 			if (tmp_entry->in_transition) {
9952 				entry->needs_wakeup = TRUE;
9953 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9954 			} else {
9955 				break;
9956 			}
9957 		}
9958 		vm_map_clip_start(dst_map,
9959 		    tmp_entry,
9960 		    vm_map_trunc_page(base_addr,
9961 		    VM_MAP_PAGE_MASK(dst_map)));
9962 
9963 		entry = tmp_entry;
9964 	} /* while */
9965 
9966 	/*
9967 	 *	Throw away the vm_map_copy object
9968 	 */
9969 	if (discard_on_success) {
9970 		vm_map_copy_discard(copy);
9971 	}
9972 
9973 	return KERN_SUCCESS;
9974 }/* vm_map_copy_overwrite */
9975 
9976 static __attribute__((always_inline, warn_unused_result))
9977 kern_return_t
vm_map_copy_addr_size_sanitize(vm_map_t map,vm_map_offset_ut addr_u,vm_map_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * addr,vm_map_offset_t * end,vm_map_size_t * size)9978 vm_map_copy_addr_size_sanitize(
9979 	vm_map_t                map,
9980 	vm_map_offset_ut        addr_u,
9981 	vm_map_size_ut          size_u,
9982 	vm_sanitize_caller_t    vm_sanitize_caller,
9983 	vm_map_offset_t        *addr,
9984 	vm_map_offset_t        *end,
9985 	vm_map_size_t          *size)
9986 {
9987 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
9988 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
9989 
9990 
9991 	return vm_sanitize_addr_size(addr_u, size_u,
9992 	           vm_sanitize_caller, map,
9993 	           flags,
9994 	           addr, end, size);
9995 }
9996 
/*
 *	Routine:	vm_map_copy_overwrite
 *
 *	Description:
 *		Overwrite a range of "dst_map" with the data carried by
 *		"copy" (an entry-list vm_map_copy) without deallocating
 *		the destination first.  When the copy is large enough and
 *		source/destination share the same mis-alignment within a
 *		page, the work is split into up to three parts — an
 *		unaligned "head", a page-aligned middle, and an unaligned
 *		"tail" — so the middle can take the aligned path inside
 *		vm_map_copy_overwrite_nested().  Otherwise the whole range
 *		is handled by a single "blunt" nested call.
 *
 *	Parameters:
 *		dst_map		target address map
 *		dst_addr_u	unsanitized destination address
 *		copy		vm_map_copy holding the source data
 *				(consumed on success)
 *		copy_size_u	unsanitized size of the copy
 *		interruptible	TRUE if the copy may be interrupted
 *
 *	Returns:
 *		KERN_SUCCESS on success (all copy maps discarded).  On
 *		failure, "copy" is re-assembled from the head/tail pieces
 *		so the caller still owns a coherent copy object.
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t                dst_map,
	vm_map_offset_ut        dst_addr_u,
	vm_map_copy_t           copy,
	vm_map_size_ut          copy_size_u,
	boolean_t               interruptible)
{
	vm_map_offset_t dst_addr, dst_end;
	vm_map_size_t   copy_size;
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 *	Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_copy_addr_size_sanitize(
		dst_map,
		dst_addr_u,
		copy_size_u,
		VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
		&dst_addr,
		&dst_end,
		&copy_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		kr = vm_map_copy_overwrite_nested(dst_map,
		    dst_addr,
		    copy,
		    interruptible,
		    (pmap_t) NULL,
		    TRUE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
		}
		return kr;
	}

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): the "then" branch below is unreachable — the check
	 * just above already jumped to blunt_copy whenever
	 * VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT, so the "else" arm is
	 * always taken here.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		/* use the coarsest page granularity of the two sides */
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		head_addr = dst_addr;
		/* bytes from copy->offset up to the next page boundary */
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		/* don't let the head extend past the copy's first entry */
		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		/* move the head's single entry over to "head_copy" */
		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		/* move the tail's last entry over to "tail_copy" */
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
		goto done;
	}

	if (tail_size) {
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
		}
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map: relink the head's
		 * entry at the front and the tail's entry at the back,
		 * restoring offset/size, so the caller gets back a
		 * coherent "copy" object.
		 */
		if (head_copy) {
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10313 
10314 
10315 /*
10316  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10317  *
10318  *	Description:
10319  *	Physically copy unaligned data
10320  *
10321  *	Implementation:
10322  *	Unaligned parts of pages have to be physically copied.  We use
10323  *	a modified form of vm_fault_copy (which understands non-aligned
10324  *	page offsets and sizes) to do the copy.  We attempt to copy as
10325  *	much memory in one go as possible, however vm_fault_copy copies
10326  *	within 1 memory object so we have to find the smaller of "amount left"
10327  *	"source object data size" and "target object data size".  With
10328  *	unaligned data we don't need to split regions, therefore the source
10329  *	(copy) object should be one map entry, the target range may be split
10330  *	over multiple map entries however.  In any event we are pessimistic
10331  *	about these assumptions.
10332  *
10333  *	Callers of this function must call vm_map_copy_require on
10334  *	previously created vm_map_copy_t or pass a newly created
10335  *	one to ensure that it hasn't been forged.
10336  *
10337  *	Assumptions:
10338  *	dst_map is locked on entry and is return locked on success,
10339  *	unlocked on error.
10340  */
10341 
10342 static kern_return_t
vm_map_copy_overwrite_unaligned(vm_map_t dst_map,vm_map_entry_t entry,vm_map_copy_t copy,vm_map_offset_t start,boolean_t discard_on_success)10343 vm_map_copy_overwrite_unaligned(
10344 	vm_map_t        dst_map,
10345 	vm_map_entry_t  entry,
10346 	vm_map_copy_t   copy,
10347 	vm_map_offset_t start,
10348 	boolean_t       discard_on_success)
10349 {
10350 	vm_map_entry_t          copy_entry;
10351 	vm_map_entry_t          copy_entry_next;
10352 	vm_map_version_t        version;
10353 	vm_object_t             dst_object;
10354 	vm_object_offset_t      dst_offset;
10355 	vm_object_offset_t      src_offset;
10356 	vm_object_offset_t      entry_offset;
10357 	vm_map_offset_t         entry_end;
10358 	vm_map_size_t           src_size,
10359 	    dst_size,
10360 	    copy_size,
10361 	    amount_left;
10362 	kern_return_t           kr = KERN_SUCCESS;
10363 
10364 
10365 	copy_entry = vm_map_copy_first_entry(copy);
10366 
10367 	vm_map_lock_write_to_read(dst_map);
10368 
10369 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10370 	amount_left = copy->size;
10371 /*
10372  *	unaligned so we never clipped this entry, we need the offset into
10373  *	the vm_object not just the data.
10374  */
10375 	while (amount_left > 0) {
10376 		if (entry == vm_map_to_entry(dst_map)) {
10377 			vm_map_unlock_read(dst_map);
10378 			return KERN_INVALID_ADDRESS;
10379 		}
10380 
10381 		/* "start" must be within the current map entry */
10382 		assert((start >= entry->vme_start) && (start < entry->vme_end));
10383 
10384 		/*
10385 		 *	Check protection again
10386 		 */
10387 		if (!(entry->protection & VM_PROT_WRITE)) {
10388 			vm_map_unlock_read(dst_map);
10389 			return KERN_PROTECTION_FAILURE;
10390 		}
10391 		if (entry->is_sub_map) {
10392 			/* not implemented... */
10393 			vm_map_unlock_read(dst_map);
10394 			return KERN_INVALID_ARGUMENT;
10395 		}
10396 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10397 			vm_map_unlock_read(dst_map);
10398 			return KERN_PROTECTION_FAILURE;
10399 		}
10400 		/*
10401 		 *	If the entry is in transition, we must wait
10402 		 *	for it to exit that state.  Anything could happen
10403 		 *	when we unlock the map, so start over.
10404 		 */
10405 		if (entry->in_transition) {
10406 			/*
10407 			 * Say that we are waiting, and wait for entry.
10408 			 */
10409 			entry->needs_wakeup = TRUE;
10410 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10411 
10412 			goto RetryLookup;
10413 		}
10414 
10415 		dst_offset = start - entry->vme_start;
10416 
10417 		dst_size = entry->vme_end - start;
10418 
10419 		src_size = copy_entry->vme_end -
10420 		    (copy_entry->vme_start + src_offset);
10421 
10422 		if (dst_size < src_size) {
10423 /*
10424  *			we can only copy dst_size bytes before
10425  *			we have to get the next destination entry
10426  */
10427 			copy_size = dst_size;
10428 		} else {
10429 /*
10430  *			we can only copy src_size bytes before
10431  *			we have to get the next source copy entry
10432  */
10433 			copy_size = src_size;
10434 		}
10435 
10436 		if (copy_size > amount_left) {
10437 			copy_size = amount_left;
10438 		}
10439 /*
10440  *		Entry needs copy, create a shadow shadow object for
10441  *		Copy on write region.
10442  */
10443 		assert(!entry->is_sub_map);
10444 		if (entry->needs_copy) {
10445 			if (vm_map_lock_read_to_write(dst_map)) {
10446 				vm_map_lock_read(dst_map);
10447 				goto RetryLookup;
10448 			}
10449 			VME_OBJECT_SHADOW(entry,
10450 			    (vm_map_size_t)(entry->vme_end
10451 			    - entry->vme_start),
10452 			    vm_map_always_shadow(dst_map));
10453 			entry->needs_copy = FALSE;
10454 			vm_map_lock_write_to_read(dst_map);
10455 		}
10456 		dst_object = VME_OBJECT(entry);
10457 /*
10458  *		unlike with the virtual (aligned) copy we're going
10459  *		to fault on it therefore we need a target object.
10460  */
10461 		if (dst_object == VM_OBJECT_NULL) {
10462 			if (vm_map_lock_read_to_write(dst_map)) {
10463 				vm_map_lock_read(dst_map);
10464 				goto RetryLookup;
10465 			}
10466 			dst_object = vm_object_allocate((vm_map_size_t)
10467 			    entry->vme_end - entry->vme_start);
10468 			VME_OBJECT_SET(entry, dst_object, false, 0);
10469 			VME_OFFSET_SET(entry, 0);
10470 			assert(entry->use_pmap);
10471 			vm_map_lock_write_to_read(dst_map);
10472 		}
10473 /*
10474  *		Take an object reference and unlock map. The "entry" may
10475  *		disappear or change when the map is unlocked.
10476  */
10477 		vm_object_reference(dst_object);
10478 		version.main_timestamp = dst_map->timestamp;
10479 		entry_offset = VME_OFFSET(entry);
10480 		entry_end = entry->vme_end;
10481 		vm_map_unlock_read(dst_map);
10482 /*
10483  *		Copy as much as possible in one pass
10484  */
10485 		kr = vm_fault_copy(
10486 			VME_OBJECT(copy_entry),
10487 			VME_OFFSET(copy_entry) + src_offset,
10488 			&copy_size,
10489 			dst_object,
10490 			entry_offset + dst_offset,
10491 			dst_map,
10492 			&version,
10493 			THREAD_UNINT );
10494 
10495 		start += copy_size;
10496 		src_offset += copy_size;
10497 		amount_left -= copy_size;
10498 /*
10499  *		Release the object reference
10500  */
10501 		vm_object_deallocate(dst_object);
10502 /*
10503  *		If a hard error occurred, return it now
10504  */
10505 		if (kr != KERN_SUCCESS) {
10506 			return kr;
10507 		}
10508 
10509 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10510 		    || amount_left == 0) {
10511 /*
10512  *			all done with this copy entry, dispose.
10513  */
10514 			copy_entry_next = copy_entry->vme_next;
10515 
10516 			if (discard_on_success) {
10517 				vm_map_copy_entry_unlink(copy, copy_entry);
10518 				assert(!copy_entry->is_sub_map);
10519 				vm_object_deallocate(VME_OBJECT(copy_entry));
10520 				vm_map_copy_entry_dispose(copy_entry);
10521 			}
10522 
10523 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10524 			    amount_left) {
10525 /*
10526  *				not finished copying but run out of source
10527  */
10528 				return KERN_INVALID_ADDRESS;
10529 			}
10530 
10531 			copy_entry = copy_entry_next;
10532 
10533 			src_offset = 0;
10534 		}
10535 
10536 		if (amount_left == 0) {
10537 			return KERN_SUCCESS;
10538 		}
10539 
10540 		vm_map_lock_read(dst_map);
10541 		if (version.main_timestamp == dst_map->timestamp) {
10542 			if (start == entry_end) {
10543 /*
10544  *				destination region is split.  Use the version
10545  *				information to avoid a lookup in the normal
10546  *				case.
10547  */
10548 				entry = entry->vme_next;
10549 /*
10550  *				should be contiguous. Fail if we encounter
10551  *				a hole in the destination.
10552  */
10553 				if (start != entry->vme_start) {
10554 					vm_map_unlock_read(dst_map);
10555 					return KERN_INVALID_ADDRESS;
10556 				}
10557 			}
10558 		} else {
10559 /*
10560  *			Map version check failed.
10561  *			we must lookup the entry because somebody
10562  *			might have changed the map behind our backs.
10563  */
10564 RetryLookup:
10565 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10566 				vm_map_unlock_read(dst_map);
10567 				return KERN_INVALID_ADDRESS;
10568 			}
10569 		}
10570 	}/* while */
10571 
10572 	return KERN_SUCCESS;
10573 }/* vm_map_copy_overwrite_unaligned */
10574 
10575 /*
10576  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10577  *
10578  *	Description:
10579  *	Does all the vm_trickery possible for whole pages.
10580  *
10581  *	Implementation:
10582  *
10583  *	If there are no permanent objects in the destination,
10584  *	and the source and destination map entry zones match,
10585  *	and the destination map entry is not shared,
10586  *	then the map entries can be deleted and replaced
10587  *	with those from the copy.  The following code is the
10588  *	basic idea of what to do, but there are lots of annoying
10589  *	little details about getting protection and inheritance
10590  *	right.  Should add protection, inheritance, and sharing checks
10591  *	to the above pass and make sure that no wiring is involved.
10592  *
10593  *	Callers of this function must call vm_map_copy_require on
10594  *	previously created vm_map_copy_t or pass a newly created
10595  *	one to ensure that it hasn't been forged.
10596  */
10597 
/*
 * Debug/telemetry counters: number of times the optimized object-swap
 * path in vm_map_copy_overwrite_aligned() was abandoned in favor of the
 * physical "slow_copy" path, broken down by reason:
 *  - src_not_internal:  source object is not anonymous (internal) memory,
 *  - src_not_symmetric: source object is "true_share" or has a
 *                       non-SYMMETRIC copy strategy,
 *  - src_large:         small copy out of a very large source object
 *                       (virtual-vs-physical copy tradeoff #1).
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10601 
/*
 * Caller must hold the dst_map lock on entry: every error path below
 * unlocks dst_map without first taking the lock, so the lock is
 * inherited from the caller.
 */
static kern_return_t
vm_map_copy_overwrite_aligned(
	vm_map_t        dst_map,
	vm_map_entry_t  tmp_entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	__unused pmap_t pmap)
{
	vm_object_t     object;
	vm_map_entry_t  copy_entry;
	vm_map_size_t   copy_size;
	vm_map_size_t   size;
	vm_map_entry_t  entry;

	/* Consume the copy's entry list one entry at a time. */
	while ((copy_entry = vm_map_copy_first_entry(copy))
	    != vm_map_copy_to_entry(copy)) {
		copy_size = (copy_entry->vme_end - copy_entry->vme_start);

		entry = tmp_entry;

		if (entry->is_sub_map) {
			/* unnested when clipped earlier */
			assert(!entry->use_pmap);
		}
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
		size = (entry->vme_end - entry->vme_start);
		/*
		 *	Make sure that no holes popped up in the
		 *	address map, and that the protection is
		 *	still valid, in case the map was unlocked
		 *	earlier.
		 */

		if ((entry->vme_start != start) || ((entry->is_sub_map)
		    && !entry->needs_copy)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
		assert(entry != vm_map_to_entry(dst_map));

		/*
		 *	Check protection again
		 */

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (entry->is_sub_map) {
			/* not properly implemented */
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto RetryLookup;
		}

		/*
		 *	Adjust to source size first
		 */

		if (copy_size < size) {
			if (entry->map_aligned &&
			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
			    VM_MAP_PAGE_MASK(dst_map))) {
				/* no longer map-aligned */
				entry->map_aligned = FALSE;
			}
			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
			size = copy_size;
		}

		/*
		 *	Adjust to destination size
		 */

		if (size < copy_size) {
			vm_map_copy_clip_end(copy, copy_entry,
			    copy_entry->vme_start + size);
			copy_size = size;
		}

		/* After clipping, source and destination entries match exactly. */
		assert((entry->vme_end - entry->vme_start) == size);
		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
		assert((copy_entry->vme_end - copy_entry->vme_start) == size);

		/*
		 *	If the destination contains temporary unshared memory,
		 *	we can perform the copy by throwing it away and
		 *	installing the source data.
		 *
		 *	Exceptions for mappings with special semantics:
		 *	+ "permanent" entries,
		 *	+ JIT regions,
		 *	+ TPRO regions,
		 *      + pmap-specific protection policies,
		 *	+ VM objects with COPY_NONE copy strategy.
		 */

		/*
		 * Two strategies below:
		 * (1) the optimized path swaps the copy entry's VM object
		 *     into the destination entry, avoiding any physical
		 *     copy;
		 * (2) "slow_copy" does a page-by-page physical copy via
		 *     vm_fault_copy().
		 */
		object = VME_OBJECT(entry);
		if ((!entry->is_shared &&
		    !entry->vme_permanent &&
		    !entry->used_for_jit &&
#if __arm64e__
		    !entry->used_for_tpro &&
#endif /* __arm64e__ */
		    !(entry->protection & VM_PROT_EXECUTE) &&
		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
		    ((object == VM_OBJECT_NULL) ||
		    (object->internal &&
		    !object->true_share &&
		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
		    entry->needs_copy) {
			vm_object_t     old_object = VME_OBJECT(entry);
			vm_object_offset_t      old_offset = VME_OFFSET(entry);
			vm_object_offset_t      offset;

			assert(!entry->is_sub_map);
			/*
			 * Ensure that the source and destination aren't
			 * identical
			 */
			if (old_object == VME_OBJECT(copy_entry) &&
			    old_offset == VME_OFFSET(copy_entry)) {
				/* nothing to do: drop this copy entry and move on */
				vm_map_copy_entry_unlink(copy, copy_entry);
				vm_map_copy_entry_dispose(copy_entry);

				if (old_object != VM_OBJECT_NULL) {
					vm_object_deallocate(old_object);
				}

				start = tmp_entry->vme_end;
				tmp_entry = tmp_entry->vme_next;
				continue;
			}

#if XNU_TARGET_OS_OSX
#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
#define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
			    copy_size <= __TRADEOFF1_COPY_SIZE) {
				/*
				 * Virtual vs. Physical copy tradeoff #1.
				 *
				 * Copying only a few pages out of a large
				 * object:  do a physical copy instead of
				 * a virtual copy, to avoid possibly keeping
				 * the entire large object alive because of
				 * those few copy-on-write pages.
				 */
				vm_map_copy_overwrite_aligned_src_large++;
				goto slow_copy;
			}
#endif /* XNU_TARGET_OS_OSX */

			if ((dst_map->pmap != kernel_pmap) &&
			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
				vm_object_t new_object, new_shadow;

				/*
				 * We're about to map something over a mapping
				 * established by malloc()...
				 */
				new_object = VME_OBJECT(copy_entry);
				if (new_object != VM_OBJECT_NULL) {
					vm_object_lock_shared(new_object);
				}
				/*
				 * Walk down the shadow chain (lock-coupling
				 * with shared locks) to find the bottom object
				 * backing the new mapping.
				 */
				while (new_object != VM_OBJECT_NULL &&
#if XNU_TARGET_OS_OSX
				    !new_object->true_share &&
				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
#endif /* XNU_TARGET_OS_OSX */
				    new_object->internal) {
					new_shadow = new_object->shadow;
					if (new_shadow == VM_OBJECT_NULL) {
						break;
					}
					vm_object_lock_shared(new_shadow);
					vm_object_unlock(new_object);
					new_object = new_shadow;
				}
				if (new_object != VM_OBJECT_NULL) {
					if (!new_object->internal) {
						/*
						 * The new mapping is backed
						 * by an external object.  We
						 * don't want malloc'ed memory
						 * to be replaced with such a
						 * non-anonymous mapping, so
						 * let's go off the optimized
						 * path...
						 */
						vm_map_copy_overwrite_aligned_src_not_internal++;
						vm_object_unlock(new_object);
						goto slow_copy;
					}
#if XNU_TARGET_OS_OSX
					if (new_object->true_share ||
					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
						/*
						 * Same if there's a "true_share"
						 * object in the shadow chain, or
						 * an object with a non-default
						 * (SYMMETRIC) copy strategy.
						 */
						vm_map_copy_overwrite_aligned_src_not_symmetric++;
						vm_object_unlock(new_object);
						goto slow_copy;
					}
#endif /* XNU_TARGET_OS_OSX */
					vm_object_unlock(new_object);
				}
				/*
				 * The new mapping is still backed by
				 * anonymous (internal) memory, so it's
				 * OK to substitute it for the original
				 * malloc() mapping.
				 */
			}

			/*
			 * Tear down the destination's current backing
			 * (submap or object) and its pmap mappings before
			 * installing the source object.
			 */
			if (old_object != VM_OBJECT_NULL) {
				assert(!entry->vme_permanent);
				if (entry->is_sub_map) {
					if (entry->use_pmap) {
#ifndef NO_NESTED_PMAP
						pmap_unnest(dst_map->pmap,
						    (addr64_t)entry->vme_start,
						    entry->vme_end - entry->vme_start);
#endif  /* NO_NESTED_PMAP */
						if (dst_map->mapped_in_other_pmaps) {
							/* clean up parent */
							/* map/maps */
							vm_map_submap_pmap_clean(
								dst_map, entry->vme_start,
								entry->vme_end,
								VME_SUBMAP(entry),
								VME_OFFSET(entry));
						}
					} else {
						vm_map_submap_pmap_clean(
							dst_map, entry->vme_start,
							entry->vme_end,
							VME_SUBMAP(entry),
							VME_OFFSET(entry));
					}
					vm_map_deallocate(VME_SUBMAP(entry));
				} else {
					if (dst_map->mapped_in_other_pmaps) {
						vm_object_pmap_protect_options(
							VME_OBJECT(entry),
							VME_OFFSET(entry),
							entry->vme_end
							- entry->vme_start,
							PMAP_NULL,
							PAGE_SIZE,
							entry->vme_start,
							VM_PROT_NONE,
							PMAP_OPTIONS_REMOVE);
					} else {
						pmap_remove_options(
							dst_map->pmap,
							(addr64_t)(entry->vme_start),
							(addr64_t)(entry->vme_end),
							PMAP_OPTIONS_REMOVE);
					}
					vm_object_deallocate(old_object);
				}
			}

			if (entry->iokit_acct) {
				/* keep using iokit accounting */
				entry->use_pmap = FALSE;
			} else {
				/* use pmap accounting */
				entry->use_pmap = TRUE;
			}
			assert(!entry->vme_permanent);
			/* Install the source object/offset into the destination entry. */
			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
			object = VME_OBJECT(entry);
			entry->needs_copy = copy_entry->needs_copy;
			entry->wired_count = 0;
			entry->user_wired_count = 0;
			offset = VME_OFFSET(copy_entry);
			VME_OFFSET_SET(entry, offset);

			vm_map_copy_entry_unlink(copy, copy_entry);
			vm_map_copy_entry_dispose(copy_entry);

			/*
			 * we could try to push pages into the pmap at this point, BUT
			 * this optimization only saved on average 2 us per page if ALL
			 * the pages in the source were currently mapped
			 * and ALL the pages in the dest were touched, if there were fewer
			 * than 2/3 of the pages touched, this optimization actually cost more cycles
			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
			 */

			/*
			 *	Set up for the next iteration.  The map
			 *	has not been unlocked, so the next
			 *	address should be at the end of this
			 *	entry, and the next map entry should be
			 *	the one following it.
			 */

			start = tmp_entry->vme_end;
			tmp_entry = tmp_entry->vme_next;
		} else {
			vm_map_version_t        version;
			vm_object_t             dst_object;
			vm_object_offset_t      dst_offset;
			kern_return_t           r;

slow_copy:
			/* Resolve copy-on-write before faulting pages in. */
			if (entry->needs_copy) {
				VME_OBJECT_SHADOW(entry,
				    (entry->vme_end -
				    entry->vme_start),
				    vm_map_always_shadow(dst_map));
				entry->needs_copy = FALSE;
			}

			dst_object = VME_OBJECT(entry);
			dst_offset = VME_OFFSET(entry);

			/*
			 *	Take an object reference, and record
			 *	the map version information so that the
			 *	map can be safely unlocked.
			 */

			if (dst_object == VM_OBJECT_NULL) {
				/*
				 * We would usually have just taken the
				 * optimized path above if the destination
				 * object has not been allocated yet.  But we
				 * now disable that optimization if the copy
				 * entry's object is not backed by anonymous
				 * memory to avoid replacing malloc'ed
				 * (i.e. re-usable) anonymous memory with a
				 * not-so-anonymous mapping.
				 * So we have to handle this case here and
				 * allocate a new VM object for this map entry.
				 */
				dst_object = vm_object_allocate(
					entry->vme_end - entry->vme_start);
				dst_offset = 0;
				VME_OBJECT_SET(entry, dst_object, false, 0);
				VME_OFFSET_SET(entry, dst_offset);
				assert(entry->use_pmap);
			}

			vm_object_reference(dst_object);

			/* account for unlock bumping up timestamp */
			version.main_timestamp = dst_map->timestamp + 1;

			vm_map_unlock(dst_map);

			/*
			 *	Copy as much as possible in one pass
			 */

			copy_size = size;
			r = vm_fault_copy(
				VME_OBJECT(copy_entry),
				VME_OFFSET(copy_entry),
				&copy_size,
				dst_object,
				dst_offset,
				dst_map,
				&version,
				THREAD_UNINT );

			/*
			 *	Release the object reference
			 */

			vm_object_deallocate(dst_object);

			/*
			 *	If a hard error occurred, return it now
			 */

			if (r != KERN_SUCCESS) {
				return r;
			}

			if (copy_size != 0) {
				/*
				 *	Dispose of the copied region
				 */

				vm_map_copy_clip_end(copy, copy_entry,
				    copy_entry->vme_start + copy_size);
				vm_map_copy_entry_unlink(copy, copy_entry);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			/*
			 *	Pick up in the destination map where we left off.
			 *
			 *	Use the version information to avoid a lookup
			 *	in the normal case.
			 */

			start += copy_size;
			vm_map_lock(dst_map);
			if (version.main_timestamp == dst_map->timestamp &&
			    copy_size != 0) {
				/* We can safely use saved tmp_entry value */

				if (tmp_entry->map_aligned &&
				    !VM_MAP_PAGE_ALIGNED(
					    start,
					    VM_MAP_PAGE_MASK(dst_map))) {
					/* no longer map-aligned */
					tmp_entry->map_aligned = FALSE;
				}
				vm_map_clip_end(dst_map, tmp_entry, start);
				tmp_entry = tmp_entry->vme_next;
			} else {
				/* Must do lookup of tmp_entry */

RetryLookup:
				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				if (tmp_entry->map_aligned &&
				    !VM_MAP_PAGE_ALIGNED(
					    start,
					    VM_MAP_PAGE_MASK(dst_map))) {
					/* no longer map-aligned */
					tmp_entry->map_aligned = FALSE;
				}
				vm_map_clip_start(dst_map, tmp_entry, start);
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_aligned */
11069 
11070 /*
11071  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
11072  *
11073  *	Description:
11074  *		Copy in data to a kernel buffer from space in the
11075  *		source map. The original space may be optionally
11076  *		deallocated.
11077  *
11078  *		If successful, returns a new copy object.
11079  */
11080 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11081 vm_map_copyin_kernel_buffer(
11082 	vm_map_t        src_map,
11083 	vm_map_offset_t src_addr,
11084 	vm_map_size_t   len,
11085 	boolean_t       src_destroy,
11086 	vm_map_copy_t   *copy_result)
11087 {
11088 	kern_return_t kr;
11089 	vm_map_copy_t copy;
11090 	void *kdata;
11091 
11092 	if (len > msg_ool_size_small) {
11093 		return KERN_INVALID_ARGUMENT;
11094 	}
11095 
11096 	kdata = kalloc_data(len, Z_WAITOK);
11097 	if (kdata == NULL) {
11098 		return KERN_RESOURCE_SHORTAGE;
11099 	}
11100 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11101 	if (kr != KERN_SUCCESS) {
11102 		kfree_data(kdata, len);
11103 		return kr;
11104 	}
11105 
11106 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11107 	copy->cpy_kdata = kdata;
11108 	copy->size = len;
11109 	copy->offset = 0;
11110 
11111 	if (src_destroy) {
11112 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11113 
11114 		if (src_map == kernel_map) {
11115 			flags |= VM_MAP_REMOVE_KUNWIRE;
11116 		}
11117 
11118 		(void)vm_map_remove_guard(src_map,
11119 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11120 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11121 		    flags, KMEM_GUARD_NONE);
11122 	}
11123 
11124 	*copy_result = copy;
11125 	return KERN_SUCCESS;
11126 }
11127 
11128 /*
11129  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
11130  *
11131  *	Description:
11132  *		Copy out data from a kernel buffer into space in the
 *		destination map. The space may be optionally dynamically
11134  *		allocated.
11135  *
11136  *		If successful, consumes the copy object.
11137  *		Otherwise, the caller is responsible for it.
11138  *
11139  *		Callers of this function must call vm_map_copy_require on
11140  *		previously created vm_map_copy_t or pass a newly created
11141  *		one to ensure that it hasn't been forged.
11142  */
/* Debug counter: copyout() failures seen while switched to the target map. */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* Restore the original address-space identity and drop our ref. */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
		/* On failure the caller remains responsible for "copy". */
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11257 
11258 /*
11259  *	Routine:	vm_map_copy_insert      [internal use only]
11260  *
11261  *	Description:
11262  *		Link a copy chain ("copy") into a map at the
11263  *		specified location (after "where").
11264  *
11265  *		Callers of this function must call vm_map_copy_require on
11266  *		previously created vm_map_copy_t or pass a newly created
11267  *		one to ensure that it hasn't been forged.
11268  *	Side effects:
11269  *		The copy chain is destroyed.
11270  */
11271 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11272 vm_map_copy_insert(
11273 	vm_map_t        map,
11274 	vm_map_entry_t  after_where,
11275 	vm_map_copy_t   copy)
11276 {
11277 	vm_map_entry_t  entry;
11278 
11279 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11280 		entry = vm_map_copy_first_entry(copy);
11281 		vm_map_copy_entry_unlink(copy, entry);
11282 		vm_map_store_entry_link(map, after_where, entry,
11283 		    VM_MAP_KERNEL_FLAGS_NONE);
11284 		after_where = entry;
11285 	}
11286 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11287 }
11288 
11289 /*
11290  * Callers of this function must call vm_map_copy_require on
11291  * previously created vm_map_copy_t or pass a newly created
11292  * one to ensure that it hasn't been forged.
11293  */
11294 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11295 vm_map_copy_remap(
11296 	vm_map_t        map,
11297 	vm_map_entry_t  where,
11298 	vm_map_copy_t   copy,
11299 	vm_map_offset_t adjustment,
11300 	vm_prot_t       cur_prot,
11301 	vm_prot_t       max_prot,
11302 	vm_inherit_t    inheritance)
11303 {
11304 	vm_map_entry_t  copy_entry, new_entry;
11305 
11306 	for (copy_entry = vm_map_copy_first_entry(copy);
11307 	    copy_entry != vm_map_copy_to_entry(copy);
11308 	    copy_entry = copy_entry->vme_next) {
11309 		/* get a new VM map entry for the map */
11310 		new_entry = vm_map_entry_create(map);
11311 		/* copy the "copy entry" to the new entry */
11312 		vm_map_entry_copy(map, new_entry, copy_entry);
11313 		/* adjust "start" and "end" */
11314 		new_entry->vme_start += adjustment;
11315 		new_entry->vme_end += adjustment;
11316 		/* clear some attributes */
11317 		new_entry->inheritance = inheritance;
11318 		new_entry->protection = cur_prot;
11319 		new_entry->max_protection = max_prot;
11320 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11321 		/* take an extra reference on the entry's "object" */
11322 		if (new_entry->is_sub_map) {
11323 			assert(!new_entry->use_pmap); /* not nested */
11324 			vm_map_reference(VME_SUBMAP(new_entry));
11325 		} else {
11326 			vm_object_reference(VME_OBJECT(new_entry));
11327 		}
11328 		/* insert the new entry in the map */
11329 		vm_map_store_entry_link(map, where, new_entry,
11330 		    VM_MAP_KERNEL_FLAGS_NONE);
11331 		/* continue inserting the "copy entries" after the new entry */
11332 		where = new_entry;
11333 	}
11334 }
11335 
11336 
11337 /*
11338  * Returns true if *size matches (or is in the range of) copy->size.
11339  * Upon returning true, the *size field is updated with the actual size of the
11340  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11341  */
11342 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11343 vm_map_copy_validate_size(
11344 	vm_map_t                dst_map,
11345 	vm_map_copy_t           copy,
11346 	vm_map_size_t           *size)
11347 {
11348 	if (copy == VM_MAP_COPY_NULL) {
11349 		return FALSE;
11350 	}
11351 
11352 	/*
11353 	 * Assert that the vm_map_copy is coming from the right
11354 	 * zone and hasn't been forged
11355 	 */
11356 	vm_map_copy_require(copy);
11357 
11358 	vm_map_size_t copy_sz = copy->size;
11359 	vm_map_size_t sz = *size;
11360 	switch (copy->type) {
11361 	case VM_MAP_COPY_KERNEL_BUFFER:
11362 		if (sz == copy_sz) {
11363 			return TRUE;
11364 		}
11365 		break;
11366 	case VM_MAP_COPY_ENTRY_LIST:
11367 		/*
11368 		 * potential page-size rounding prevents us from exactly
11369 		 * validating this flavor of vm_map_copy, but we can at least
11370 		 * assert that it's within a range.
11371 		 */
11372 		if (copy_sz >= sz &&
11373 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11374 			*size = copy_sz;
11375 			return TRUE;
11376 		}
11377 		break;
11378 	default:
11379 		break;
11380 	}
11381 	return FALSE;
11382 }
11383 
/*
 *	Routine:	vm_map_copyout_internal
 *
 *	Description:
 *		Copy out a copy chain ("copy") into newly-allocated
 *		space in the destination map, returning the allocated
 *		address in *dst_addr.
 *
 *		"copy_size_u" must equal copy->size or the call fails.
 *
 *		If "consume_on_success" is TRUE, the copy's map entries
 *		are transplanted directly into "dst_map" and the copy
 *		object is consumed on success.  Otherwise the entries
 *		are duplicated into "dst_map" (via vm_map_copy_remap)
 *		with the given cur/max protections and inheritance, and
 *		the copy object remains the caller's responsibility.
 */
static kern_return_t
vm_map_copyout_internal(
	vm_map_t                dst_map,
	vm_map_address_t       *dst_addr,      /* OUT */
	vm_map_copy_t           copy,
	vm_map_size_ut          copy_size_u,
	boolean_t               consume_on_success,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_size_t           size, copy_size;
	vm_map_size_t           adjustment;
	vm_map_offset_t         start;
	vm_object_offset_t      vm_copy_start;
	vm_map_entry_t          last;
	vm_map_entry_t          entry;
	vm_map_copy_t           original_copy;
	kern_return_t           kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

	/*
	 *	Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/* The caller-supplied size must agree with the copy object. */
	if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
		*dst_addr = 0;
		ktriage_record(thread_tid(current_thread()),
		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
		    KDBG_TRIAGE_RESERVED,
		    KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
		    KERN_FAILURE /* arg */);
		return KERN_FAILURE;
	}
	copy_size = copy->size;

	/*
	 *	Check for special kernel buffer allocated
	 *	by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		    copy, copy_size, FALSE,
		    consume_on_success);
		if (kr) {
			ktriage_record(thread_tid(current_thread()),
			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
			    KDBG_TRIAGE_RESERVED,
			    KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
		}
		return kr;
	}

	/*
	 * If the copy was built with a different page size than the
	 * destination map uses, re-shape it to the destination's page
	 * size first.  "original_copy" remembers the incoming copy so
	 * the right object gets discarded/returned at the end.
	 */
	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0, /* offset */
			copy->size, /* size */
			dst_map,
			TRUE, /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 *	Find space for the data
	 */

	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	    VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	    VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;

	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);

	vm_map_lock(dst_map);
	kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
	    &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
		return kr;
	}

	/*
	 * Delta between where the copy's entries currently sit and
	 * where they will land in the destination map; applied to each
	 * entry's vme_start/vme_end below (consuming path only).
	 */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 *	Since we're going to just drop the map
	 *	entries from the copy into the destination
	 *	map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t  next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			/* old entry has been replaced; release it */
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 *	Adjust the addresses in the copy chain, and
	 *	reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		/*
		 * Transplanted entries always start with default
		 * attributes; the caller-supplied protections and
		 * inheritance only apply to the non-consuming
		 * (vm_map_copy_remap) path.
		 */
		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t       offset;
			vm_object_t object;
			vm_prot_t prot;
			int     type_of_fault;
			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			/* Fault in every page of the wired range. */
			while (va < entry->vme_end) {
				vm_page_t       m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 *	the object was newly created by
				 *	vm_object_copy_slowly, and has
				 *	copies of all of the pages from
				 *	the source object
				 * or
				 *	the object was moved from the old
				 *	map entry; because the old map
				 *	entry was wired, all of the pages
				 *	were in the top-level object.
				 *	(XXX not true if we wire pages for
				 *	 reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}
				if (entry->vme_xnu_user_debug &&
				    !VM_PAGE_OBJECT(m)->code_signed) {
					/*
					 * Modified code-signed executable
					 * region: this page does not belong
					 * to a code-signed VM object, so it
					 * must have been copied and should
					 * therefore be typed XNU_USER_DEBUG
					 * rather than XNU_USER_EXEC.
					 */
					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
				}

				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE,            /* change_wiring */
				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
				    &fault_info,
				    NULL,             /* need_retry */
				    &type_of_fault,
				    &object_lock_type); /* Exclusive mode lock. Will remain unchanged. */

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 *	Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 *	Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 *	Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/*
			 * "copy" was the page-size-adjusted clone and has
			 * been consumed; the original is no longer needed.
			 */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/*
			 * Drop the temporary adjusted clone; the caller
			 * keeps (and still owns) the original copy.
			 */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11750 
11751 /*
11752  *	Routine:	vm_map_copyout_size
11753  *
11754  *	Description:
11755  *		Copy out a copy chain ("copy") into newly-allocated
11756  *		space in the destination map. Uses a prevalidated
11757  *		size for the copy object (vm_map_copy_validate_size).
11758  *
11759  *		If successful, consumes the copy object.
11760  *		Otherwise, the caller is responsible for it.
11761  */
11762 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size)11763 vm_map_copyout_size(
11764 	vm_map_t                dst_map,
11765 	vm_map_address_t       *dst_addr,      /* OUT */
11766 	vm_map_copy_t           copy,
11767 	vm_map_size_ut          copy_size)
11768 {
11769 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11770 	           TRUE,                     /* consume_on_success */
11771 	           VM_PROT_DEFAULT,
11772 	           VM_PROT_ALL,
11773 	           VM_INHERIT_DEFAULT);
11774 }
11775 
11776 /*
11777  *	Routine:	vm_map_copyout
11778  *
11779  *	Description:
11780  *		Copy out a copy chain ("copy") into newly-allocated
11781  *		space in the destination map.
11782  *
11783  *		If successful, consumes the copy object.
11784  *		Otherwise, the caller is responsible for it.
11785  */
11786 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11787 vm_map_copyout(
11788 	vm_map_t                dst_map,
11789 	vm_map_address_t       *dst_addr,      /* OUT */
11790 	vm_map_copy_t           copy)
11791 {
11792 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11793 	           TRUE,                     /* consume_on_success */
11794 	           VM_PROT_DEFAULT,
11795 	           VM_PROT_ALL,
11796 	           VM_INHERIT_DEFAULT);
11797 }
11798 
11799 /*
11800  *	Routine:	vm_map_copyin
11801  *
11802  *	Description:
11803  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11804  *
11805  */
11806 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,vm_map_copy_t * copy_result)11807 vm_map_copyin(
11808 	vm_map_t                src_map,
11809 	vm_map_address_ut       src_addr,
11810 	vm_map_size_ut          len,
11811 	boolean_t               src_destroy,
11812 	vm_map_copy_t          *copy_result)   /* OUT */
11813 {
11814 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11815 	           FALSE, copy_result, FALSE);
11816 }
11817 
11818 /*
11819  *	Routine:	vm_map_copyin_common
11820  *
11821  *	Description:
11822  *		Copy the specified region (src_addr, len) from the
11823  *		source address space (src_map), possibly removing
11824  *		the region from the source address space (src_destroy).
11825  *
11826  *	Returns:
11827  *		A vm_map_copy_t object (copy_result), suitable for
11828  *		insertion into another address space (using vm_map_copyout),
11829  *		copying over another address space region (using
11830  *		vm_map_copy_overwrite).  If the copy is unused, it
11831  *		should be destroyed (using vm_map_copy_discard).
11832  *
11833  *	In/out conditions:
11834  *		The source map should not be locked on entry.
11835  */
11836 
/*
 * One level of the submap-traversal stack used by
 * vm_map_copyin_internal(): records the parent map and the range
 * being copied in the parent's address space so the copy can resume
 * there after the submap has been processed.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map we descended from */
	vm_map_offset_t base_start;     /* start of the range in parent_map */
	vm_map_offset_t base_end;       /* end of the range in parent_map */
	vm_map_size_t   base_len;       /* length covered at this level */
	struct submap_map *next;        /* next (outer) stack level, or NULL */
} submap_map_t;
11844 
11845 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11846 vm_map_copyin_common(
11847 	vm_map_t                src_map,
11848 	vm_map_address_ut       src_addr,
11849 	vm_map_size_ut          len,
11850 	boolean_t               src_destroy,
11851 	__unused boolean_t      src_volatile,
11852 	vm_map_copy_t          *copy_result,   /* OUT */
11853 	boolean_t               use_maxprot)
11854 {
11855 	int flags;
11856 
11857 	flags = 0;
11858 	if (src_destroy) {
11859 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11860 	}
11861 	if (use_maxprot) {
11862 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11863 	}
11864 	return vm_map_copyin_internal(src_map,
11865 	           src_addr,
11866 	           len,
11867 	           flags,
11868 	           copy_result);
11869 }
11870 
11871 static __attribute__((always_inline, warn_unused_result))
11872 kern_return_t
vm_map_copyin_sanitize(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,vm_map_offset_t * src_start,vm_map_offset_t * src_end,vm_map_size_t * len,vm_map_offset_t * src_addr_unaligned)11873 vm_map_copyin_sanitize(
11874 	vm_map_t                src_map,
11875 	vm_map_address_ut       src_addr_u,
11876 	vm_map_size_ut          len_u,
11877 	vm_map_offset_t        *src_start,
11878 	vm_map_offset_t        *src_end,
11879 	vm_map_size_t          *len,
11880 	vm_map_offset_t        *src_addr_unaligned)
11881 {
11882 	kern_return_t   kr;
11883 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
11884 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
11885 
11886 	if (src_map->pmap == kernel_pmap) {
11887 		flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
11888 	}
11889 
11890 
11891 	kr = vm_sanitize_addr_size(src_addr_u, len_u,
11892 	    VM_SANITIZE_CALLER_VM_MAP_COPYIN,
11893 	    src_map,
11894 	    flags,
11895 	    src_start, src_end, len);
11896 	if (__improbable(kr != KERN_SUCCESS)) {
11897 		return kr;
11898 	}
11899 
11900 	/*
11901 	 *	Compute (page aligned) start and end of region
11902 	 */
11903 	*src_addr_unaligned  = *src_start; /* remember unaligned value */
11904 	*src_start = vm_map_trunc_page(*src_addr_unaligned,
11905 	    VM_MAP_PAGE_MASK(src_map));
11906 	*src_end   = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
11907 	return KERN_SUCCESS;
11908 }
11909 
11910 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,int flags,vm_map_copy_t * copy_result)11911 vm_map_copyin_internal(
11912 	vm_map_t                src_map,
11913 	vm_map_address_ut       src_addr_u,
11914 	vm_map_size_ut          len_u,
11915 	int                     flags,
11916 	vm_map_copy_t          *copy_result)   /* OUT */
11917 {
11918 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11919 	                                 * in multi-level lookup, this
11920 	                                 * entry contains the actual
11921 	                                 * vm_object/offset.
11922 	                                 */
11923 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11924 
11925 	vm_map_offset_t src_start;      /* Start of current entry --
11926 	                                 * where copy is taking place now
11927 	                                 */
11928 	vm_map_offset_t src_end;        /* End of entire region to be
11929 	                                 * copied */
11930 	vm_map_offset_t src_addr_unaligned;
11931 	vm_map_offset_t src_base;
11932 	vm_map_size_t   len;
11933 	vm_map_t        base_map = src_map;
11934 	boolean_t       map_share = FALSE;
11935 	submap_map_t    *parent_maps = NULL;
11936 
11937 	vm_map_copy_t   copy;           /* Resulting copy */
11938 	vm_map_address_t copy_addr;
11939 	vm_map_size_t   copy_size;
11940 	boolean_t       src_destroy;
11941 	boolean_t       use_maxprot;
11942 	boolean_t       preserve_purgeable;
11943 	boolean_t       entry_was_shared;
11944 	vm_map_entry_t  saved_src_entry;
11945 	kern_return_t   kr;
11946 
11947 
11948 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11949 		return KERN_INVALID_ARGUMENT;
11950 	}
11951 
11952 	/*
11953 	 *	Check for copies of zero bytes.
11954 	 */
11955 	if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
11956 		*copy_result = VM_MAP_COPY_NULL;
11957 		return KERN_SUCCESS;
11958 	}
11959 
11960 	/*
11961 	 * Sanitize any input parameters that are addr/size/prot/inherit
11962 	 */
11963 	kr = vm_map_copyin_sanitize(
11964 		src_map,
11965 		src_addr_u,
11966 		len_u,
11967 		&src_start,
11968 		&src_end,
11969 		&len,
11970 		&src_addr_unaligned);
11971 	if (__improbable(kr != KERN_SUCCESS)) {
11972 		return vm_sanitize_get_kr(kr);
11973 	}
11974 
11975 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11976 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11977 	preserve_purgeable =
11978 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11979 
11980 	/*
11981 	 * If the copy is sufficiently small, use a kernel buffer instead
11982 	 * of making a virtual copy.  The theory being that the cost of
11983 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
11984 	 * for small regions.
11985 	 */
11986 	if ((len <= msg_ool_size_small) &&
11987 	    !use_maxprot &&
11988 	    !preserve_purgeable &&
11989 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11990 	    /*
11991 	     * Since the "msg_ool_size_small" threshold was increased and
11992 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11993 	     * address space limits, we revert to doing a virtual copy if the
11994 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
11995 	     * of the commpage would now fail when it used to work.
11996 	     */
11997 	    (src_start >= vm_map_min(src_map) &&
11998 	    src_start < vm_map_max(src_map) &&
11999 	    src_end >= vm_map_min(src_map) &&
12000 	    src_end < vm_map_max(src_map))) {
12001 		return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12002 		           src_destroy, copy_result);
12003 	}
12004 
12005 	/*
12006 	 *	Allocate a header element for the list.
12007 	 *
12008 	 *	Use the start and end in the header to
12009 	 *	remember the endpoints prior to rounding.
12010 	 */
12011 
12012 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12013 	copy->cpy_hdr.entries_pageable = TRUE;
12014 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12015 	copy->offset = src_addr_unaligned;
12016 	copy->size = len;
12017 
12018 	new_entry = vm_map_copy_entry_create(copy);
12019 
12020 #define RETURN(x)                                               \
12021 	MACRO_BEGIN                                             \
12022 	vm_map_unlock(src_map);                                 \
12023 	if(src_map != base_map)                                 \
12024 	        vm_map_deallocate(src_map);                     \
12025 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
12026 	        vm_map_copy_entry_dispose(new_entry);           \
12027 	vm_map_copy_discard(copy);                              \
12028 	{                                                       \
12029 	        submap_map_t	*_ptr;                          \
12030                                                                 \
12031 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12032 	                parent_maps=parent_maps->next;          \
12033 	                if (_ptr->parent_map != base_map)       \
12034 	                        vm_map_deallocate(_ptr->parent_map);    \
12035 	                kfree_type(submap_map_t, _ptr);         \
12036 	        }                                               \
12037 	}                                                       \
12038 	MACRO_RETURN(x);                                        \
12039 	MACRO_END
12040 
12041 	/*
12042 	 *	Find the beginning of the region.
12043 	 */
12044 
12045 	vm_map_lock(src_map);
12046 
12047 	/*
12048 	 * Lookup the original "src_addr_unaligned" rather than the truncated
12049 	 * "src_start", in case "src_start" falls in a non-map-aligned
12050 	 * map entry *before* the map entry that contains "src_addr_unaligned"...
12051 	 */
12052 	if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12053 		RETURN(KERN_INVALID_ADDRESS);
12054 	}
12055 	if (!tmp_entry->is_sub_map) {
12056 		/*
12057 		 * ... but clip to the map-rounded "src_start" rather than
12058 		 * "src_addr_unaligned" to preserve map-alignment.  We'll adjust the
12059 		 * first copy entry at the end, if needed.
12060 		 */
12061 		vm_map_clip_start(src_map, tmp_entry, src_start);
12062 	}
12063 	if (src_start < tmp_entry->vme_start) {
12064 		/*
12065 		 * Move "src_start" up to the start of the
12066 		 * first map entry to copy.
12067 		 */
12068 		src_start = tmp_entry->vme_start;
12069 	}
12070 	/* set for later submap fix-up */
12071 	copy_addr = src_start;
12072 
12073 	/*
12074 	 *	Go through entries until we get to the end.
12075 	 */
12076 
12077 	while (TRUE) {
12078 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
12079 		vm_map_size_t   src_size;               /* Size of source
12080 		                                         * map entry (in both
12081 		                                         * maps)
12082 		                                         */
12083 
12084 		vm_object_t             src_object;     /* Object to copy */
12085 		vm_object_offset_t      src_offset;
12086 
12087 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
12088 
12089 		boolean_t       src_needs_copy;         /* Should source map
12090 		                                         * be made read-only
12091 		                                         * for copy-on-write?
12092 		                                         */
12093 
12094 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
12095 
12096 		boolean_t       was_wired;              /* Was source wired? */
12097 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
12098 		vm_map_version_t version;               /* Version before locks
12099 		                                         * dropped to make copy
12100 		                                         */
12101 		kern_return_t   result;                 /* Return value from
12102 		                                         * copy_strategically.
12103 		                                         */
12104 		while (tmp_entry->is_sub_map) {
12105 			vm_map_size_t submap_len;
12106 			submap_map_t *ptr;
12107 
12108 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
12109 			ptr->next = parent_maps;
12110 			parent_maps = ptr;
12111 			ptr->parent_map = src_map;
12112 			ptr->base_start = src_start;
12113 			ptr->base_end = src_end;
12114 			submap_len = tmp_entry->vme_end - src_start;
12115 			if (submap_len > (src_end - src_start)) {
12116 				submap_len = src_end - src_start;
12117 			}
12118 			ptr->base_len = submap_len;
12119 
12120 			src_start -= tmp_entry->vme_start;
12121 			src_start += VME_OFFSET(tmp_entry);
12122 			src_end = src_start + submap_len;
12123 			src_map = VME_SUBMAP(tmp_entry);
12124 			vm_map_lock(src_map);
12125 			/* keep an outstanding reference for all maps in */
12126 			/* the parents tree except the base map */
12127 			vm_map_reference(src_map);
12128 			vm_map_unlock(ptr->parent_map);
12129 			if (!vm_map_lookup_entry(
12130 				    src_map, src_start, &tmp_entry)) {
12131 				RETURN(KERN_INVALID_ADDRESS);
12132 			}
12133 			map_share = TRUE;
12134 			if (!tmp_entry->is_sub_map) {
12135 				vm_map_clip_start(src_map, tmp_entry, src_start);
12136 			}
12137 			src_entry = tmp_entry;
12138 		}
12139 		/* we are now in the lowest level submap... */
12140 
12141 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12142 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12143 			/* This is not, supported for now.In future */
12144 			/* we will need to detect the phys_contig   */
12145 			/* condition and then upgrade copy_slowly   */
12146 			/* to do physical copy from the device mem  */
12147 			/* based object. We can piggy-back off of   */
12148 			/* the was wired boolean to set-up the      */
12149 			/* proper handling */
12150 			RETURN(KERN_PROTECTION_FAILURE);
12151 		}
12152 		/*
12153 		 *	Create a new address map entry to hold the result.
12154 		 *	Fill in the fields from the appropriate source entries.
12155 		 *	We must unlock the source map to do this if we need
12156 		 *	to allocate a map entry.
12157 		 */
12158 		if (new_entry == VM_MAP_ENTRY_NULL) {
12159 			version.main_timestamp = src_map->timestamp;
12160 			vm_map_unlock(src_map);
12161 
12162 			new_entry = vm_map_copy_entry_create(copy);
12163 
12164 			vm_map_lock(src_map);
12165 			if ((version.main_timestamp + 1) != src_map->timestamp) {
12166 				if (!vm_map_lookup_entry(src_map, src_start,
12167 				    &tmp_entry)) {
12168 					RETURN(KERN_INVALID_ADDRESS);
12169 				}
12170 				if (!tmp_entry->is_sub_map) {
12171 					vm_map_clip_start(src_map, tmp_entry, src_start);
12172 				}
12173 				continue; /* restart w/ new tmp_entry */
12174 			}
12175 		}
12176 
12177 		/*
12178 		 *	Verify that the region can be read.
12179 		 */
12180 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12181 		    !use_maxprot) ||
12182 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
12183 			RETURN(KERN_PROTECTION_FAILURE);
12184 		}
12185 
12186 		/*
12187 		 *	Clip against the endpoints of the entire region.
12188 		 */
12189 
12190 		vm_map_clip_end(src_map, src_entry, src_end);
12191 
12192 		src_size = src_entry->vme_end - src_start;
12193 		src_object = VME_OBJECT(src_entry);
12194 		src_offset = VME_OFFSET(src_entry);
12195 		was_wired = (src_entry->wired_count != 0);
12196 
12197 		vm_map_entry_copy(src_map, new_entry, src_entry);
12198 		if (new_entry->is_sub_map) {
12199 			/* clr address space specifics */
12200 			new_entry->use_pmap = FALSE;
12201 		} else {
12202 			/*
12203 			 * We're dealing with a copy-on-write operation,
12204 			 * so the resulting mapping should not inherit the
12205 			 * original mapping's accounting settings.
12206 			 * "iokit_acct" should have been cleared in
12207 			 * vm_map_entry_copy().
12208 			 * "use_pmap" should be reset to its default (TRUE)
12209 			 * so that the new mapping gets accounted for in
12210 			 * the task's memory footprint.
12211 			 */
12212 			assert(!new_entry->iokit_acct);
12213 			new_entry->use_pmap = TRUE;
12214 		}
12215 
12216 		/*
12217 		 *	Attempt non-blocking copy-on-write optimizations.
12218 		 */
12219 
12220 		/*
12221 		 * If we are destroying the source, and the object
12222 		 * is internal, we could move the object reference
12223 		 * from the source to the copy.  The copy is
12224 		 * copy-on-write only if the source is.
12225 		 * We make another reference to the object, because
12226 		 * destroying the source entry will deallocate it.
12227 		 *
12228 		 * This memory transfer has to be atomic, (to prevent
12229 		 * the VM object from being shared or copied while
12230 		 * it's being moved here), so we could only do this
12231 		 * if we won't have to unlock the VM map until the
12232 		 * original mapping has been fully removed.
12233 		 */
12234 
12235 RestartCopy:
12236 		if ((src_object == VM_OBJECT_NULL ||
12237 		    (!was_wired && !map_share && !tmp_entry->is_shared
12238 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12239 		    vm_object_copy_quickly(
12240 			    VME_OBJECT(new_entry),
12241 			    src_offset,
12242 			    src_size,
12243 			    &src_needs_copy,
12244 			    &new_entry_needs_copy)) {
12245 			new_entry->needs_copy = new_entry_needs_copy;
12246 
12247 			/*
12248 			 *	Handle copy-on-write obligations
12249 			 */
12250 
12251 			if (src_needs_copy && !tmp_entry->needs_copy) {
12252 				vm_prot_t prot;
12253 
12254 				prot = src_entry->protection & ~VM_PROT_WRITE;
12255 
12256 				if (override_nx(src_map, VME_ALIAS(src_entry))
12257 				    && prot) {
12258 					prot |= VM_PROT_EXECUTE;
12259 				}
12260 
12261 				vm_object_pmap_protect(
12262 					src_object,
12263 					src_offset,
12264 					src_size,
12265 					(src_entry->is_shared ?
12266 					PMAP_NULL
12267 					: src_map->pmap),
12268 					VM_MAP_PAGE_SIZE(src_map),
12269 					src_entry->vme_start,
12270 					prot);
12271 
12272 				assert(tmp_entry->wired_count == 0);
12273 				tmp_entry->needs_copy = TRUE;
12274 			}
12275 
12276 			/*
12277 			 *	The map has never been unlocked, so it's safe
12278 			 *	to move to the next entry rather than doing
12279 			 *	another lookup.
12280 			 */
12281 
12282 			goto CopySuccessful;
12283 		}
12284 
12285 		entry_was_shared = tmp_entry->is_shared;
12286 
12287 		/*
12288 		 *	Take an object reference, so that we may
12289 		 *	release the map lock(s).
12290 		 */
12291 
12292 		assert(src_object != VM_OBJECT_NULL);
12293 		vm_object_reference(src_object);
12294 
12295 		/*
12296 		 *	Record the timestamp for later verification.
12297 		 *	Unlock the map.
12298 		 */
12299 
12300 		version.main_timestamp = src_map->timestamp;
12301 		vm_map_unlock(src_map); /* Increments timestamp once! */
12302 		saved_src_entry = src_entry;
12303 		tmp_entry = VM_MAP_ENTRY_NULL;
12304 		src_entry = VM_MAP_ENTRY_NULL;
12305 
12306 		/*
12307 		 *	Perform the copy
12308 		 */
12309 
12310 		if (was_wired ||
12311 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12312 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12313 		    (debug4k_no_cow_copyin &&
12314 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12315 CopySlowly:
12316 			vm_object_lock(src_object);
12317 			result = vm_object_copy_slowly(
12318 				src_object,
12319 				src_offset,
12320 				src_size,
12321 				THREAD_UNINT,
12322 				&new_copy_object);
12323 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12324 			saved_used_for_jit = new_entry->used_for_jit;
12325 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12326 			new_entry->used_for_jit = saved_used_for_jit;
12327 			VME_OFFSET_SET(new_entry,
12328 			    src_offset - vm_object_trunc_page(src_offset));
12329 			new_entry->needs_copy = FALSE;
12330 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12331 		    (entry_was_shared || map_share)) {
12332 			vm_object_t new_object;
12333 
12334 			vm_object_lock_shared(src_object);
12335 			new_object = vm_object_copy_delayed(
12336 				src_object,
12337 				src_offset,
12338 				src_size,
12339 				TRUE);
12340 			if (new_object == VM_OBJECT_NULL) {
12341 				goto CopySlowly;
12342 			}
12343 
12344 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12345 			assert(new_entry->wired_count == 0);
12346 			new_entry->needs_copy = TRUE;
12347 			assert(!new_entry->iokit_acct);
12348 			assert(new_object->purgable == VM_PURGABLE_DENY);
12349 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12350 			result = KERN_SUCCESS;
12351 		} else {
12352 			vm_object_offset_t new_offset;
12353 			new_offset = VME_OFFSET(new_entry);
12354 			result = vm_object_copy_strategically(src_object,
12355 			    src_offset,
12356 			    src_size,
12357 			    (flags & VM_MAP_COPYIN_FORK),
12358 			    &new_copy_object,
12359 			    &new_offset,
12360 			    &new_entry_needs_copy);
12361 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12362 			saved_used_for_jit = new_entry->used_for_jit;
12363 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12364 			new_entry->used_for_jit = saved_used_for_jit;
12365 			if (new_offset != VME_OFFSET(new_entry)) {
12366 				VME_OFFSET_SET(new_entry, new_offset);
12367 			}
12368 
12369 			new_entry->needs_copy = new_entry_needs_copy;
12370 		}
12371 
12372 		if (result == KERN_SUCCESS &&
12373 		    ((preserve_purgeable &&
12374 		    src_object->purgable != VM_PURGABLE_DENY) ||
12375 		    new_entry->used_for_jit)) {
			/*
			 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
			 *
			 * Also force mappings the pmap specially protects to
			 * be COPY_NONE; trying to COW these mappings would
			 * change the effective protections, which could have
			 * side effects if the pmap layer relies on the
			 * specified protections.
			 */
12386 
12387 			vm_object_t     new_object;
12388 
12389 			new_object = VME_OBJECT(new_entry);
12390 			assert(new_object != src_object);
12391 			vm_object_lock(new_object);
12392 			assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12393 			assert(new_object->shadow == VM_OBJECT_NULL);
12394 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12395 			assert(new_object->vo_owner == NULL);
12396 
12397 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12398 
12399 			if (preserve_purgeable &&
12400 			    src_object->purgable != VM_PURGABLE_DENY) {
12401 				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12402 
12403 				/* start as non-volatile with no owner... */
12404 				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12405 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12406 				/* ... and move to src_object's purgeable state */
12407 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12408 					int state;
12409 					state = src_object->purgable;
12410 					vm_object_purgable_control(
12411 						new_object,
12412 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12413 						&state);
12414 				}
12415 				/* no pmap accounting for purgeable objects */
12416 				new_entry->use_pmap = FALSE;
12417 			}
12418 
12419 			vm_object_unlock(new_object);
12420 			new_object = VM_OBJECT_NULL;
12421 		}
12422 
12423 		/*
12424 		 *	Throw away the extra reference
12425 		 */
12426 
12427 		vm_object_deallocate(src_object);
12428 
12429 		if (result != KERN_SUCCESS &&
12430 		    result != KERN_MEMORY_RESTART_COPY) {
12431 			vm_map_lock(src_map);
12432 			RETURN(result);
12433 		}
12434 
12435 		/*
12436 		 *	Verify that the map has not substantially
12437 		 *	changed while the copy was being made.
12438 		 */
12439 
12440 		vm_map_lock(src_map);
12441 
12442 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12443 			/* src_map hasn't changed: src_entry is still valid */
12444 			src_entry = saved_src_entry;
12445 			goto VerificationSuccessful;
12446 		}
12447 
12448 		/*
12449 		 *	Simple version comparison failed.
12450 		 *
12451 		 *	Retry the lookup and verify that the
12452 		 *	same object/offset are still present.
12453 		 *
12454 		 *	[Note: a memory manager that colludes with
12455 		 *	the calling task can detect that we have
12456 		 *	cheated.  While the map was unlocked, the
12457 		 *	mapping could have been changed and restored.]
12458 		 */
12459 
12460 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12461 			if (result != KERN_MEMORY_RESTART_COPY) {
12462 				vm_object_deallocate(VME_OBJECT(new_entry));
12463 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12464 				/* reset accounting state */
12465 				new_entry->iokit_acct = FALSE;
12466 				new_entry->use_pmap = TRUE;
12467 			}
12468 			RETURN(KERN_INVALID_ADDRESS);
12469 		}
12470 
12471 		src_entry = tmp_entry;
12472 		vm_map_clip_start(src_map, src_entry, src_start);
12473 
12474 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12475 		    !use_maxprot) ||
12476 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12477 			goto VerificationFailed;
12478 		}
12479 
12480 		if (src_entry->vme_end < new_entry->vme_end) {
12481 			/*
12482 			 * This entry might have been shortened
12483 			 * (vm_map_clip_end) or been replaced with
12484 			 * an entry that ends closer to "src_start"
12485 			 * than before.
12486 			 * Adjust "new_entry" accordingly; copying
12487 			 * less memory would be correct but we also
12488 			 * redo the copy (see below) if the new entry
12489 			 * no longer points at the same object/offset.
12490 			 */
12491 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12492 			    VM_MAP_COPY_PAGE_MASK(copy)));
12493 			new_entry->vme_end = src_entry->vme_end;
12494 			src_size = new_entry->vme_end - src_start;
12495 		} else if (src_entry->vme_end > new_entry->vme_end) {
12496 			/*
12497 			 * This entry might have been extended
12498 			 * (vm_map_entry_simplify() or coalesce)
12499 			 * or been replaced with an entry that ends farther
12500 			 * from "src_start" than before.
12501 			 *
12502 			 * We've called vm_object_copy_*() only on
12503 			 * the previous <start:end> range, so we can't
12504 			 * just extend new_entry.  We have to re-do
12505 			 * the copy based on the new entry as if it was
12506 			 * pointing at a different object/offset (see
12507 			 * "Verification failed" below).
12508 			 */
12509 		}
12510 
12511 		if ((VME_OBJECT(src_entry) != src_object) ||
12512 		    (VME_OFFSET(src_entry) != src_offset) ||
12513 		    (src_entry->vme_end > new_entry->vme_end)) {
12514 			/*
12515 			 *	Verification failed.
12516 			 *
12517 			 *	Start over with this top-level entry.
12518 			 */
12519 
12520 VerificationFailed:     ;
12521 
12522 			vm_object_deallocate(VME_OBJECT(new_entry));
12523 			tmp_entry = src_entry;
12524 			continue;
12525 		}
12526 
12527 		/*
12528 		 *	Verification succeeded.
12529 		 */
12530 
12531 VerificationSuccessful:;
12532 
12533 		if (result == KERN_MEMORY_RESTART_COPY) {
12534 			goto RestartCopy;
12535 		}
12536 
12537 		/*
12538 		 *	Copy succeeded.
12539 		 */
12540 
12541 CopySuccessful: ;
12542 
12543 		/*
12544 		 *	Link in the new copy entry.
12545 		 */
12546 
12547 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12548 		    new_entry);
12549 
12550 		/*
12551 		 *	Determine whether the entire region
12552 		 *	has been copied.
12553 		 */
12554 		src_base = src_start;
12555 		src_start = new_entry->vme_end;
12556 		new_entry = VM_MAP_ENTRY_NULL;
12557 		while ((src_start >= src_end) && (src_end != 0)) {
12558 			submap_map_t    *ptr;
12559 
12560 			if (src_map == base_map) {
12561 				/* back to the top */
12562 				break;
12563 			}
12564 
12565 			ptr = parent_maps;
12566 			assert(ptr != NULL);
12567 			parent_maps = parent_maps->next;
12568 
12569 			/* fix up the damage we did in that submap */
12570 			vm_map_simplify_range(src_map,
12571 			    src_base,
12572 			    src_end);
12573 
12574 			vm_map_unlock(src_map);
12575 			vm_map_deallocate(src_map);
12576 			vm_map_lock(ptr->parent_map);
12577 			src_map = ptr->parent_map;
12578 			src_base = ptr->base_start;
12579 			src_start = ptr->base_start + ptr->base_len;
12580 			src_end = ptr->base_end;
12581 			if (!vm_map_lookup_entry(src_map,
12582 			    src_start,
12583 			    &tmp_entry) &&
12584 			    (src_end > src_start)) {
12585 				RETURN(KERN_INVALID_ADDRESS);
12586 			}
12587 			kfree_type(submap_map_t, ptr);
12588 			if (parent_maps == NULL) {
12589 				map_share = FALSE;
12590 			}
12591 			src_entry = tmp_entry->vme_prev;
12592 		}
12593 
12594 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12595 		    (src_start >= src_addr_unaligned + len) &&
12596 		    (src_addr_unaligned + len != 0)) {
12597 			/*
12598 			 * Stop copying now, even though we haven't reached
12599 			 * "src_end".  We'll adjust the end of the last copy
12600 			 * entry at the end, if needed.
12601 			 *
		 * If src_map's alignment is different from the
12603 			 * system's page-alignment, there could be
12604 			 * extra non-map-aligned map entries between
12605 			 * the original (non-rounded) "src_addr_unaligned + len"
12606 			 * and the rounded "src_end".
12607 			 * We do not want to copy those map entries since
12608 			 * they're not part of the copied range.
12609 			 */
12610 			break;
12611 		}
12612 
12613 		if ((src_start >= src_end) && (src_end != 0)) {
12614 			break;
12615 		}
12616 
12617 		/*
12618 		 *	Verify that there are no gaps in the region
12619 		 */
12620 
12621 		tmp_entry = src_entry->vme_next;
12622 		if ((tmp_entry->vme_start != src_start) ||
12623 		    (tmp_entry == vm_map_to_entry(src_map))) {
12624 			RETURN(KERN_INVALID_ADDRESS);
12625 		}
12626 	}
12627 
12628 	/*
12629 	 * If the source should be destroyed, do it now, since the
12630 	 * copy was successful.
12631 	 */
12632 	if (src_destroy) {
12633 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12634 
12635 		if (src_map == kernel_map) {
12636 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12637 		}
12638 		(void)vm_map_remove_and_unlock(src_map,
12639 		    vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12640 		    src_end,
12641 		    remove_flags,
12642 		    KMEM_GUARD_NONE);
12643 	} else {
12644 		/* fix up the damage we did in the base map */
12645 		vm_map_simplify_range(
12646 			src_map,
12647 			vm_map_trunc_page(src_addr_unaligned,
12648 			VM_MAP_PAGE_MASK(src_map)),
12649 			vm_map_round_page(src_end,
12650 			VM_MAP_PAGE_MASK(src_map)));
12651 		vm_map_unlock(src_map);
12652 	}
12653 
12654 	tmp_entry = VM_MAP_ENTRY_NULL;
12655 
12656 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12657 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12658 		vm_map_offset_t original_start, original_offset, original_end;
12659 
12660 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12661 
12662 		/* adjust alignment of first copy_entry's "vme_start" */
12663 		tmp_entry = vm_map_copy_first_entry(copy);
12664 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12665 			vm_map_offset_t adjustment;
12666 
12667 			original_start = tmp_entry->vme_start;
12668 			original_offset = VME_OFFSET(tmp_entry);
12669 
12670 			/* map-align the start of the first copy entry... */
12671 			adjustment = (tmp_entry->vme_start -
12672 			    vm_map_trunc_page(
12673 				    tmp_entry->vme_start,
12674 				    VM_MAP_PAGE_MASK(src_map)));
12675 			tmp_entry->vme_start -= adjustment;
12676 			VME_OFFSET_SET(tmp_entry,
12677 			    VME_OFFSET(tmp_entry) - adjustment);
12678 			copy_addr -= adjustment;
12679 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12680 			/* ... adjust for mis-aligned start of copy range */
12681 			adjustment =
12682 			    (vm_map_trunc_page(copy->offset,
12683 			    PAGE_MASK) -
12684 			    vm_map_trunc_page(copy->offset,
12685 			    VM_MAP_PAGE_MASK(src_map)));
12686 			if (adjustment) {
12687 				assert(page_aligned(adjustment));
12688 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12689 				tmp_entry->vme_start += adjustment;
12690 				VME_OFFSET_SET(tmp_entry,
12691 				    (VME_OFFSET(tmp_entry) +
12692 				    adjustment));
12693 				copy_addr += adjustment;
12694 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12695 			}
12696 
12697 			/*
12698 			 * Assert that the adjustments haven't exposed
12699 			 * more than was originally copied...
12700 			 */
12701 			assert(tmp_entry->vme_start >= original_start);
12702 			assert(VME_OFFSET(tmp_entry) >= original_offset);
			/*
			 * ... and that it did not adjust outside of
			 * a single 16K page.
			 */
12707 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12708 			    VM_MAP_PAGE_MASK(src_map)) ==
12709 			    vm_map_trunc_page(original_start,
12710 			    VM_MAP_PAGE_MASK(src_map)));
12711 		}
12712 
12713 		/* adjust alignment of last copy_entry's "vme_end" */
12714 		tmp_entry = vm_map_copy_last_entry(copy);
12715 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12716 			vm_map_offset_t adjustment;
12717 
12718 			original_end = tmp_entry->vme_end;
12719 
12720 			/* map-align the end of the last copy entry... */
12721 			tmp_entry->vme_end =
12722 			    vm_map_round_page(tmp_entry->vme_end,
12723 			    VM_MAP_PAGE_MASK(src_map));
12724 			/* ... adjust for mis-aligned end of copy range */
12725 			adjustment =
12726 			    (vm_map_round_page((copy->offset +
12727 			    copy->size),
12728 			    VM_MAP_PAGE_MASK(src_map)) -
12729 			    vm_map_round_page((copy->offset +
12730 			    copy->size),
12731 			    PAGE_MASK));
12732 			if (adjustment) {
12733 				assert(page_aligned(adjustment));
12734 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12735 				tmp_entry->vme_end -= adjustment;
12736 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12737 			}
12738 
12739 			/*
12740 			 * Assert that the adjustments haven't exposed
12741 			 * more than was originally copied...
12742 			 */
12743 			assert(tmp_entry->vme_end <= original_end);
			/*
			 * ... and that it did not adjust outside of
			 * a single 16K page.
			 */
12748 			assert(vm_map_round_page(tmp_entry->vme_end,
12749 			    VM_MAP_PAGE_MASK(src_map)) ==
12750 			    vm_map_round_page(original_end,
12751 			    VM_MAP_PAGE_MASK(src_map)));
12752 		}
12753 	}
12754 
12755 	/* Fix-up start and end points in copy.  This is necessary */
12756 	/* when the various entries in the copy object were picked */
12757 	/* up from different sub-maps */
12758 
12759 	tmp_entry = vm_map_copy_first_entry(copy);
12760 	copy_size = 0; /* compute actual size */
12761 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12762 		assert(VM_MAP_PAGE_ALIGNED(
12763 			    copy_addr + (tmp_entry->vme_end -
12764 			    tmp_entry->vme_start),
12765 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12766 		assert(VM_MAP_PAGE_ALIGNED(
12767 			    copy_addr,
12768 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12769 
12770 		/*
12771 		 * The copy_entries will be injected directly into the
12772 		 * destination map and might not be "map aligned" there...
12773 		 */
12774 		tmp_entry->map_aligned = FALSE;
12775 
12776 		tmp_entry->vme_end = copy_addr +
12777 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12778 		tmp_entry->vme_start = copy_addr;
12779 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12780 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12781 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12782 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12783 	}
12784 
12785 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12786 	    copy_size < copy->size) {
12787 		/*
12788 		 * The actual size of the VM map copy is smaller than what
12789 		 * was requested by the caller.  This must be because some
12790 		 * PAGE_SIZE-sized pages are missing at the end of the last
12791 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12792 		 * The caller might not have been aware of those missing
12793 		 * pages and might not want to be aware of it, which is
12794 		 * fine as long as they don't try to access (and crash on)
12795 		 * those missing pages.
12796 		 * Let's adjust the size of the "copy", to avoid failing
12797 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12798 		 */
12799 		assert(vm_map_round_page(copy_size,
12800 		    VM_MAP_PAGE_MASK(src_map)) ==
12801 		    vm_map_round_page(copy->size,
12802 		    VM_MAP_PAGE_MASK(src_map)));
12803 		copy->size = copy_size;
12804 	}
12805 
12806 	*copy_result = copy;
12807 	return KERN_SUCCESS;
12808 
12809 #undef  RETURN
12810 }
12811 
12812 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12813 vm_map_copy_extract(
12814 	vm_map_t                src_map,
12815 	vm_map_address_t        src_addr,
12816 	vm_map_size_t           len,
12817 	boolean_t               do_copy,
12818 	vm_map_copy_t           *copy_result,   /* OUT */
12819 	vm_prot_t               *cur_prot,      /* IN/OUT */
12820 	vm_prot_t               *max_prot,      /* IN/OUT */
12821 	vm_inherit_t            inheritance,
12822 	vm_map_kernel_flags_t   vmk_flags)
12823 {
12824 	vm_map_copy_t   copy;
12825 	kern_return_t   kr;
12826 	vm_prot_t required_cur_prot, required_max_prot;
12827 
12828 	/*
12829 	 *	Check for copies of zero bytes.
12830 	 */
12831 
12832 	if (len == 0) {
12833 		*copy_result = VM_MAP_COPY_NULL;
12834 		return KERN_SUCCESS;
12835 	}
12836 
12837 	/*
12838 	 *	Check that the end address doesn't overflow
12839 	 */
12840 	if (src_addr + len < src_addr) {
12841 		return KERN_INVALID_ADDRESS;
12842 	}
12843 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12844 		return KERN_INVALID_ADDRESS;
12845 	}
12846 
12847 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12848 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12849 	}
12850 
12851 	required_cur_prot = *cur_prot;
12852 	required_max_prot = *max_prot;
12853 
12854 	/*
12855 	 *	Allocate a header element for the list.
12856 	 *
12857 	 *	Use the start and end in the header to
12858 	 *	remember the endpoints prior to rounding.
12859 	 */
12860 
12861 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12862 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12863 	copy->offset = 0;
12864 	copy->size = len;
12865 
12866 	kr = vm_map_remap_extract(src_map,
12867 	    src_addr,
12868 	    len,
12869 	    do_copy,             /* copy */
12870 	    copy,
12871 	    cur_prot,            /* IN/OUT */
12872 	    max_prot,            /* IN/OUT */
12873 	    inheritance,
12874 	    vmk_flags);
12875 	if (kr != KERN_SUCCESS) {
12876 		vm_map_copy_discard(copy);
12877 		if ((kr == KERN_INVALID_ADDRESS ||
12878 		    kr == KERN_INVALID_ARGUMENT) &&
12879 		    src_map->terminated) {
12880 			/* tell the caller that this address space is gone */
12881 			kr = KERN_TERMINATED;
12882 		}
12883 		return kr;
12884 	}
12885 	if (required_cur_prot != VM_PROT_NONE) {
12886 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12887 		assert((*max_prot & required_max_prot) == required_max_prot);
12888 	}
12889 
12890 	*copy_result = copy;
12891 	return KERN_SUCCESS;
12892 }
12893 
/*
 * vm_map_fork_share:
 *
 * Handle a shared (VM_INHERIT_SHARE, or corpse-fork) entry during
 * vm_map_fork(): clone "old_entry" from "old_map" into "new_map" so
 * that parent and child share the same backing object (or submap),
 * marking both entries as shared.  If the entry still has a deferred
 * symmetric copy pending (needs_copy, shadowed object, or an object
 * larger than the entry), a shadow object is created first so that
 * sharing does not break earlier copy-on-write semantics.
 * Both maps are expected to be locked by the caller (fork context).
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 *	New sharing code.  New map entry
	 *	references original object.  Internal
	 *	objects use asynchronous copy algorithm for
	 *	future copies.  First make sure we have
	 *	the right object.  If we need a shadow,
	 *	or someone else already has one, then
	 *	make a new shadow and share it.
	 */

	/* "object" is only initialized and used for non-submap entries */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		if (old_entry->use_pmap) {
			kern_return_t   result;

			/* nest the submap's pmap into the child's pmap */
			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif  /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one covering the entry */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 *	We are already using an asymmetric
		 *	copy, and therefore we already have
		 *	the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||       /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		bool is_writable;

		/*
		 *	We need to create a shadow.
		 *	There are three cases here.
		 *	In the first case, we need to
		 *	complete a deferred symmetrical
		 *	copy that we participated in.
		 *	In the second and third cases,
		 *	we need to create the shadow so
		 *	that changes that we make to the
		 *	object do not interfere with
		 *	any symmetrical copies which
		 *	have occurred (case 2) or which
		 *	might occur (case 3).
		 *
		 *	The first case is when we had
		 *	deferred shadow object creation
		 *	via the entry->needs_copy mechanism.
		 *	This mechanism only works when
		 *	only one entry points to the source
		 *	object, and we are about to create
		 *	a second entry pointing to the
		 *	same object. The problem is that
		 *	there is no way of mapping from
		 *	an object to the entries pointing
		 *	to it. (Deferred shadow creation
		 *	works with one entry because it occurs
		 *	at fault time, and we walk from the
		 *	entry to the object when handling
		 *	the fault.)
		 *
		 *	The second case is when the object
		 *	to be shared has already been copied
		 *	with a symmetric copy, but we point
		 *	directly to the object without
		 *	needs_copy set in our entry. (This
		 *	can happen because different ranges
		 *	of an object can be pointed to by
		 *	different entries. In particular,
		 *	a single entry pointing to an object
		 *	can be split by a call to vm_inherit,
		 *	which, combined with task_create, can
		 *	result in the different entries
		 *	having different needs_copy values.)
		 *	The shadowed flag in the object allows
		 *	us to detect this case. The problem
		 *	with this case is that if this object
		 *	has or will have shadows, then we
		 *	must not perform an asymmetric copy
		 *	of this object, since such a copy
		 *	allows the object to be changed, which
		 *	will break the previous symmetrical
		 *	copies (which rely upon the object
		 *	not changing). In a sense, the shadowed
		 *	flag says "don't change this object".
		 *	We fix this by creating a shadow
		 *	object for this object, and sharing
		 *	that. This works because we are free
		 *	to change the shadow object (and thus
		 *	to use an asymmetric copy strategy);
		 *	this is also semantically correct,
		 *	since this object is temporary, and
		 *	therefore a copy of the object is
		 *	as good as the object itself. (This
		 *	is not true for permanent objects,
		 *	since the pager needs to see changes,
		 *	which won't happen if the changes
		 *	are made to a copy.)
		 *
		 *	The third case is when the object
		 *	to be shared has parts sticking
		 *	outside of the entry we're working
		 *	with, and thus may in the future
		 *	be subject to a symmetrical copy.
		 *	(This is a preemptive version of
		 *	case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 *	If we're making a shadow for other than
		 *	copy on write reasons, then we have
		 *	to remove write permission.
		 */

		is_writable = false;
		if (old_entry->protection & VM_PROT_WRITE) {
			is_writable = true;
#if __arm64e__
		} else if (old_entry->used_for_tpro) {
			/* TPRO mappings are writable through the pmap */
			is_writable = true;
#endif /* __arm64e__ */
		}
		if (!old_entry->needs_copy && is_writable) {
			vm_prot_t prot;

			/*
			 * Entries whose protections the pmap enforces a
			 * policy on must not be downgraded here.
			 */
			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
				    __FUNCTION__, old_map, old_map->pmap,
				    old_entry,
				    (uint64_t)old_entry->vme_start,
				    (uint64_t)old_entry->vme_end,
				    old_entry->protection);
			}

			prot = old_entry->protection & ~VM_PROT_WRITE;

			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
				    __FUNCTION__, old_map, old_map->pmap,
				    old_entry,
				    (uint64_t)old_entry->vme_start,
				    (uint64_t)old_entry->vme_end,
				    prot);
			}

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			/*
			 * Write-protect the existing physical mappings so
			 * future writes fault and go to the shadow.
			 */
			if (old_map->mapped_in_other_pmaps) {
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 *	If object was using a symmetric copy strategy,
	 *	change its copy strategy to the default
	 *	asymmetric copy strategy, which is copy_delay
	 *	in the non-norma case and copy_call in the
	 *	norma case. Bump the reference count for the
	 *	new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 *	Clone the entry, using object ref from above.
	 *	Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 *	If old entry's inheritance is VM_INHERIT_NONE,
	 *	the new entry is for corpse fork, remove the
	 *	write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 *	Insert the entry into the new map -- we
	 *	know we're inserting at the end of the new
	 *	map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 *	Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
13175 
/*
 * vm_map_fork_copy:
 *
 * Handle a VM_INHERIT_COPY entry during vm_map_fork(): copy the range
 * covered by "*old_entry_p" from "old_map" into "new_map" using
 * vm_map_copyin_internal() with VM_MAP_COPYIN_USE_MAXPROT (plus any
 * caller-supplied vm_map_copyin_flags).
 *
 * "old_map" must be locked on entry; it is unlocked around the copyin
 * and relocked before returning, so the map may have changed in between.
 * On return, "*old_entry_p" points at the entry where the caller's
 * traversal should continue.
 *
 * Returns TRUE if the copy was made and inserted into "new_map",
 * FALSE if the range could not be copied and should be skipped.
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	/* capture the range now: old_entry may be invalid after unlocking */
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 *	Use maxprot version of copyin because we
	 *	care about whether this memory can ever
	 *	be accessed, not just whether it's accessible
	 *	right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 *	The map might have changed while it
		 *	was unlocked, check it again.  Skip
		 *	any blank space or permanently
		 *	unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 *	Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 *	Pick up the traversal at the end of
	 *	the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* "start" fell into a hole: continue with the next entry */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
13258 
#if PMAP_FORK_NEST
#define PMAP_FORK_NEST_DEBUG 0
/*
 * Undo pre-fork pmap nesting for whatever portion of [start, end)
 * intersects the pre-nested range [pre_nested_start, pre_nested_end).
 * The span actually unnested is widened to the pmap's shared-region
 * nesting granularity.  A no-op when nothing was pre-nested or when
 * the two ranges do not overlap.
 */
static inline void
vm_map_fork_unnest(
	pmap_t new_pmap,
	vm_map_offset_t pre_nested_start,
	vm_map_offset_t pre_nested_end,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_offset_t granule_mask, unnest_start, unnest_end;
	kern_return_t kr;

	assertf(pre_nested_start <= pre_nested_end,
	    "pre_nested start 0x%llx end 0x%llx",
	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
	assertf(start <= end,
	    "start 0x%llx end 0x%llx",
	    (uint64_t) start, (uint64_t)end);

	/*
	 * Nothing to do if no range was pre-nested, or if [start, end)
	 * lies entirely before or entirely after the pre-nested range.
	 */
	if (pre_nested_start == pre_nested_end ||
	    end <= pre_nested_start ||
	    start >= pre_nested_end) {
		return;
	}

	/* clip the requested range to the pre-nested range */
	start = (start < pre_nested_start) ? pre_nested_start : start;
	end = (end > pre_nested_end) ? pre_nested_end : end;

	/* widen to the shared-region nesting granularity */
	granule_mask = pmap_shared_region_size_min(new_pmap) - 1;
	unnest_start = start & ~granule_mask;
	unnest_end = (end + granule_mask) & ~granule_mask;

	kr = pmap_unnest(new_pmap,
	    (addr64_t)unnest_start,
	    (uint64_t)(unnest_end - unnest_start));
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)unnest_start, (uint64_t)unnest_end, kr);
#endif /* PMAP_FORK_NEST_DEBUG */
	assertf(kr == KERN_SUCCESS,
	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
	    (uint64_t)start, (uint64_t)end, new_pmap,
	    (uint64_t)unnest_start, (uint64_t)(unnest_end - unnest_start),
	    kr);
}
#endif /* PMAP_FORK_NEST */
13314 
13315 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13316 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13317 {
13318 	new_map->size_limit = old_map->size_limit;
13319 	new_map->data_limit = old_map->data_limit;
13320 	new_map->user_wire_limit = old_map->user_wire_limit;
13321 	new_map->reserved_regions = old_map->reserved_regions;
13322 }
13323 
13324 /*
13325  *	vm_map_fork:
13326  *
13327  *	Create and return a new map based on the old
13328  *	map, according to the inheritance values on the
13329  *	regions in that map and the options.
13330  *
13331  *	The source map must not be locked.
13332  */
vm_map_t
vm_map_fork(
	ledger_t        ledger,
	vm_map_t        old_map,
	int             options)
{
	pmap_t          new_pmap;
	vm_map_t        new_map;
	vm_map_entry_t  old_entry;
	vm_map_size_t   new_size = 0, entry_size;
	vm_map_entry_t  new_entry;
	boolean_t       src_needs_copy;
	boolean_t       new_entry_needs_copy;
	boolean_t       pmap_is64bit;
	int             vm_map_copyin_flags;
	vm_inherit_t    old_entry_inheritance;
	int             map_create_options;
	/*
	 * Only initialized and consulted when VM_MAP_FORK_CORPSE_FOOTPRINT
	 * is set in "options"; every read below is guarded by that flag.
	 */
	kern_return_t   footprint_collect_kr;

	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT |
	    VM_MAP_FORK_SHARE_IF_OWNED)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	/* the child's pmap matches the parent's address-space width */
	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#else
#error Unknown architecture.
#endif

	/* replicate the parent pmap's creation flags in the child pmap */
	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if CONFIG_ROSETTA
	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	/* hold a reference on old_map for the duration; dropped on exit */
	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		footprint_collect_kr = KERN_SUCCESS;
	}
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);

	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);

	/* both maps stay locked while entries are cloned below */
	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

	/* inherit the parent rlimits */
	vm_map_inherit_limits(new_map, old_map);

#if CONFIG_MAP_RANGES
	/* inherit the parent map's VM ranges */
	vm_map_range_fork(new_map, old_map);
#endif

#if CODE_SIGNING_MONITOR
	/* Prepare the monitor for the fork */
	csm_fork_prepare(old_map->pmap, new_pmap);
#endif

#if PMAP_FORK_NEST
	/*
	 * Pre-nest the shared region's pmap.
	 */
	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
	pmap_fork_nest(old_map->pmap, new_pmap,
	    &pre_nested_start, &pre_nested_end);
#if PMAP_FORK_NEST_DEBUG
	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
#endif /* PMAP_FORK_NEST_DEBUG */
#endif /* PMAP_FORK_NEST */

	/*
	 * Walk every entry of the old map; each entry is shared, copied
	 * or skipped according to its (possibly overridden) inheritance.
	 */
	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
#if PMAP_FORK_NEST
			new_entry = vm_map_last_entry(new_map);
			if (new_entry == vm_map_to_entry(new_map)) {
				/* unnest all that was pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    vm_map_min(new_map), vm_map_max(new_map));
			} else if (new_entry->vme_end < vm_map_max(new_map)) {
				/* unnest hole at the end, if pre-nested */
				vm_map_fork_unnest(new_pmap,
				    pre_nested_start, pre_nested_end,
				    new_entry->vme_end, vm_map_max(new_map));
			}
#endif /* PMAP_FORK_NEST */
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

#if PMAP_FORK_NEST
		/*
		 * Undo any unnecessary pre-nesting.
		 */
		vm_map_offset_t prev_end;
		if (old_entry == vm_map_first_entry(old_map)) {
			prev_end = vm_map_min(old_map);
		} else {
			prev_end = old_entry->vme_prev->vme_end;
		}
		if (prev_end < old_entry->vme_start) {
			/* unnest hole before this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    prev_end, old_entry->vme_start);
		}
		if (old_entry->is_sub_map && old_entry->use_pmap) {
			/* keep this entry nested in the child */
#if PMAP_FORK_NEST_DEBUG
			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
#endif /* PMAP_FORK_NEST_DEBUG */
		} else {
			/* undo nesting for this entry, if pre-nested */
			vm_map_fork_unnest(new_pmap,
			    pre_nested_start, pre_nested_end,
			    old_entry->vme_start, old_entry->vme_end);
		}
#endif /* PMAP_FORK_NEST */

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}
		if (old_entry_inheritance == VM_INHERIT_COPY &&
		    (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
		    !old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
			vm_object_t object;
			task_t owner;
			object = VME_OBJECT(old_entry);
			owner = VM_OBJECT_OWNER(object);
			if (owner != TASK_NULL &&
			    owner->map == old_map) {
				/*
				 * This mapping points at a VM object owned
				 * by the task being forked.
				 * Some tools reporting memory accounting
				 * info rely on the object ID, so share this
				 * mapping instead of copying, to make the
				 * corpse look exactly like the original
				 * task in that respect.
				 */
				assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
				old_entry_inheritance = VM_INHERIT_SHARE;
			}
		}

		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			/* not inherited: the child gets no mapping here */
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 *	Inline the copy_quickly case;
			 *	upon failure, fall back on call
			 *	to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->vme_permanent) {
				/* inherit "permanent" on fork() */
				new_entry->vme_permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				/* quick copy refused: discard and take the slow path */
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 *	Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
					    __FUNCTION__,
					    old_map, old_map->pmap, old_entry,
					    (uint64_t)old_entry->vme_start,
					    (uint64_t)old_entry->vme_end,
					    old_entry->protection);
				}

				/* write-protect the parent's mapping to trigger COW */
				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
					    __FUNCTION__,
					    old_map, old_map->pmap, old_entry,
					    (uint64_t)old_entry->vme_start,
					    (uint64_t)old_entry->vme_end,
					    prot);
				}

				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 *	Insert the entry at the end
			 *	of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			/* vm_map_fork_copy() already advanced old_entry */
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if PMAP_FORK_NEST
	/* unnest any pre-nested range past the child's last entry */
	new_entry = vm_map_last_entry(new_map);
	if (new_entry == vm_map_to_entry(new_map)) {
		/* unnest all that was pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    vm_map_min(new_map), vm_map_max(new_map));
	} else if (new_entry->vme_end < vm_map_max(new_map)) {
		/* unnest hole at the end, if pre-nested */
		vm_map_fork_unnest(new_pmap,
		    pre_nested_start, pre_nested_end,
		    new_entry->vme_end, vm_map_max(new_map));
	}
#endif /* PMAP_FORK_NEST */

#if defined(__arm64__)
	pmap_insert_commpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	/* Propagate TPRO settings for the pmap layer */
	if (pmap_get_tpro(old_map->pmap)) {
		/* Tell the pmap that it supports TPRO */
		pmap_set_tpro(new_map->pmap);
	}


	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	/* drop the reference taken at the top of this function */
	vm_map_deallocate(old_map);

	return new_map;
}
13741 
13742 /*
13743  * vm_map_exec:
13744  *
13745  *      Setup the "new_map" with the proper execution environment according
13746  *	to the type of executable (platform, 64bit, chroot environment).
13747  *	Map the comm page and shared region, etc...
13748  */
kern_return_t
vm_map_exec(
	vm_map_t        new_map,
	task_t          task,
	boolean_t       is64bit,
	void            *fsroot,
	cpu_type_t      cpu,
	cpu_subtype_t   cpu_subtype,
	boolean_t       reslide,
	boolean_t       is_driverkit,
	uint32_t        rsr_version)
{
	SHARED_REGION_TRACE_DEBUG(
		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
		(void *)VM_KERNEL_ADDRPERM(current_task()),
		(void *)VM_KERNEL_ADDRPERM(new_map),
		(void *)VM_KERNEL_ADDRPERM(task),
		(void *)VM_KERNEL_ADDRPERM(fsroot),
		cpu,
		cpu_subtype));
	/* map the commpage first, then the shared region; failures ignored */
	(void) vm_commpage_enter(new_map, task, is64bit);

	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);

	SHARED_REGION_TRACE_DEBUG(
		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
		(void *)VM_KERNEL_ADDRPERM(current_task()),
		(void *)VM_KERNEL_ADDRPERM(new_map),
		(void *)VM_KERNEL_ADDRPERM(task),
		(void *)VM_KERNEL_ADDRPERM(fsroot),
		cpu,
		cpu_subtype));

	/*
	 * Some devices have region(s) of memory that shouldn't get allocated by
	 * user processes. The following code creates dummy vm_map_entry_t's for each
	 * of the regions that needs to be reserved to prevent any allocations in
	 * those regions.
	 */
	kern_return_t kr = KERN_FAILURE;
	/* fixed, permanent, PROT_NONE placeholders; may lie beyond max_offset */
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
	vmk_flags.vmkf_beyond_max = true;

	const struct vm_reserved_region *regions = NULL;
	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));

	for (size_t i = 0; i < num_regions; ++i) {
		vm_map_offset_t address = regions[i].vmrr_addr;

		kr = vm_map_enter(
			new_map,
			&address,
			regions[i].vmrr_size,
			(vm_map_offset_t)0,
			vmk_flags,
			VM_OBJECT_NULL,
			(vm_object_offset_t)0,
			FALSE,
			VM_PROT_NONE,
			VM_PROT_NONE,
			VM_INHERIT_COPY);

		if (kr != KERN_SUCCESS) {
			/* reserving these regions is mandatory: a failure is fatal */
			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
		}
	}

	/* remember whether this map carries reserved-region placeholders */
	new_map->reserved_regions = (num_regions ? TRUE : FALSE);

	return KERN_SUCCESS;
}
13821 
/*
 * Diagnostic counters for vm_map_lookup_and_lock_object()'s
 * copy-on-write paths ("copy slowly", "copy strategically" and
 * shadow-object cases).  Presumably updated without synchronization,
 * so values are approximate under concurrency — telemetry only.
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13835 /*
13836  *	vm_map_lookup_and_lock_object:
13837  *
13838  *	Finds the VM object, offset, and
13839  *	protection for a given virtual address in the
13840  *	specified map, assuming a page fault of the
13841  *	type specified.
13842  *
13843  *	Returns the (object, offset, protection) for
13844  *	this address, whether it is wired down, and whether
13845  *	this map has the only reference to the data in question.
13846  *	In order to later verify this lookup, a "version"
13847  *	is returned.
13848  *	If contended != NULL, *contended will be set to
13849  *	true iff the thread had to spin or block to acquire
13850  *	an exclusive lock.
13851  *
13852  *	The map MUST be locked by the caller and WILL be
13853  *	locked on exit.  In order to guarantee the
13854  *	existence of the returned object, it is returned
13855  *	locked.
13856  *
13857  *	If a lookup is requested with "write protection"
13858  *	specified, the map may be changed to perform virtual
13859  *	copying operations, although the data referenced will
13860  *	remain the same.
13861  */
13862 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13863 vm_map_lookup_and_lock_object(
13864 	vm_map_t                *var_map,       /* IN/OUT */
13865 	vm_map_offset_t         vaddr,
13866 	vm_prot_t               fault_type,
13867 	int                     object_lock_type,
13868 	vm_map_version_t        *out_version,   /* OUT */
13869 	vm_object_t             *object,        /* OUT */
13870 	vm_object_offset_t      *offset,        /* OUT */
13871 	vm_prot_t               *out_prot,      /* OUT */
13872 	boolean_t               *wired,         /* OUT */
13873 	vm_object_fault_info_t  fault_info,     /* OUT */
13874 	vm_map_t                *real_map,      /* OUT */
13875 	bool                    *contended)     /* OUT */
13876 {
13877 	vm_map_entry_t                  entry;
13878 	vm_map_t                        map = *var_map;
13879 	vm_map_t                        old_map = *var_map;
13880 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13881 	vm_map_offset_t                 cow_parent_vaddr = 0;
13882 	vm_map_offset_t                 old_start = 0;
13883 	vm_map_offset_t                 old_end = 0;
13884 	vm_prot_t                       prot;
13885 	boolean_t                       mask_protections;
13886 	boolean_t                       force_copy;
13887 	boolean_t                       no_force_copy_if_executable;
13888 	boolean_t                       submap_needed_copy;
13889 	vm_prot_t                       original_fault_type;
13890 	vm_map_size_t                   fault_page_mask;
13891 
13892 	/*
13893 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13894 	 * as a mask against the mapping's actual protections, not as an
13895 	 * absolute value.
13896 	 */
13897 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13898 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13899 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13900 	fault_type &= VM_PROT_ALL;
13901 	original_fault_type = fault_type;
13902 	if (contended) {
13903 		*contended = false;
13904 	}
13905 
13906 	*real_map = map;
13907 
13908 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13909 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13910 
13911 RetryLookup:
13912 	fault_type = original_fault_type;
13913 
13914 	/*
13915 	 *	If the map has an interesting hint, try it before calling
13916 	 *	full blown lookup routine.
13917 	 */
13918 	entry = map->hint;
13919 
13920 	if ((entry == vm_map_to_entry(map)) ||
13921 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13922 		vm_map_entry_t  tmp_entry;
13923 
13924 		/*
13925 		 *	Entry was either not a valid hint, or the vaddr
13926 		 *	was not contained in the entry, so do a full lookup.
13927 		 */
13928 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13929 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13930 				vm_map_unlock(cow_sub_map_parent);
13931 			}
13932 			if ((*real_map != map)
13933 			    && (*real_map != cow_sub_map_parent)) {
13934 				vm_map_unlock(*real_map);
13935 			}
13936 			return KERN_INVALID_ADDRESS;
13937 		}
13938 
13939 		entry = tmp_entry;
13940 	}
13941 	if (map == old_map) {
13942 		old_start = entry->vme_start;
13943 		old_end = entry->vme_end;
13944 	}
13945 
13946 	/*
13947 	 *	Handle submaps.  Drop lock on upper map, submap is
13948 	 *	returned locked.
13949 	 */
13950 
13951 	submap_needed_copy = FALSE;
13952 submap_recurse:
13953 	if (entry->is_sub_map) {
13954 		vm_map_offset_t         local_vaddr;
13955 		vm_map_offset_t         end_delta;
13956 		vm_map_offset_t         start_delta;
13957 		vm_map_offset_t         top_entry_saved_start;
13958 		vm_object_offset_t      top_entry_saved_offset;
13959 		vm_map_entry_t          submap_entry, saved_submap_entry;
13960 		vm_object_offset_t      submap_entry_offset;
13961 		vm_object_size_t        submap_entry_size;
13962 		vm_prot_t               subentry_protection;
13963 		vm_prot_t               subentry_max_protection;
13964 		boolean_t               subentry_no_copy_on_read;
13965 		boolean_t               subentry_permanent;
13966 		boolean_t               subentry_csm_associated;
13967 #if __arm64e__
13968 		boolean_t               subentry_used_for_tpro;
13969 #endif /* __arm64e__ */
13970 		boolean_t               mapped_needs_copy = FALSE;
13971 		vm_map_version_t        version;
13972 
13973 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13974 		    "map %p (%d) entry %p submap %p (%d)\n",
13975 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13976 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13977 
13978 		local_vaddr = vaddr;
13979 		top_entry_saved_start = entry->vme_start;
13980 		top_entry_saved_offset = VME_OFFSET(entry);
13981 
13982 		if ((entry->use_pmap &&
13983 		    !((fault_type & VM_PROT_WRITE) ||
13984 		    force_copy))) {
13985 			/* if real_map equals map we unlock below */
13986 			if ((*real_map != map) &&
13987 			    (*real_map != cow_sub_map_parent)) {
13988 				vm_map_unlock(*real_map);
13989 			}
13990 			*real_map = VME_SUBMAP(entry);
13991 		}
13992 
13993 		if (entry->needs_copy &&
13994 		    ((fault_type & VM_PROT_WRITE) ||
13995 		    force_copy)) {
13996 			if (!mapped_needs_copy) {
13997 				if (vm_map_lock_read_to_write(map)) {
13998 					vm_map_lock_read(map);
13999 					*real_map = map;
14000 					goto RetryLookup;
14001 				}
14002 				vm_map_lock_read(VME_SUBMAP(entry));
14003 				*var_map = VME_SUBMAP(entry);
14004 				cow_sub_map_parent = map;
14005 				/* reset base to map before cow object */
14006 				/* this is the map which will accept   */
14007 				/* the new cow object */
14008 				old_start = entry->vme_start;
14009 				old_end = entry->vme_end;
14010 				cow_parent_vaddr = vaddr;
14011 				mapped_needs_copy = TRUE;
14012 			} else {
14013 				vm_map_lock_read(VME_SUBMAP(entry));
14014 				*var_map = VME_SUBMAP(entry);
14015 				if ((cow_sub_map_parent != map) &&
14016 				    (*real_map != map)) {
14017 					vm_map_unlock(map);
14018 				}
14019 			}
14020 		} else {
14021 			if (entry->needs_copy) {
14022 				submap_needed_copy = TRUE;
14023 			}
14024 			vm_map_lock_read(VME_SUBMAP(entry));
14025 			*var_map = VME_SUBMAP(entry);
14026 			/* leave map locked if it is a target */
14027 			/* cow sub_map above otherwise, just  */
14028 			/* follow the maps down to the object */
14029 			/* here we unlock knowing we are not  */
14030 			/* revisiting the map.  */
14031 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
14032 				vm_map_unlock_read(map);
14033 			}
14034 		}
14035 
14036 		entry = NULL;
14037 		map = *var_map;
14038 
14039 		/* calculate the offset in the submap for vaddr */
14040 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14041 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14042 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14043 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14044 
14045 RetrySubMap:
14046 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14047 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14048 				vm_map_unlock(cow_sub_map_parent);
14049 			}
14050 			if ((*real_map != map)
14051 			    && (*real_map != cow_sub_map_parent)) {
14052 				vm_map_unlock(*real_map);
14053 			}
14054 			*real_map = map;
14055 			return KERN_INVALID_ADDRESS;
14056 		}
14057 
14058 		/* find the attenuated shadow of the underlying object */
14059 		/* on our target map */
14060 
14061 		/* in english the submap object may extend beyond the     */
14062 		/* region mapped by the entry or, may only fill a portion */
14063 		/* of it.  For our purposes, we only care if the object   */
14064 		/* doesn't fill.  In this case the area which will        */
14065 		/* ultimately be clipped in the top map will only need    */
14066 		/* to be as big as the portion of the underlying entry    */
14067 		/* which is mapped */
14068 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14069 		    submap_entry->vme_start - top_entry_saved_offset : 0;
14070 
14071 		end_delta =
14072 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14073 		    submap_entry->vme_end ?
14074 		    0 : (top_entry_saved_offset +
14075 		    (old_end - old_start))
14076 		    - submap_entry->vme_end;
14077 
14078 		old_start += start_delta;
14079 		old_end -= end_delta;
14080 
14081 		if (submap_entry->is_sub_map) {
14082 			entry = submap_entry;
14083 			vaddr = local_vaddr;
14084 			goto submap_recurse;
14085 		}
14086 
14087 		if (((fault_type & VM_PROT_WRITE) ||
14088 		    force_copy)
14089 		    && cow_sub_map_parent) {
14090 			vm_object_t     sub_object, copy_object;
14091 			vm_object_offset_t copy_offset;
14092 			vm_map_offset_t local_start;
14093 			vm_map_offset_t local_end;
14094 			boolean_t       object_copied = FALSE;
14095 			vm_object_offset_t object_copied_offset = 0;
14096 			boolean_t       object_copied_needs_copy = FALSE;
14097 			kern_return_t   kr = KERN_SUCCESS;
14098 
14099 			if (vm_map_lock_read_to_write(map)) {
14100 				vm_map_lock_read(map);
14101 				old_start -= start_delta;
14102 				old_end += end_delta;
14103 				goto RetrySubMap;
14104 			}
14105 
14106 
14107 			sub_object = VME_OBJECT(submap_entry);
14108 			if (sub_object == VM_OBJECT_NULL) {
14109 				sub_object =
14110 				    vm_object_allocate(
14111 					(vm_map_size_t)
14112 					(submap_entry->vme_end -
14113 					submap_entry->vme_start));
14114 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14115 				VME_OFFSET_SET(submap_entry, 0);
14116 				assert(!submap_entry->is_sub_map);
14117 				assert(submap_entry->use_pmap);
14118 			}
14119 			local_start =  local_vaddr -
14120 			    (cow_parent_vaddr - old_start);
14121 			local_end = local_vaddr +
14122 			    (old_end - cow_parent_vaddr);
14123 			vm_map_clip_start(map, submap_entry, local_start);
14124 			vm_map_clip_end(map, submap_entry, local_end);
14125 			if (submap_entry->is_sub_map) {
14126 				/* unnesting was done when clipping */
14127 				assert(!submap_entry->use_pmap);
14128 			}
14129 
14130 			/* This is the COW case, lets connect */
14131 			/* an entry in our space to the underlying */
14132 			/* object in the submap, bypassing the  */
14133 			/* submap. */
14134 			submap_entry_offset = VME_OFFSET(submap_entry);
14135 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14136 
14137 			if ((submap_entry->wired_count != 0 ||
14138 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14139 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
14140 			    no_force_copy_if_executable) {
14141 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14142 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14143 					vm_map_unlock(cow_sub_map_parent);
14144 				}
14145 				if ((*real_map != map)
14146 				    && (*real_map != cow_sub_map_parent)) {
14147 					vm_map_unlock(*real_map);
14148 				}
14149 				*real_map = map;
14150 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14151 				vm_map_lock_write_to_read(map);
14152 				kr = KERN_PROTECTION_FAILURE;
14153 				DTRACE_VM4(submap_no_copy_executable,
14154 				    vm_map_t, map,
14155 				    vm_object_offset_t, submap_entry_offset,
14156 				    vm_object_size_t, submap_entry_size,
14157 				    int, kr);
14158 				return kr;
14159 			}
14160 
14161 			if (submap_entry->wired_count != 0) {
14162 				vm_object_reference(sub_object);
14163 
14164 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14165 				    "submap_entry %p offset 0x%llx\n",
14166 				    submap_entry, VME_OFFSET(submap_entry));
14167 
14168 				DTRACE_VM6(submap_copy_slowly,
14169 				    vm_map_t, cow_sub_map_parent,
14170 				    vm_map_offset_t, vaddr,
14171 				    vm_map_t, map,
14172 				    vm_object_size_t, submap_entry_size,
14173 				    int, submap_entry->wired_count,
14174 				    int, sub_object->copy_strategy);
14175 
14176 				saved_submap_entry = submap_entry;
14177 				version.main_timestamp = map->timestamp;
14178 				vm_map_unlock(map); /* Increments timestamp by 1 */
14179 				submap_entry = VM_MAP_ENTRY_NULL;
14180 
14181 				vm_object_lock(sub_object);
14182 				kr = vm_object_copy_slowly(sub_object,
14183 				    submap_entry_offset,
14184 				    submap_entry_size,
14185 				    FALSE,
14186 				    &copy_object);
14187 				object_copied = TRUE;
14188 				object_copied_offset = 0;
14189 				/* 4k: account for extra offset in physical page */
14190 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14191 				object_copied_needs_copy = FALSE;
14192 				vm_object_deallocate(sub_object);
14193 
14194 				vm_map_lock(map);
14195 
14196 				if (kr != KERN_SUCCESS &&
14197 				    kr != KERN_MEMORY_RESTART_COPY) {
14198 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14199 						vm_map_unlock(cow_sub_map_parent);
14200 					}
14201 					if ((*real_map != map)
14202 					    && (*real_map != cow_sub_map_parent)) {
14203 						vm_map_unlock(*real_map);
14204 					}
14205 					*real_map = map;
14206 					vm_object_deallocate(copy_object);
14207 					copy_object = VM_OBJECT_NULL;
14208 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14209 					vm_map_lock_write_to_read(map);
14210 					DTRACE_VM4(submap_copy_error_slowly,
14211 					    vm_object_t, sub_object,
14212 					    vm_object_offset_t, submap_entry_offset,
14213 					    vm_object_size_t, submap_entry_size,
14214 					    int, kr);
14215 					vm_map_lookup_and_lock_object_copy_slowly_error++;
14216 					return kr;
14217 				}
14218 
14219 				if ((kr == KERN_SUCCESS) &&
14220 				    (version.main_timestamp + 1) == map->timestamp) {
14221 					submap_entry = saved_submap_entry;
14222 				} else {
14223 					saved_submap_entry = NULL;
14224 					old_start -= start_delta;
14225 					old_end += end_delta;
14226 					vm_object_deallocate(copy_object);
14227 					copy_object = VM_OBJECT_NULL;
14228 					vm_map_lock_write_to_read(map);
14229 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
14230 					goto RetrySubMap;
14231 				}
14232 				vm_map_lookup_and_lock_object_copy_slowly_count++;
14233 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14234 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14235 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14236 				}
14237 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14238 				submap_entry_offset = VME_OFFSET(submap_entry);
14239 				copy_object = VM_OBJECT_NULL;
14240 				object_copied_offset = submap_entry_offset;
14241 				object_copied_needs_copy = FALSE;
14242 				DTRACE_VM6(submap_copy_strategically,
14243 				    vm_map_t, cow_sub_map_parent,
14244 				    vm_map_offset_t, vaddr,
14245 				    vm_map_t, map,
14246 				    vm_object_size_t, submap_entry_size,
14247 				    int, submap_entry->wired_count,
14248 				    int, sub_object->copy_strategy);
14249 				kr = vm_object_copy_strategically(
14250 					sub_object,
14251 					submap_entry_offset,
14252 					submap_entry->vme_end - submap_entry->vme_start,
14253 					false, /* forking */
14254 					&copy_object,
14255 					&object_copied_offset,
14256 					&object_copied_needs_copy);
14257 				if (kr == KERN_MEMORY_RESTART_COPY) {
14258 					old_start -= start_delta;
14259 					old_end += end_delta;
14260 					vm_object_deallocate(copy_object);
14261 					copy_object = VM_OBJECT_NULL;
14262 					vm_map_lock_write_to_read(map);
14263 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14264 					goto RetrySubMap;
14265 				}
14266 				if (kr != KERN_SUCCESS) {
14267 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14268 						vm_map_unlock(cow_sub_map_parent);
14269 					}
14270 					if ((*real_map != map)
14271 					    && (*real_map != cow_sub_map_parent)) {
14272 						vm_map_unlock(*real_map);
14273 					}
14274 					*real_map = map;
14275 					vm_object_deallocate(copy_object);
14276 					copy_object = VM_OBJECT_NULL;
14277 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14278 					vm_map_lock_write_to_read(map);
14279 					DTRACE_VM4(submap_copy_error_strategically,
14280 					    vm_object_t, sub_object,
14281 					    vm_object_offset_t, submap_entry_offset,
14282 					    vm_object_size_t, submap_entry_size,
14283 					    int, kr);
14284 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14285 					return kr;
14286 				}
14287 				assert(copy_object != VM_OBJECT_NULL);
14288 				assert(copy_object != sub_object);
14289 				object_copied = TRUE;
14290 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14291 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14292 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14293 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14294 				}
14295 			} else {
14296 				/* set up shadow object */
14297 				object_copied = FALSE;
14298 				copy_object = sub_object;
14299 				vm_object_lock(sub_object);
14300 				vm_object_reference_locked(sub_object);
14301 				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14302 				vm_object_unlock(sub_object);
14303 
14304 				assert(submap_entry->wired_count == 0);
14305 				submap_entry->needs_copy = TRUE;
14306 
14307 				prot = submap_entry->protection;
14308 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14309 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14310 					    __FUNCTION__,
14311 					    map, map->pmap, submap_entry,
14312 					    (uint64_t)submap_entry->vme_start,
14313 					    (uint64_t)submap_entry->vme_end,
14314 					    prot);
14315 				}
14316 				prot = prot & ~VM_PROT_WRITE;
14317 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14318 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14319 					    __FUNCTION__,
14320 					    map, map->pmap, submap_entry,
14321 					    (uint64_t)submap_entry->vme_start,
14322 					    (uint64_t)submap_entry->vme_end,
14323 					    prot);
14324 				}
14325 
14326 				if (override_nx(old_map,
14327 				    VME_ALIAS(submap_entry))
14328 				    && prot) {
14329 					prot |= VM_PROT_EXECUTE;
14330 				}
14331 
14332 				vm_object_pmap_protect(
14333 					sub_object,
14334 					VME_OFFSET(submap_entry),
14335 					submap_entry->vme_end -
14336 					submap_entry->vme_start,
14337 					(submap_entry->is_shared
14338 					|| map->mapped_in_other_pmaps) ?
14339 					PMAP_NULL : map->pmap,
14340 					VM_MAP_PAGE_SIZE(map),
14341 					submap_entry->vme_start,
14342 					prot);
14343 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14344 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14345 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14346 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14347 				}
14348 			}
14349 
14350 			/*
14351 			 * Adjust the fault offset to the submap entry.
14352 			 */
14353 			copy_offset = (local_vaddr -
14354 			    submap_entry->vme_start +
14355 			    VME_OFFSET(submap_entry));
14356 
14357 			/* This works diffently than the   */
14358 			/* normal submap case. We go back  */
14359 			/* to the parent of the cow map and*/
14360 			/* clip out the target portion of  */
14361 			/* the sub_map, substituting the   */
14362 			/* new copy object,                */
14363 
14364 			subentry_protection = submap_entry->protection;
14365 			subentry_max_protection = submap_entry->max_protection;
14366 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14367 			subentry_permanent = submap_entry->vme_permanent;
14368 			subentry_csm_associated = submap_entry->csm_associated;
14369 #if __arm64e__
14370 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14371 #endif // __arm64e__
14372 			vm_map_unlock(map);
14373 			submap_entry = NULL; /* not valid after map unlock */
14374 
14375 			local_start = old_start;
14376 			local_end = old_end;
14377 			map = cow_sub_map_parent;
14378 			*var_map = cow_sub_map_parent;
14379 			vaddr = cow_parent_vaddr;
14380 			cow_sub_map_parent = NULL;
14381 
14382 			if (!vm_map_lookup_entry(map,
14383 			    vaddr, &entry)) {
14384 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14385 					vm_map_unlock(cow_sub_map_parent);
14386 				}
14387 				if ((*real_map != map)
14388 				    && (*real_map != cow_sub_map_parent)) {
14389 					vm_map_unlock(*real_map);
14390 				}
14391 				*real_map = map;
14392 				vm_object_deallocate(
14393 					copy_object);
14394 				copy_object = VM_OBJECT_NULL;
14395 				vm_map_lock_write_to_read(map);
14396 				DTRACE_VM4(submap_lookup_post_unlock,
14397 				    uint64_t, (uint64_t)entry->vme_start,
14398 				    uint64_t, (uint64_t)entry->vme_end,
14399 				    vm_map_offset_t, vaddr,
14400 				    int, object_copied);
14401 				return KERN_INVALID_ADDRESS;
14402 			}
14403 
14404 			/* clip out the portion of space */
14405 			/* mapped by the sub map which   */
14406 			/* corresponds to the underlying */
14407 			/* object */
14408 
14409 			/*
14410 			 * Clip (and unnest) the smallest nested chunk
14411 			 * possible around the faulting address...
14412 			 */
14413 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14414 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14415 			/*
14416 			 * ... but don't go beyond the "old_start" to "old_end"
14417 			 * range, to avoid spanning over another VM region
14418 			 * with a possibly different VM object and/or offset.
14419 			 */
14420 			if (local_start < old_start) {
14421 				local_start = old_start;
14422 			}
14423 			if (local_end > old_end) {
14424 				local_end = old_end;
14425 			}
14426 			/*
14427 			 * Adjust copy_offset to the start of the range.
14428 			 */
14429 			copy_offset -= (vaddr - local_start);
14430 
14431 			vm_map_clip_start(map, entry, local_start);
14432 			vm_map_clip_end(map, entry, local_end);
14433 			if (entry->is_sub_map) {
14434 				/* unnesting was done when clipping */
14435 				assert(!entry->use_pmap);
14436 			}
14437 
14438 			/* substitute copy object for */
14439 			/* shared map entry           */
14440 			vm_map_deallocate(VME_SUBMAP(entry));
14441 			assert(!entry->iokit_acct);
14442 			entry->use_pmap = TRUE;
14443 			VME_OBJECT_SET(entry, copy_object, false, 0);
14444 
14445 			/* propagate the submap entry's protections */
14446 			if (entry->protection != VM_PROT_READ) {
14447 				/*
14448 				 * Someone has already altered the top entry's
14449 				 * protections via vm_protect(VM_PROT_COPY).
14450 				 * Respect these new values and ignore the
14451 				 * submap entry's protections.
14452 				 */
14453 			} else {
14454 				/*
14455 				 * Regular copy-on-write: propagate the submap
14456 				 * entry's protections to the top map entry.
14457 				 */
14458 				entry->protection |= subentry_protection;
14459 			}
14460 			entry->max_protection |= subentry_max_protection;
14461 			/* propagate some attributes from subentry */
14462 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14463 			entry->vme_permanent = subentry_permanent;
14464 			entry->csm_associated = subentry_csm_associated;
14465 #if __arm64e__
14466 			/* propagate TPRO iff the destination map has TPRO enabled */
14467 			if (subentry_used_for_tpro) {
14468 				if (vm_map_tpro(map)) {
14469 					entry->used_for_tpro = subentry_used_for_tpro;
14470 				} else {
14471 					/* "permanent" came from being TPRO */
14472 					entry->vme_permanent = FALSE;
14473 				}
14474 			}
14475 #endif /* __arm64e */
14476 			if ((entry->protection & VM_PROT_WRITE) &&
14477 			    (entry->protection & VM_PROT_EXECUTE) &&
14478 #if XNU_TARGET_OS_OSX
14479 			    map->pmap != kernel_pmap &&
14480 			    (vm_map_cs_enforcement(map)
14481 #if __arm64__
14482 			    || !VM_MAP_IS_EXOTIC(map)
14483 #endif /* __arm64__ */
14484 			    ) &&
14485 #endif /* XNU_TARGET_OS_OSX */
14486 #if CODE_SIGNING_MONITOR
14487 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14488 #endif
14489 			    !(entry->used_for_jit) &&
14490 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14491 				DTRACE_VM3(cs_wx,
14492 				    uint64_t, (uint64_t)entry->vme_start,
14493 				    uint64_t, (uint64_t)entry->vme_end,
14494 				    vm_prot_t, entry->protection);
14495 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14496 				    proc_selfpid(),
14497 				    (get_bsdtask_info(current_task())
14498 				    ? proc_name_address(get_bsdtask_info(current_task()))
14499 				    : "?"),
14500 				    __FUNCTION__, __LINE__,
14501 #if DEVELOPMENT || DEBUG
14502 				    (uint64_t)entry->vme_start,
14503 				    (uint64_t)entry->vme_end,
14504 #else /* DEVELOPMENT || DEBUG */
14505 				    (uint64_t)0,
14506 				    (uint64_t)0,
14507 #endif /* DEVELOPMENT || DEBUG */
14508 				    entry->protection);
14509 				entry->protection &= ~VM_PROT_EXECUTE;
14510 			}
14511 
14512 			if (object_copied) {
14513 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14514 				entry->needs_copy = object_copied_needs_copy;
14515 				entry->is_shared = FALSE;
14516 			} else {
14517 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14518 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14519 				assert(entry->wired_count == 0);
14520 				VME_OFFSET_SET(entry, copy_offset);
14521 				entry->needs_copy = TRUE;
14522 				if (map != old_map) {
14523 					entry->is_shared = TRUE;
14524 				}
14525 			}
14526 			if (entry->inheritance == VM_INHERIT_SHARE) {
14527 				entry->inheritance = VM_INHERIT_COPY;
14528 			}
14529 
14530 			vm_map_lock_write_to_read(map);
14531 		} else {
14532 			if ((cow_sub_map_parent)
14533 			    && (cow_sub_map_parent != *real_map)
14534 			    && (cow_sub_map_parent != map)) {
14535 				vm_map_unlock(cow_sub_map_parent);
14536 			}
14537 			entry = submap_entry;
14538 			vaddr = local_vaddr;
14539 		}
14540 	}
14541 
14542 	/*
14543 	 *	Check whether this task is allowed to have
14544 	 *	this page.
14545 	 */
14546 
14547 	prot = entry->protection;
14548 
14549 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14550 		/*
14551 		 * HACK -- if not a stack, then allow execution
14552 		 */
14553 		prot |= VM_PROT_EXECUTE;
14554 	}
14555 
14556 #if __arm64e__
14557 	/*
14558 	 * If the entry we're dealing with is TPRO and we have a write
14559 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14560 	 * to maintain RO permissions when not marked as TPRO.
14561 	 */
14562 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14563 		prot |= VM_PROT_WRITE;
14564 	}
14565 #endif /* __arm64e__ */
14566 	if (mask_protections) {
14567 		fault_type &= prot;
14568 		if (fault_type == VM_PROT_NONE) {
14569 			goto protection_failure;
14570 		}
14571 	}
14572 	if (((fault_type & prot) != fault_type)
14573 #if __arm64__
14574 	    /* prefetch abort in execute-only page */
14575 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14576 #elif defined(__x86_64__)
14577 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14578 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14579 #endif
14580 	    ) {
14581 protection_failure:
14582 		if (*real_map != map) {
14583 			vm_map_unlock(*real_map);
14584 		}
14585 		*real_map = map;
14586 
14587 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14588 			log_stack_execution_failure((addr64_t)vaddr, prot);
14589 		}
14590 
14591 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14592 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14593 		/*
14594 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14595 		 *
14596 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14597 		 */
14598 		return KERN_PROTECTION_FAILURE;
14599 	}
14600 
14601 	/*
14602 	 *	If this page is not pageable, we have to get
14603 	 *	it for all possible accesses.
14604 	 */
14605 
14606 	*wired = (entry->wired_count != 0);
14607 	if (*wired) {
14608 		fault_type = prot;
14609 	}
14610 
14611 	/*
14612 	 *	If the entry was copy-on-write, we either ...
14613 	 */
14614 
14615 	if (entry->needs_copy) {
14616 		/*
14617 		 *	If we want to write the page, we may as well
14618 		 *	handle that now since we've got the map locked.
14619 		 *
14620 		 *	If we don't need to write the page, we just
14621 		 *	demote the permissions allowed.
14622 		 */
14623 
14624 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14625 			/*
14626 			 *	Make a new object, and place it in the
14627 			 *	object chain.  Note that no new references
14628 			 *	have appeared -- one just moved from the
14629 			 *	map to the new object.
14630 			 */
14631 
14632 			if (vm_map_lock_read_to_write(map)) {
14633 				vm_map_lock_read(map);
14634 				goto RetryLookup;
14635 			}
14636 
14637 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14638 				vm_object_lock(VME_OBJECT(entry));
14639 				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14640 				vm_object_unlock(VME_OBJECT(entry));
14641 			}
14642 			VME_OBJECT_SHADOW(entry,
14643 			    (vm_map_size_t) (entry->vme_end -
14644 			    entry->vme_start),
14645 			    vm_map_always_shadow(map));
14646 			entry->needs_copy = FALSE;
14647 
14648 			vm_map_lock_write_to_read(map);
14649 		}
14650 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14651 			/*
14652 			 *	We're attempting to read a copy-on-write
14653 			 *	page -- don't allow writes.
14654 			 */
14655 
14656 			prot &= (~VM_PROT_WRITE);
14657 		}
14658 	}
14659 
14660 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14661 		/*
14662 		 * We went through a "needs_copy" submap without triggering
14663 		 * a copy, so granting write access to the page would bypass
14664 		 * that submap's "needs_copy".
14665 		 */
14666 		assert(!(fault_type & VM_PROT_WRITE));
14667 		assert(!*wired);
14668 		assert(!force_copy);
14669 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14670 		prot &= ~VM_PROT_WRITE;
14671 	}
14672 
14673 	/*
14674 	 *	Create an object if necessary.
14675 	 */
14676 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14677 		if (vm_map_lock_read_to_write(map)) {
14678 			vm_map_lock_read(map);
14679 			goto RetryLookup;
14680 		}
14681 
14682 		VME_OBJECT_SET(entry,
14683 		    vm_object_allocate(
14684 			    (vm_map_size_t)(entry->vme_end -
14685 			    entry->vme_start)), false, 0);
14686 		VME_OFFSET_SET(entry, 0);
14687 		assert(entry->use_pmap);
14688 		vm_map_lock_write_to_read(map);
14689 	}
14690 
14691 	/*
14692 	 *	Return the object/offset from this entry.  If the entry
14693 	 *	was copy-on-write or empty, it has been fixed up.  Also
14694 	 *	return the protection.
14695 	 */
14696 
14697 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14698 	*object = VME_OBJECT(entry);
14699 	*out_prot = prot;
14700 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14701 
14702 	if (fault_info) {
14703 		/* ... the caller will change "interruptible" if needed */
14704 		fault_info->user_tag = VME_ALIAS(entry);
14705 		fault_info->pmap_options = 0;
14706 		if (entry->iokit_acct ||
14707 		    (!entry->is_sub_map && !entry->use_pmap)) {
14708 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14709 		}
14710 		if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14711 			fault_info->behavior = entry->behavior;
14712 		}
14713 		fault_info->lo_offset = VME_OFFSET(entry);
14714 		fault_info->hi_offset =
14715 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14716 		fault_info->no_cache  = entry->no_cache;
14717 		fault_info->stealth = FALSE;
14718 		fault_info->io_sync = FALSE;
14719 		if (entry->used_for_jit ||
14720 #if CODE_SIGNING_MONITOR
14721 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14722 #endif
14723 		    entry->vme_resilient_codesign) {
14724 			fault_info->cs_bypass = TRUE;
14725 		} else {
14726 			fault_info->cs_bypass = FALSE;
14727 		}
14728 		fault_info->csm_associated = FALSE;
14729 #if CODE_SIGNING_MONITOR
14730 		if (entry->csm_associated) {
14731 			/*
14732 			 * The pmap layer will validate this page
14733 			 * before allowing it to be executed from.
14734 			 */
14735 			fault_info->csm_associated = TRUE;
14736 		}
14737 #endif
14738 		fault_info->mark_zf_absent = FALSE;
14739 		fault_info->batch_pmap_op = FALSE;
14740 		fault_info->resilient_media = entry->vme_resilient_media;
14741 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14742 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14743 #if __arm64e__
14744 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14745 #else /* __arm64e__ */
14746 		fault_info->fi_used_for_tpro = FALSE;
14747 #endif
14748 		if (entry->translated_allow_execute) {
14749 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14750 		}
14751 	}
14752 
14753 	/*
14754 	 *	Lock the object to prevent it from disappearing
14755 	 */
14756 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14757 		if (contended == NULL) {
14758 			vm_object_lock(*object);
14759 		} else {
14760 			*contended = vm_object_lock_check_contended(*object);
14761 		}
14762 	} else {
14763 		vm_object_lock_shared(*object);
14764 	}
14765 
14766 	/*
14767 	 *	Save the version number
14768 	 */
14769 
14770 	out_version->main_timestamp = map->timestamp;
14771 
14772 	return KERN_SUCCESS;
14773 }
14774 
14775 
14776 /*
14777  *	vm_map_verify:
14778  *
14779  *	Verifies that the map in question has not changed
14780  *	since the given version. The map has to be locked
14781  *	("shared" mode is fine) before calling this function
14782  *	and it will be returned locked too.
14783  */
14784 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14785 vm_map_verify(
14786 	vm_map_t                map,
14787 	vm_map_version_t        *version)       /* REF */
14788 {
14789 	boolean_t       result;
14790 
14791 	vm_map_lock_assert_held(map);
14792 	result = (map->timestamp == version->main_timestamp);
14793 
14794 	return result;
14795 }
14796 
14797 /*
14798  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14799  *	Goes away after regular vm_region_recurse function migrates to
14800  *	64 bits
14801  *	vm_region_recurse: A form of vm_region which follows the
14802  *	submaps in a target map
14803  *
14804  */
14805 
14806 kern_return_t
vm_map_region_recurse_64(vm_map_t map,vm_map_offset_ut * address_u,vm_map_size_ut * size_u,natural_t * nesting_depth,vm_region_submap_info_64_t submap_info,mach_msg_type_number_t * count)14807 vm_map_region_recurse_64(
14808 	vm_map_t                map,
14809 	vm_map_offset_ut       *address_u,      /* IN/OUT */
14810 	vm_map_size_ut         *size_u,         /* OUT */
14811 	natural_t              *nesting_depth,  /* IN/OUT */
14812 	vm_region_submap_info_64_t submap_info, /* IN/OUT */
14813 	mach_msg_type_number_t *count)          /* IN/OUT */
14814 {
14815 	mach_msg_type_number_t  original_count;
14816 	vm_region_extended_info_data_t  extended;
14817 	vm_map_entry_t                  tmp_entry;
14818 	vm_map_offset_t                 user_address;
14819 	unsigned int                    user_max_depth;
14820 
14821 	/*
14822 	 * "curr_entry" is the VM map entry preceding or including the
14823 	 * address we're looking for.
14824 	 * "curr_map" is the map or sub-map containing "curr_entry".
14825 	 * "curr_address" is the equivalent of the top map's "user_address"
14826 	 * in the current map.
14827 	 * "curr_offset" is the cumulated offset of "curr_map" in the
14828 	 * target task's address space.
14829 	 * "curr_depth" is the depth of "curr_map" in the chain of
14830 	 * sub-maps.
14831 	 *
14832 	 * "curr_max_below" and "curr_max_above" limit the range (around
14833 	 * "curr_address") we should take into account in the current (sub)map.
14834 	 * They limit the range to what's visible through the map entries
14835 	 * we've traversed from the top map to the current map.
14836 	 *
14837 	 */
14838 	vm_map_entry_t                  curr_entry;
14839 	vm_map_address_t                curr_address;
14840 	vm_map_offset_t                 curr_offset;
14841 	vm_map_t                        curr_map;
14842 	unsigned int                    curr_depth;
14843 	vm_map_offset_t                 curr_max_below, curr_max_above;
14844 	vm_map_offset_t                 curr_skip;
14845 
14846 	/*
14847 	 * "next_" is the same as "curr_" but for the VM region immediately
14848 	 * after the address we're looking for.  We need to keep track of this
14849 	 * too because we want to return info about that region if the
14850 	 * address we're looking for is not mapped.
14851 	 */
14852 	vm_map_entry_t                  next_entry;
14853 	vm_map_offset_t                 next_offset;
14854 	vm_map_offset_t                 next_address;
14855 	vm_map_t                        next_map;
14856 	unsigned int                    next_depth;
14857 	vm_map_offset_t                 next_max_below, next_max_above;
14858 	vm_map_offset_t                 next_skip;
14859 
14860 	boolean_t                       look_for_pages;
14861 	vm_region_submap_short_info_64_t short_info;
14862 	boolean_t                       do_region_footprint;
14863 	int                             effective_page_size, effective_page_shift;
14864 	boolean_t                       submap_needed_copy;
14865 
14866 	if (map == VM_MAP_NULL) {
14867 		/* no address space to work on */
14868 		return KERN_INVALID_ARGUMENT;
14869 	}
14870 
14871 	user_address = vm_sanitize_addr(map, *address_u);
14872 
14873 	effective_page_shift = vm_self_region_page_shift(map);
14874 	effective_page_size = (1 << effective_page_shift);
14875 
14876 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14877 		/*
14878 		 * "info" structure is not big enough and
14879 		 * would overflow
14880 		 */
14881 		return KERN_INVALID_ARGUMENT;
14882 	}
14883 
14884 	do_region_footprint = task_self_region_footprint();
14885 	original_count = *count;
14886 
14887 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14888 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14889 		look_for_pages = FALSE;
14890 		short_info = (vm_region_submap_short_info_64_t) submap_info;
14891 		submap_info = NULL;
14892 	} else {
14893 		look_for_pages = TRUE;
14894 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14895 		short_info = NULL;
14896 
14897 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14898 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14899 		}
14900 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14901 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14902 		}
14903 	}
14904 
14905 	user_max_depth = *nesting_depth;
14906 	submap_needed_copy = FALSE;
14907 
14908 	if (not_in_kdp) {
14909 		vm_map_lock_read(map);
14910 	}
14911 
14912 recurse_again:
14913 	curr_entry = NULL;
14914 	curr_map = map;
14915 	curr_address = user_address;
14916 	curr_offset = 0;
14917 	curr_skip = 0;
14918 	curr_depth = 0;
14919 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14920 	curr_max_below = curr_address;
14921 
14922 	next_entry = NULL;
14923 	next_map = NULL;
14924 	next_address = 0;
14925 	next_offset = 0;
14926 	next_skip = 0;
14927 	next_depth = 0;
14928 	next_max_above = (vm_map_offset_t) -1;
14929 	next_max_below = (vm_map_offset_t) -1;
14930 
14931 	for (;;) {
14932 		if (vm_map_lookup_entry(curr_map,
14933 		    curr_address,
14934 		    &tmp_entry)) {
14935 			/* tmp_entry contains the address we're looking for */
14936 			curr_entry = tmp_entry;
14937 		} else {
14938 			vm_map_offset_t skip;
14939 			/*
14940 			 * The address is not mapped.  "tmp_entry" is the
14941 			 * map entry preceding the address.  We want the next
14942 			 * one, if it exists.
14943 			 */
14944 			curr_entry = tmp_entry->vme_next;
14945 
14946 			if (curr_entry == vm_map_to_entry(curr_map) ||
14947 			    (curr_entry->vme_start >=
14948 			    curr_address + curr_max_above)) {
14949 				/* no next entry at this level: stop looking */
14950 				if (not_in_kdp) {
14951 					vm_map_unlock_read(curr_map);
14952 				}
14953 				curr_entry = NULL;
14954 				curr_map = NULL;
14955 				curr_skip = 0;
14956 				curr_offset = 0;
14957 				curr_depth = 0;
14958 				curr_max_above = 0;
14959 				curr_max_below = 0;
14960 				break;
14961 			}
14962 
14963 			/* adjust current address and offset */
14964 			skip = curr_entry->vme_start - curr_address;
14965 			curr_address = curr_entry->vme_start;
14966 			curr_skip += skip;
14967 			curr_offset += skip;
14968 			curr_max_above -= skip;
14969 			curr_max_below = 0;
14970 		}
14971 
14972 		/*
14973 		 * Is the next entry at this level closer to the address (or
14974 		 * deeper in the submap chain) than the one we had
14975 		 * so far ?
14976 		 */
14977 		tmp_entry = curr_entry->vme_next;
14978 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14979 			/* no next entry at this level */
14980 		} else if (tmp_entry->vme_start >=
14981 		    curr_address + curr_max_above) {
14982 			/*
14983 			 * tmp_entry is beyond the scope of what we mapped of
14984 			 * this submap in the upper level: ignore it.
14985 			 */
14986 		} else if ((next_entry == NULL) ||
14987 		    (tmp_entry->vme_start + curr_offset <=
14988 		    next_entry->vme_start + next_offset)) {
14989 			/*
14990 			 * We didn't have a "next_entry" or this one is
14991 			 * closer to the address we're looking for:
14992 			 * use this "tmp_entry" as the new "next_entry".
14993 			 */
14994 			if (next_entry != NULL) {
14995 				/* unlock the last "next_map" */
14996 				if (next_map != curr_map && not_in_kdp) {
14997 					vm_map_unlock_read(next_map);
14998 				}
14999 			}
15000 			next_entry = tmp_entry;
15001 			next_map = curr_map;
15002 			next_depth = curr_depth;
15003 			next_address = next_entry->vme_start;
15004 			next_skip = curr_skip;
15005 			next_skip += (next_address - curr_address);
15006 			next_offset = curr_offset;
15007 			next_offset += (next_address - curr_address);
15008 			next_max_above = MIN(next_max_above, curr_max_above);
15009 			next_max_above = MIN(next_max_above,
15010 			    next_entry->vme_end - next_address);
15011 			next_max_below = MIN(next_max_below, curr_max_below);
15012 			next_max_below = MIN(next_max_below,
15013 			    next_address - next_entry->vme_start);
15014 		}
15015 
15016 		/*
15017 		 * "curr_max_{above,below}" allow us to keep track of the
15018 		 * portion of the submap that is actually mapped at this level:
15019 		 * the rest of that submap is irrelevant to us, since it's not
15020 		 * mapped here.
15021 		 * The relevant portion of the map starts at
15022 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15023 		 */
15024 		curr_max_above = MIN(curr_max_above,
15025 		    curr_entry->vme_end - curr_address);
15026 		curr_max_below = MIN(curr_max_below,
15027 		    curr_address - curr_entry->vme_start);
15028 
15029 		if (!curr_entry->is_sub_map ||
15030 		    curr_depth >= user_max_depth) {
15031 			/*
15032 			 * We hit a leaf map or we reached the maximum depth
15033 			 * we could, so stop looking.  Keep the current map
15034 			 * locked.
15035 			 */
15036 			break;
15037 		}
15038 
15039 		/*
15040 		 * Get down to the next submap level.
15041 		 */
15042 
15043 		if (curr_entry->needs_copy) {
15044 			/* everything below this is effectively copy-on-write */
15045 			submap_needed_copy = TRUE;
15046 		}
15047 
15048 		/*
15049 		 * Lock the next level and unlock the current level,
15050 		 * unless we need to keep it locked to access the "next_entry"
15051 		 * later.
15052 		 */
15053 		if (not_in_kdp) {
15054 			vm_map_lock_read(VME_SUBMAP(curr_entry));
15055 		}
15056 		if (curr_map == next_map) {
15057 			/* keep "next_map" locked in case we need it */
15058 		} else {
15059 			/* release this map */
15060 			if (not_in_kdp) {
15061 				vm_map_unlock_read(curr_map);
15062 			}
15063 		}
15064 
15065 		/*
15066 		 * Adjust the offset.  "curr_entry" maps the submap
15067 		 * at relative address "curr_entry->vme_start" in the
15068 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
15069 		 * bytes of the submap.
15070 		 * "curr_offset" always represents the offset of a virtual
15071 		 * address in the curr_map relative to the absolute address
15072 		 * space (i.e. the top-level VM map).
15073 		 */
15074 		curr_offset +=
15075 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
15076 		curr_address = user_address + curr_offset;
15077 		/* switch to the submap */
15078 		curr_map = VME_SUBMAP(curr_entry);
15079 		curr_depth++;
15080 		curr_entry = NULL;
15081 	}
15082 
15083 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
15084 // so probably should be a real 32b ID vs. ptr.
15085 // Current users just check for equality
15086 
15087 	if (curr_entry == NULL) {
15088 		/* no VM region contains the address... */
15089 
15090 		if (do_region_footprint && /* we want footprint numbers */
15091 		    next_entry == NULL && /* & there are no more regions */
15092 		    /* & we haven't already provided our fake region: */
15093 		    user_address <= vm_map_last_entry(map)->vme_end) {
15094 			ledger_amount_t ledger_resident, ledger_compressed;
15095 
15096 			/*
15097 			 * Add a fake memory region to account for
15098 			 * purgeable and/or ledger-tagged memory that
15099 			 * counts towards this task's memory footprint,
15100 			 * i.e. the resident/compressed pages of non-volatile
15101 			 * objects owned by that task.
15102 			 */
15103 			task_ledgers_footprint(map->pmap->ledger,
15104 			    &ledger_resident,
15105 			    &ledger_compressed);
15106 			if (ledger_resident + ledger_compressed == 0) {
15107 				/* no purgeable memory usage to report */
15108 				return KERN_INVALID_ADDRESS;
15109 			}
15110 			/* fake region to show nonvolatile footprint */
15111 			if (look_for_pages) {
15112 				submap_info->protection = VM_PROT_DEFAULT;
15113 				submap_info->max_protection = VM_PROT_DEFAULT;
15114 				submap_info->inheritance = VM_INHERIT_DEFAULT;
15115 				submap_info->offset = 0;
15116 				submap_info->user_tag = -1;
15117 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15118 				submap_info->pages_shared_now_private = 0;
15119 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15120 				submap_info->pages_dirtied = submap_info->pages_resident;
15121 				submap_info->ref_count = 1;
15122 				submap_info->shadow_depth = 0;
15123 				submap_info->external_pager = 0;
15124 				submap_info->share_mode = SM_PRIVATE;
15125 				if (submap_needed_copy) {
15126 					submap_info->share_mode = SM_COW;
15127 				}
15128 				submap_info->is_submap = 0;
15129 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15130 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15131 				submap_info->user_wired_count = 0;
15132 				submap_info->pages_reusable = 0;
15133 			} else {
15134 				short_info->user_tag = -1;
15135 				short_info->offset = 0;
15136 				short_info->protection = VM_PROT_DEFAULT;
15137 				short_info->inheritance = VM_INHERIT_DEFAULT;
15138 				short_info->max_protection = VM_PROT_DEFAULT;
15139 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
15140 				short_info->user_wired_count = 0;
15141 				short_info->is_submap = 0;
15142 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15143 				short_info->external_pager = 0;
15144 				short_info->shadow_depth = 0;
15145 				short_info->share_mode = SM_PRIVATE;
15146 				if (submap_needed_copy) {
15147 					short_info->share_mode = SM_COW;
15148 				}
15149 				short_info->ref_count = 1;
15150 			}
15151 			*nesting_depth = 0;
15152 			*address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15153 			*size_u    = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15154 			return KERN_SUCCESS;
15155 		}
15156 
15157 		if (next_entry == NULL) {
15158 			/* ... and no VM region follows it either */
15159 			return KERN_INVALID_ADDRESS;
15160 		}
15161 		/* ... gather info about the next VM region */
15162 		curr_entry = next_entry;
15163 		curr_map = next_map;    /* still locked ... */
15164 		curr_address = next_address;
15165 		curr_skip = next_skip;
15166 		curr_offset = next_offset;
15167 		curr_depth = next_depth;
15168 		curr_max_above = next_max_above;
15169 		curr_max_below = next_max_below;
15170 	} else {
15171 		/* we won't need "next_entry" after all */
15172 		if (next_entry != NULL) {
15173 			/* release "next_map" */
15174 			if (next_map != curr_map && not_in_kdp) {
15175 				vm_map_unlock_read(next_map);
15176 			}
15177 		}
15178 	}
15179 	next_entry = NULL;
15180 	next_map = NULL;
15181 	next_offset = 0;
15182 	next_skip = 0;
15183 	next_depth = 0;
15184 	next_max_below = -1;
15185 	next_max_above = -1;
15186 
15187 	if (curr_entry->is_sub_map &&
15188 	    curr_depth < user_max_depth) {
15189 		/*
15190 		 * We're not as deep as we could be:  we must have
15191 		 * gone back up after not finding anything mapped
15192 		 * below the original top-level map entry's.
15193 		 * Let's move "curr_address" forward and recurse again.
15194 		 */
15195 		user_address = curr_address;
15196 		goto recurse_again;
15197 	}
15198 
15199 	*nesting_depth = curr_depth;
15200 	*address_u = vm_sanitize_wrap_addr(
15201 		user_address + curr_skip - curr_max_below);
15202 	*size_u    = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15203 
15204 	if (look_for_pages) {
15205 		submap_info->user_tag = VME_ALIAS(curr_entry);
15206 		submap_info->offset = VME_OFFSET(curr_entry);
15207 		submap_info->protection = curr_entry->protection;
15208 		submap_info->inheritance = curr_entry->inheritance;
15209 		submap_info->max_protection = curr_entry->max_protection;
15210 		submap_info->behavior = curr_entry->behavior;
15211 		submap_info->user_wired_count = curr_entry->user_wired_count;
15212 		submap_info->is_submap = curr_entry->is_sub_map;
15213 		if (curr_entry->is_sub_map) {
15214 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15215 		} else {
15216 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15217 		}
15218 	} else {
15219 		short_info->user_tag = VME_ALIAS(curr_entry);
15220 		short_info->offset = VME_OFFSET(curr_entry);
15221 		short_info->protection = curr_entry->protection;
15222 		short_info->inheritance = curr_entry->inheritance;
15223 		short_info->max_protection = curr_entry->max_protection;
15224 		short_info->behavior = curr_entry->behavior;
15225 		short_info->user_wired_count = curr_entry->user_wired_count;
15226 		short_info->is_submap = curr_entry->is_sub_map;
15227 		if (curr_entry->is_sub_map) {
15228 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15229 		} else {
15230 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15231 		}
15232 	}
15233 
15234 	extended.pages_resident = 0;
15235 	extended.pages_swapped_out = 0;
15236 	extended.pages_shared_now_private = 0;
15237 	extended.pages_dirtied = 0;
15238 	extended.pages_reusable = 0;
15239 	extended.external_pager = 0;
15240 	extended.shadow_depth = 0;
15241 	extended.share_mode = SM_EMPTY;
15242 	extended.ref_count = 0;
15243 
15244 	if (not_in_kdp) {
15245 		if (!curr_entry->is_sub_map) {
15246 			vm_map_offset_t range_start, range_end;
15247 			range_start = MAX((curr_address - curr_max_below),
15248 			    curr_entry->vme_start);
15249 			range_end = MIN((curr_address + curr_max_above),
15250 			    curr_entry->vme_end);
15251 			vm_map_region_walk(curr_map,
15252 			    range_start,
15253 			    curr_entry,
15254 			    (VME_OFFSET(curr_entry) +
15255 			    (range_start -
15256 			    curr_entry->vme_start)),
15257 			    range_end - range_start,
15258 			    &extended,
15259 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15260 			if (submap_needed_copy) {
15261 				extended.share_mode = SM_COW;
15262 			}
15263 		} else {
15264 			if (curr_entry->use_pmap) {
15265 				extended.share_mode = SM_TRUESHARED;
15266 			} else {
15267 				extended.share_mode = SM_PRIVATE;
15268 			}
15269 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15270 		}
15271 	}
15272 
15273 	if (look_for_pages) {
15274 		submap_info->pages_resident = extended.pages_resident;
15275 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15276 		submap_info->pages_shared_now_private =
15277 		    extended.pages_shared_now_private;
15278 		submap_info->pages_dirtied = extended.pages_dirtied;
15279 		submap_info->external_pager = extended.external_pager;
15280 		submap_info->shadow_depth = extended.shadow_depth;
15281 		submap_info->share_mode = extended.share_mode;
15282 		submap_info->ref_count = extended.ref_count;
15283 
15284 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15285 			submap_info->pages_reusable = extended.pages_reusable;
15286 		}
15287 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15288 			if (curr_entry->is_sub_map) {
15289 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15290 			} else if (VME_OBJECT(curr_entry)) {
15291 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15292 			} else {
15293 				submap_info->object_id_full = 0ull;
15294 			}
15295 		}
15296 	} else {
15297 		short_info->external_pager = extended.external_pager;
15298 		short_info->shadow_depth = extended.shadow_depth;
15299 		short_info->share_mode = extended.share_mode;
15300 		short_info->ref_count = extended.ref_count;
15301 	}
15302 
15303 	if (not_in_kdp) {
15304 		vm_map_unlock_read(curr_map);
15305 	}
15306 
15307 	return KERN_SUCCESS;
15308 }
15309 
15310 /*
15311  *	vm_region:
15312  *
15313  *	User call to obtain information about a region in
15314  *	a task's address map.  Several flavors are supported:
15315  *	basic (32-bit and 64-bit), extended, and top info.
15316  *
15317  *	XXX The reserved and behavior fields cannot be filled
15318  *	    in until the vm merge from the IK is completed, and
15319  *	    vm_reserve is implemented.
15320  */
15321 
15322 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_ut * address_u,vm_map_size_ut * size_u,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)15323 vm_map_region(
15324 	vm_map_t                map,
15325 	vm_map_offset_ut       *address_u,      /* IN/OUT */
15326 	vm_map_size_ut         *size_u,         /* OUT */
15327 	vm_region_flavor_t      flavor,         /* IN */
15328 	vm_region_info_t        info,           /* OUT */
15329 	mach_msg_type_number_t *count,          /* IN/OUT */
15330 	mach_port_t            *object_name)    /* OUT */
15331 {
15332 	vm_map_entry_t          tmp_entry;
15333 	vm_map_entry_t          entry;
15334 	vm_map_offset_t         start;
15335 
15336 	if (map == VM_MAP_NULL) {
15337 		/* no address space to work on */
15338 	}
15336 	if (map == VM_MAP_NULL) {
15337 		return KERN_INVALID_ARGUMENT;
15338 	}
15339 
	/* sanitized in-kernel form of the caller-supplied lookup address */
15340 	start = vm_sanitize_addr(map, *address_u);
15341 
15342 	switch (flavor) {
15343 	case VM_REGION_BASIC_INFO:
15344 		/* legacy for old 32-bit objects info */
15345 	{
15346 		vm_region_basic_info_t  basic;
15347 
		/* caller's buffer must be big enough for this flavor */
15348 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
15349 			return KERN_INVALID_ARGUMENT;
15350 		}
15351 
15352 		basic = (vm_region_basic_info_t) info;
15353 		*count = VM_REGION_BASIC_INFO_COUNT;
15354 
15355 		vm_map_lock_read(map);
15356 
15357 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* address unmapped: report the next region, if any */
15358 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15359 				vm_map_unlock_read(map);
15360 				return KERN_INVALID_ADDRESS;
15361 			}
15362 		} else {
15363 			entry = tmp_entry;
15364 		}
15365 
		/* report the region from its actual start address */
15366 		start = entry->vme_start;
15367 
		/* 32-bit flavor: offset is truncated to 32 bits */
15368 		basic->offset = (uint32_t)VME_OFFSET(entry);
15369 		basic->protection = entry->protection;
15370 		basic->inheritance = entry->inheritance;
15371 		basic->max_protection = entry->max_protection;
15372 		basic->behavior = entry->behavior;
15373 		basic->user_wired_count = entry->user_wired_count;
		/* the "reserved" field is used to report submap-ness */
15374 		basic->reserved = entry->is_sub_map;
15375 
15376 		*address_u = vm_sanitize_wrap_addr(start);
15377 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15378 
15379 		if (object_name) {
15380 			*object_name = IP_NULL;
15381 		}
15382 		if (entry->is_sub_map) {
15383 			basic->shared = FALSE;
15384 		} else {
15385 			basic->shared = entry->is_shared;
15386 		}
15387 
15388 		vm_map_unlock_read(map);
15389 		return KERN_SUCCESS;
15390 	}
15391 
15392 	case VM_REGION_BASIC_INFO_64:
15393 	{
15394 		vm_region_basic_info_64_t       basic;
15395 
15396 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15397 			return KERN_INVALID_ARGUMENT;
15398 		}
15399 
15400 		basic = (vm_region_basic_info_64_t) info;
15401 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15402 
15403 		vm_map_lock_read(map);
15404 
15405 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* address unmapped: report the next region, if any */
15406 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15407 				vm_map_unlock_read(map);
15408 				return KERN_INVALID_ADDRESS;
15409 			}
15410 		} else {
15411 			entry = tmp_entry;
15412 		}
15413 
15414 		start = entry->vme_start;
15415 
		/* 64-bit flavor: full-width object offset */
15416 		basic->offset = VME_OFFSET(entry);
15417 		basic->protection = entry->protection;
15418 		basic->inheritance = entry->inheritance;
15419 		basic->max_protection = entry->max_protection;
15420 		basic->behavior = entry->behavior;
15421 		basic->user_wired_count = entry->user_wired_count;
		/* the "reserved" field is used to report submap-ness */
15422 		basic->reserved = entry->is_sub_map;
15423 
15424 		*address_u = vm_sanitize_wrap_addr(start);
15425 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15426 
15427 		if (object_name) {
15428 			*object_name = IP_NULL;
15429 		}
15430 		if (entry->is_sub_map) {
15431 			basic->shared = FALSE;
15432 		} else {
15433 			basic->shared = entry->is_shared;
15434 		}
15435 
15436 		vm_map_unlock_read(map);
15437 		return KERN_SUCCESS;
15438 	}
15439 	case VM_REGION_EXTENDED_INFO:
15440 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15441 			return KERN_INVALID_ARGUMENT;
15442 		}
		/* modern flavor validated; shares the legacy code path below */
15443 		OS_FALLTHROUGH;
15444 	case VM_REGION_EXTENDED_INFO__legacy:
15445 	{
15446 		vm_region_extended_info_t       extended;
15447 		mach_msg_type_number_t original_count;
15448 		int effective_page_size, effective_page_shift;
15449 
		/* the legacy struct is a prefix of the modern one */
15450 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15451 			return KERN_INVALID_ARGUMENT;
15452 		}
15453 
15454 		extended = (vm_region_extended_info_t) info;
15455 
15456 		effective_page_shift = vm_self_region_page_shift(map);
15457 		effective_page_size = (1 << effective_page_shift);
15458 
15459 		vm_map_lock_read(map);
15460 
15461 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* address unmapped: report the next region, if any */
15462 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15463 				vm_map_unlock_read(map);
15464 				return KERN_INVALID_ADDRESS;
15465 			}
15466 		} else {
15467 			entry = tmp_entry;
15468 		}
15469 		start = entry->vme_start;
15470 
15471 		extended->protection = entry->protection;
15472 		extended->user_tag = VME_ALIAS(entry);
15473 		extended->pages_resident = 0;
15474 		extended->pages_swapped_out = 0;
15475 		extended->pages_shared_now_private = 0;
15476 		extended->pages_dirtied = 0;
15477 		extended->external_pager = 0;
15478 		extended->shadow_depth = 0;
15479 
15480 		original_count = *count;
15481 		if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15482 			*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15483 		} else {
			/* "pages_reusable" only exists in the non-legacy struct */
15484 			extended->pages_reusable = 0;
15485 			*count = VM_REGION_EXTENDED_INFO_COUNT;
15486 		}
15487 
		/* walk the entry's object chain to fill in the page counters */
15488 		vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15489 
15490 		if (object_name) {
15491 			*object_name = IP_NULL;
15492 		}
15493 
15494 		*address_u = vm_sanitize_wrap_addr(start);
15495 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15496 
15497 		vm_map_unlock_read(map);
15498 		return KERN_SUCCESS;
15499 	}
15500 	case VM_REGION_TOP_INFO:
15501 	{
15502 		vm_region_top_info_t    top;
15503 
15504 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15505 			return KERN_INVALID_ARGUMENT;
15506 		}
15507 
15508 		top = (vm_region_top_info_t) info;
15509 		*count = VM_REGION_TOP_INFO_COUNT;
15510 
15511 		vm_map_lock_read(map);
15512 
15513 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* address unmapped: report the next region, if any */
15514 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15515 				vm_map_unlock_read(map);
15516 				return KERN_INVALID_ADDRESS;
15517 			}
15518 		} else {
15519 			entry = tmp_entry;
15520 		}
15521 		start = entry->vme_start;
15522 
15523 		top->private_pages_resident = 0;
15524 		top->shared_pages_resident = 0;
15525 
		/* classify share mode and private/shared resident pages */
15526 		vm_map_region_top_walk(entry, top);
15527 
15528 		if (object_name) {
15529 			*object_name = IP_NULL;
15530 		}
15531 
15532 		*address_u = vm_sanitize_wrap_addr(start);
15533 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15534 
15535 		vm_map_unlock_read(map);
15536 		return KERN_SUCCESS;
15537 	}
15538 	default:
15539 		return KERN_INVALID_ARGUMENT;
15540 	}
15541 }
15542 
/*
 * OBJ_RESIDENT_COUNT(obj, entry_size):
 * Number of resident pages of "obj" attributable to a mapping of
 * "entry_size" pages: resident minus reusable pages (or only the wired
 * pages when the whole object is marked all_reusable), clamped so a
 * mapping can never be charged more pages than its own size.
 */
15543 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
15544 	MIN((entry_size),                                               \
15545 	    ((obj)->all_reusable ?                                      \
15546 	     (obj)->wired_page_count :                                  \
15547 	     (obj)->resident_page_count - (obj)->reusable_page_count))
15548 
/*
 * vm_map_region_top_walk:
 *	Fill in a VM_REGION_TOP_INFO structure for "entry": classify the
 *	mapping's share mode and split its resident page count into
 *	private vs. shared by examining the entry's VM object and its
 *	shadow chain.  Takes and drops the VM object locks itself;
 *	assumes the caller keeps "entry" stable (e.g. map lock held).
 */
15549 void
vm_map_region_top_walk(vm_map_entry_t entry,vm_region_top_info_t top)15550 vm_map_region_top_walk(
15551 	vm_map_entry_t             entry,
15552 	vm_region_top_info_t       top)
15553 {
	/* submaps and unbacked entries: nothing to report */
15554 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15555 		top->share_mode = SM_EMPTY;
15556 		top->ref_count = 0;
15557 		top->obj_id = 0;
15558 		return;
15559 	}
15560 
15561 	{
15562 		struct  vm_object *obj, *tmp_obj;
15563 		int             ref_count;
15564 		uint32_t        entry_size;
15565 
		/* size of the mapping, in pages */
15566 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15567 
15568 		obj = VME_OBJECT(entry);
15569 
15570 		vm_object_lock(obj);
15571 
		/*
		 * NOTE(review): when paging is in progress, one reference is
		 * presumably held by the paging activity and is excluded from
		 * the count reported to userspace -- confirm against
		 * vm_object reference semantics.
		 */
15572 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15573 		    obj->paging_in_progress) {
15574 			ref_count--;
15575 		}
15576 
15577 		assert(obj->reusable_page_count <= obj->resident_page_count);
15578 		if (obj->shadow) {
			/* shadowed object => copy-on-write mapping */
15579 			if (ref_count == 1) {
15580 				top->private_pages_resident =
15581 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15582 			} else {
15583 				top->shared_pages_resident =
15584 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15585 			}
15586 			top->ref_count  = ref_count;
15587 			top->share_mode = SM_COW;
15588 
			/*
			 * Walk down the shadow chain with hand-over-hand
			 * locking; pages resident in shadow objects are
			 * counted as shared.
			 */
15589 			while ((tmp_obj = obj->shadow)) {
15590 				vm_object_lock(tmp_obj);
15591 				vm_object_unlock(obj);
15592 				obj = tmp_obj;
15593 
15594 				if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15595 				    obj->paging_in_progress) {
15596 					ref_count--;
15597 				}
15598 
15599 				assert(obj->reusable_page_count <= obj->resident_page_count);
15600 				top->shared_pages_resident +=
15601 				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* don't count the shadow reference itself */
15602 				top->ref_count += ref_count - 1;
15603 			}
15604 		} else {
15605 			if (entry->superpage_size) {
15606 				top->share_mode = SM_LARGE_PAGE;
15607 				top->shared_pages_resident = 0;
15608 				top->private_pages_resident = entry_size;
15609 			} else if (entry->needs_copy) {
15610 				top->share_mode = SM_COW;
15611 				top->shared_pages_resident =
15612 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15613 			} else {
				/*
				 * A named object holds one extra reference,
				 * so ref_count == 2 can still be private.
				 */
15614 				if (ref_count == 1 ||
15615 				    (ref_count == 2 && obj->named)) {
15616 					top->share_mode = SM_PRIVATE;
15617 					top->private_pages_resident =
15618 					    OBJ_RESIDENT_COUNT(obj,
15619 					    entry_size);
15620 				} else {
15621 					top->share_mode = SM_SHARED;
15622 					top->shared_pages_resident =
15623 					    OBJ_RESIDENT_COUNT(obj,
15624 					    entry_size);
15625 				}
15626 			}
15627 			top->ref_count = ref_count;
15628 		}
15629 
15630 		vm_object_unlock(obj);
15631 
15632 		/* XXX K64: obj_id will be truncated */
15633 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15634 	}
15635 }
15636 
/*
 * vm_map_region_walk:
 *	Gather VM_REGION_EXTENDED_INFO statistics for the portion of
 *	"entry" described by [offset, offset + range) into "extended".
 *	When "look_for_pages" is TRUE, every page in the range is
 *	examined (via the object chain, or via pmap/corpse-footprint
 *	data when the caller wants footprint accounting); otherwise only
 *	object-level info (external pager, shadow depth, share mode,
 *	ref counts) is collected.  "count" selects which fields of the
 *	extended-info structure exist (legacy vs. current layout).
 *	Caller holds the map lock; VM object locks are taken here.
 */
15637 void
vm_map_region_walk(vm_map_t map,vm_map_offset_t va,vm_map_entry_t entry,vm_object_offset_t offset,vm_object_size_t range,vm_region_extended_info_t extended,boolean_t look_for_pages,mach_msg_type_number_t count)15638 vm_map_region_walk(
15639 	vm_map_t                        map,
15640 	vm_map_offset_t                 va,
15641 	vm_map_entry_t                  entry,
15642 	vm_object_offset_t              offset,
15643 	vm_object_size_t                range,
15644 	vm_region_extended_info_t       extended,
15645 	boolean_t                       look_for_pages,
15646 	mach_msg_type_number_t count)
15647 {
15648 	struct vm_object *obj, *tmp_obj;
15649 	vm_map_offset_t       last_offset;
15650 	int               i;
15651 	int               ref_count;
15652 	struct vm_object        *shadow_object;
15653 	unsigned short          shadow_depth;
15654 	boolean_t         do_region_footprint;
15655 	int                     effective_page_size, effective_page_shift;
15656 	vm_map_offset_t         effective_page_mask;
15657 
15658 	do_region_footprint = task_self_region_footprint();
15659 
	/*
	 * Nothing to walk for submaps, unbacked entries, or
	 * physically-contiguous objects (unless mapped as a superpage).
	 */
15660 	if ((entry->is_sub_map) ||
15661 	    (VME_OBJECT(entry) == 0) ||
15662 	    (VME_OBJECT(entry)->phys_contiguous &&
15663 	    !entry->superpage_size)) {
15664 		extended->share_mode = SM_EMPTY;
15665 		extended->ref_count = 0;
15666 		return;
15667 	}
15668 
	/* superpage mappings are reported as fully resident large pages */
15669 	if (entry->superpage_size) {
15670 		extended->shadow_depth = 0;
15671 		extended->share_mode = SM_LARGE_PAGE;
15672 		extended->ref_count = 1;
15673 		extended->external_pager = 0;
15674 
15675 		/* TODO4K: Superpage in 4k mode? */
15676 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15677 		extended->shadow_depth = 0;
15678 		return;
15679 	}
15680 
15681 	effective_page_shift = vm_self_region_page_shift(map);
15682 	effective_page_size = (1 << effective_page_shift);
15683 	effective_page_mask = effective_page_size - 1;
15684 
15685 	offset = vm_map_trunc_page(offset, effective_page_mask);
15686 
15687 	obj = VME_OBJECT(entry);
15688 
15689 	vm_object_lock(obj);
15690 
	/*
	 * NOTE(review): as elsewhere in this file, a reference is
	 * presumably held by in-progress paging and excluded from the
	 * reported count -- confirm against vm_object semantics.
	 */
15691 	if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15692 	    obj->paging_in_progress) {
15693 		ref_count--;
15694 	}
15695 
15696 	if (look_for_pages) {
		/* examine each page of the requested range */
15697 		for (last_offset = offset + range;
15698 		    offset < last_offset;
15699 		    offset += effective_page_size, va += effective_page_size) {
15700 			if (do_region_footprint) {
15701 				int disp;
15702 
15703 				disp = 0;
15704 				if (map->has_corpse_footprint) {
15705 					/*
15706 					 * Query the page info data we saved
15707 					 * while forking the corpse.
15708 					 */
15709 					vm_map_corpse_footprint_query_page_info(
15710 						map,
15711 						va,
15712 						&disp);
15713 				} else {
15714 					/*
15715 					 * Query the pmap.
15716 					 */
15717 					vm_map_footprint_query_page_info(
15718 						map,
15719 						entry,
15720 						va,
15721 						&disp);
15722 				}
				/* translate disposition bits into counters */
15723 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15724 					extended->pages_resident++;
15725 				}
15726 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15727 					extended->pages_reusable++;
15728 				}
15729 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15730 					extended->pages_dirtied++;
15731 				}
15732 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15733 					extended->pages_swapped_out++;
15734 				}
15735 				continue;
15736 			}
15737 
			/* non-footprint path: search the object shadow chain */
15738 			vm_map_region_look_for_page(map, va, obj,
15739 			    vm_object_trunc_page(offset), ref_count,
15740 			    0, extended, count);
15741 		}
15742 
		/*
		 * The footprint path skipped the per-page object walk, so
		 * still collect the object-level info below.
		 */
15743 		if (do_region_footprint) {
15744 			goto collect_object_info;
15745 		}
15746 	} else {
15747 collect_object_info:
		/* measure shadow-chain depth and detect external pagers */
15748 		shadow_object = obj->shadow;
15749 		shadow_depth = 0;
15750 
15751 		if (!(obj->internal)) {
15752 			extended->external_pager = 1;
15753 		}
15754 
15755 		if (shadow_object != VM_OBJECT_NULL) {
15756 			vm_object_lock(shadow_object);
15757 			for (;
15758 			    shadow_object != VM_OBJECT_NULL;
15759 			    shadow_depth++) {
15760 				vm_object_t     next_shadow;
15761 
15762 				if (!(shadow_object->internal)) {
15763 					extended->external_pager = 1;
15764 				}
15765 
				/* hand-over-hand locking down the chain */
15766 				next_shadow = shadow_object->shadow;
15767 				if (next_shadow) {
15768 					vm_object_lock(next_shadow);
15769 				}
15770 				vm_object_unlock(shadow_object);
15771 				shadow_object = next_shadow;
15772 			}
15773 		}
15774 		extended->shadow_depth = shadow_depth;
15775 	}
15776 
	/* classify the share mode of the mapping */
15777 	if (extended->shadow_depth || entry->needs_copy) {
15778 		extended->share_mode = SM_COW;
15779 	} else {
15780 		if (ref_count == 1) {
15781 			extended->share_mode = SM_PRIVATE;
15782 		} else {
15783 			if (obj->true_share) {
15784 				extended->share_mode = SM_TRUESHARED;
15785 			} else {
15786 				extended->share_mode = SM_SHARED;
15787 			}
15788 		}
15789 	}
15790 	extended->ref_count = ref_count - extended->shadow_depth;
15791 
	/* accumulate the ref counts of each object in the shadow chain */
15792 	for (i = 0; i < extended->shadow_depth; i++) {
15793 		if ((tmp_obj = obj->shadow) == 0) {
15794 			break;
15795 		}
15796 		vm_object_lock(tmp_obj);
15797 		vm_object_unlock(obj);
15798 
15799 		if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
15800 		    tmp_obj->paging_in_progress) {
15801 			ref_count--;
15802 		}
15803 
15804 		extended->ref_count += ref_count;
15805 		obj = tmp_obj;
15806 	}
15807 	vm_object_unlock(obj);
15808 
15809 	if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15810 		extended->share_mode = SM_PRIVATE;
15811 	} else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
		/*
		 * See if the "shared" object is in fact only referenced by
		 * entries of this very map: if all of its references come
		 * from here, report it as privately aliased.
		 */
15812 		vm_map_entry_t       cur;
15813 		vm_map_entry_t       last;
15814 		int      my_refs;
15815 
15816 		obj = VME_OBJECT(entry);
15817 		last = vm_map_to_entry(map);
15818 		my_refs = 0;
15819 
15820 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15821 		    obj->paging_in_progress) {
15822 			ref_count--;
15823 		}
		/* count this map's own references to the object */
15824 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15825 			if (vm_map_region_has_obj_ref(cur, obj)) {
15826 				my_refs++;
15827 			}
15828 		}
15829 
15830 		if (my_refs == ref_count) {
15831 			extended->share_mode = SM_PRIVATE_ALIASED;
15832 		} else if (my_refs > 1) {
15833 			extended->share_mode = SM_SHARED_ALIASED;
15834 		}
15835 	}
15836 }
15837 
15838 
15839 /* object is locked on entry and locked on return */
15840 
15841 
/*
 * vm_map_region_look_for_page:
 *
 * Account for the page at ("object", "offset") in the counters of
 * "extended" (pages_resident / pages_dirtied / pages_reusable /
 * pages_swapped_out), descending the shadow chain until the page is
 * found or the chain is exhausted.
 *
 * "object" is locked on entry and remains locked on return; only the
 * intermediate shadow objects are locked/unlocked hand-over-hand as
 * the chain is traversed.
 *
 * "max_refcnt" is the largest object reference count the caller has
 * seen so far along this chain; "depth" is the shadow depth at which
 * this lookup starts; "count" is the size of the caller's info
 * structure, used to gate newer fields (pages_reusable).
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;  /* the object our caller locked; never unlocked here */

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			/* some object in the chain is backed by an external pager */
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * NOTE(review): counted when the chain is shadowed but
			 * unshared (max ref count 1) — presumably pages that a
			 * copy-on-write fault would turn private; confirm.
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* caller's struct is new enough to carry pages_reusable */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			/* drop the shadow-object lock we took; keep the caller's lock */
			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (vm_object_compressor_pager_state_get(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* page not found here: descend, locking hand-over-hand */
			vm_object_lock(shadow);
			if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
			    shadow->paging_in_progress) {
				/* don't count the paging-in-progress reference */
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* translate the lookup offset into the shadow object's space */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		/* end of the shadow chain: the page is resident nowhere */
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15934 
/*
 * vm_map_region_has_obj_ref:
 *
 * Return TRUE if "entry" references "object", either as its mapped
 * VM object or somewhere down that object's shadow chain.  Used when
 * computing region share modes to count how many entries of a map
 * reference the same object.
 */
static inline boolean_t
vm_map_region_has_obj_ref(
	vm_map_entry_t    entry,
	vm_object_t       object)
{
	vm_object_t cur_obj;
	vm_object_t shadow_obj;

	/* submap entries have no VM object of their own */
	if (entry->is_sub_map) {
		return FALSE;
	}

	cur_obj = VME_OBJECT(entry);
	if (cur_obj == VM_OBJECT_NULL) {
		return FALSE;
	} else if (cur_obj == object) {
		return TRUE;
	}

	/*
	 * Avoid locks for first shadow check, otherwise diagnostic tools will
	 * spend most of their time obtaining locks in this function when analyzing
	 * processes with many VM entries which may commonly have no shadow chain.
	 *
	 * This is acceptable because:
	 *  - Shadow's fields are not accessed outside of its lock
	 *  - Objects are unlikely to be modified due to:
	 *	  - Many diagnostic tools suspend the task
	 *	  - VM map is locked
	 *	- The rare incorrect return from this function turns a guess into a
	 *	  slightly worse guess
	 *	- Entire shadow chain is not locked as a whole, so can still change
	 *	  while traversing, resulting in incorrect guess even with locking
	 */
	shadow_obj = cur_obj->shadow;
	if (shadow_obj == VM_OBJECT_NULL) {
		return FALSE;
	} else if (shadow_obj == object) {
		return TRUE;
	}

	/* deeper than one shadow: traverse with hand-over-hand locking */
	vm_object_lock(cur_obj);

	while ((shadow_obj = cur_obj->shadow)) {
		/* check if object was found before grabbing a lock */
		if (shadow_obj == object) {
			vm_object_unlock(cur_obj);
			return TRUE;
		}

		vm_object_lock(shadow_obj);
		vm_object_unlock(cur_obj);
		cur_obj = shadow_obj;
	}

	/* exhausted the shadow chain */
	vm_object_unlock(cur_obj);
	return FALSE;
}
15994 
15995 
15996 /*
15997  *	Routine:	vm_map_simplify
15998  *
15999  *	Description:
16000  *		Attempt to simplify the map representation in
16001  *		the vicinity of the given starting address.
16002  *	Note:
16003  *		This routine is intended primarily to keep the
16004  *		kernel maps more compact -- they generally don't
16005  *		benefit from the "expand a map entry" technology
16006  *		at allocation time because the adjacent entry
16007  *		is often wired down.
16008  */
/*
 * vm_map_simplify_entry:
 *
 * If "this_entry" and its predecessor map adjacent address ranges of
 * the same object at contiguous offsets, and agree on every attribute
 * checked below, merge the predecessor into "this_entry" and dispose
 * of it.  The map must be locked for writing; the entry's external
 * identity ("this_entry" pointer) is preserved so callers may keep
 * using it.
 */
void
vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	/*
	 * Both entries must be real (not the map header), the ranges must
	 * abut, and the mappings must be indistinguishable in object,
	 * offset, protections, wiring, and every behavioral flag below.
	 */
	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    (prev_entry->vme_end == this_entry->vme_start) &&

	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
#if __arm64e__
	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
#endif
	    (prev_entry->csm_associated == this_entry->csm_associated) &&
	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&

	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		/*
		 * Clear "vme_permanent" on the doomed entry first — the
		 * surviving entry keeps the flag, so permanence of the
		 * merged range is unchanged.
		 */
		if (prev_entry->vme_permanent) {
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow "this_entry" backwards over the predecessor's range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the reference the doomed entry held on its submap/object */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
16093 
16094 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)16095 vm_map_simplify(
16096 	vm_map_t        map,
16097 	vm_map_offset_t start)
16098 {
16099 	vm_map_entry_t  this_entry;
16100 
16101 	vm_map_lock(map);
16102 	if (vm_map_lookup_entry(map, start, &this_entry)) {
16103 		vm_map_simplify_entry(map, this_entry);
16104 		vm_map_simplify_entry(map, this_entry->vme_next);
16105 	}
16106 	vm_map_unlock(map);
16107 }
16108 
16109 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16110 vm_map_simplify_range(
16111 	vm_map_t        map,
16112 	vm_map_offset_t start,
16113 	vm_map_offset_t end)
16114 {
16115 	vm_map_entry_t  entry;
16116 
16117 	/*
16118 	 * The map should be locked (for "write") by the caller.
16119 	 */
16120 
16121 	if (start >= end) {
16122 		/* invalid address range */
16123 		return;
16124 	}
16125 
16126 	start = vm_map_trunc_page(start,
16127 	    VM_MAP_PAGE_MASK(map));
16128 	end = vm_map_round_page(end,
16129 	    VM_MAP_PAGE_MASK(map));
16130 
16131 	if (!vm_map_lookup_entry(map, start, &entry)) {
16132 		/* "start" is not mapped and "entry" ends before "start" */
16133 		if (entry == vm_map_to_entry(map)) {
16134 			/* start with first entry in the map */
16135 			entry = vm_map_first_entry(map);
16136 		} else {
16137 			/* start with next entry */
16138 			entry = entry->vme_next;
16139 		}
16140 	}
16141 
16142 	while (entry != vm_map_to_entry(map) &&
16143 	    entry->vme_start <= end) {
16144 		/* try and coalesce "entry" with its previous entry */
16145 		vm_map_simplify_entry(map, entry);
16146 		entry = entry->vme_next;
16147 	}
16148 }
16149 
16150 static __attribute__((always_inline, warn_unused_result))
16151 kern_return_t
vm_map_machine_attribute_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,mach_vm_offset_t * start,mach_vm_offset_t * end,vm_map_size_t * size)16152 vm_map_machine_attribute_sanitize(
16153 	vm_map_t                map,
16154 	vm_map_offset_ut        start_u,
16155 	vm_map_offset_ut        end_u,
16156 	mach_vm_offset_t       *start,
16157 	mach_vm_offset_t       *end,
16158 	vm_map_size_t          *size)
16159 {
16160 	return vm_sanitize_addr_end(start_u, end_u,
16161 	           VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16162 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
16163 	           size);
16164 }
16165 
16166 
16167 /*
16168  *	Routine:	vm_map_machine_attribute
16169  *	Purpose:
16170  *		Provide machine-specific attributes to mappings,
16171  *		such as cachability etc. for machines that provide
16172  *		them.  NUMA architectures and machines with big/strange
16173  *		caches will use this.
16174  *	Note:
16175  *		Responsibilities for locking and checking are handled here,
16176  *		everything else in the pmap module. If any non-volatile
16177  *		information must be kept, the pmap module should handle
16178  *		it itself. [This assumes that attributes do not
16179  *		need to be inherited, which seems ok to me]
16180  */
kern_return_t
vm_map_machine_attribute(
	vm_map_t                map,
	vm_map_offset_ut        start_u,
	vm_map_offset_ut        end_u,
	vm_machine_attribute_t  attribute,
	vm_machine_attribute_val_t *value) /* IN/OUT */
{
	mach_vm_offset_t start, end;
	vm_map_size_t    sync_size;     /* bytes of the range still to process */
	kern_return_t    ret;
	vm_map_entry_t   entry;

	/* sanitize the untrusted range before doing anything with it */
	ret = vm_map_machine_attribute_sanitize(map,
	    start_u,
	    end_u,
	    &start,
	    &end,
	    &sync_size);
	if (__improbable(ret != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(ret);
	}

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here.    */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */

	/*
	 * MATTR_CACHE: walk the range entry by entry, syncing the cache
	 * for each resident page backing the range.
	 */
	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* clamp this iteration to the end of the entry or the range */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/* recurse into the submap over the translated range */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t               m;
				vm_object_t             object;
				vm_object_t             base_object;
				vm_object_t             last_object;
				vm_object_offset_t      offset;
				vm_object_offset_t      base_offset;
				vm_map_size_t           range;
				range = sub_size;
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				/*
				 * For each page of the range: sync the cache for
				 * the page if resident here, otherwise descend
				 * the shadow chain (hand-over-hand locking) to
				 * look for it.
				 */
				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					/* restart from the top object for the next page */
					if (base_object != object) {
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the address range */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
16308 
16309 /*
16310  *	vm_map_behavior_set:
16311  *
16312  *	Sets the paging reference behavior of the specified address
16313  *	range in the target map.  Paging reference behavior affects
16314  *	how pagein operations resulting from faults on the map will be
16315  *	clustered.
16316  */
kern_return_t
vm_map_behavior_set(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_behavior_t   new_behavior)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;

	/* reject inverted or out-of-map ranges up front */
	if (start > end ||
	    start < vm_map_min(map) ||
	    end > vm_map_max(map)) {
		return KERN_NO_SPACE;
	}
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	switch (new_behavior) {
	/*
	 * This first block of behaviors all set a persistent state on the specified
	 * memory range.  All we have to do here is to record the desired behavior
	 * in the vm_map_entry_t's.
	 */

	case VM_BEHAVIOR_DEFAULT:
	case VM_BEHAVIOR_RANDOM:
	case VM_BEHAVIOR_SEQUENTIAL:
	case VM_BEHAVIOR_RSEQNTL:
	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
		vm_map_lock(map);

		/*
		 *	The entire address range must be valid for the map.
		 *      Note that vm_map_range_check() does a
		 *	vm_map_lookup_entry() internally and returns the
		 *	entry containing the start of the address range if
		 *	the entire range is valid.
		 */
		if (vm_map_range_check(map, start, end, &temp_entry)) {
			entry = temp_entry;
			/* clip so only the requested range is modified */
			vm_map_clip_start(map, entry, start);
		} else {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		/* walk and tag every entry overlapping [start, end) */
		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
			vm_map_clip_end(map, entry, end);
			if (entry->is_sub_map) {
				assert(!entry->use_pmap);
			}

			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
				/* recorded as a separate flag, not in "behavior" */
				entry->zero_wired_pages = TRUE;
			} else {
				entry->behavior = new_behavior;
			}
			entry = entry->vme_next;
		}

		vm_map_unlock(map);
		break;

	/*
	 * The rest of these are different from the above in that they cause
	 * an immediate action to take place as opposed to setting a behavior that
	 * affects future actions.
	 */

	case VM_BEHAVIOR_WILLNEED:
		return vm_map_willneed(map, start, end);

	case VM_BEHAVIOR_DONTNEED:
		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_FREE:
		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_REUSABLE:
		return vm_map_reusable_pages(map, start, end);

	case VM_BEHAVIOR_REUSE:
		return vm_map_reuse_pages(map, start, end);

	case VM_BEHAVIOR_CAN_REUSE:
		return vm_map_can_reuse(map, start, end);

#if MACH_ASSERT
	case VM_BEHAVIOR_PAGEOUT:
		return vm_map_pageout(map, start, end);
#endif /* MACH_ASSERT */

	case VM_BEHAVIOR_ZERO:
		return vm_map_zero(map, start, end);

	default:
		return KERN_INVALID_ARGUMENT;
	}

	return KERN_SUCCESS;
}
16420 
16421 
16422 /*
16423  * Internals for madvise(MADV_WILLNEED) system call.
16424  *
16425  * The implementation is to do:-
16426  * a) read-ahead if the mapping corresponds to a mapped regular file
16427  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16428  */
16429 
16430 
/*
 * vm_map_willneed:
 *
 * Implement madvise(MADV_WILLNEED) for [start, end): pre-fault
 * anonymous memory, or issue asynchronous read-ahead through the
 * pager for file-backed mappings.  The range must be fully mapped
 * (no holes); otherwise KERN_INVALID_ADDRESS is returned.  I/O
 * failures are deliberately swallowed (advice must not fail).
 */
static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
	    task_pid(current_task()), start, end);
	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth       = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
		    task_pid(current_task()), start, KERN_INVALID_ADDRESS);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset    = offset;
		fault_info.hi_offset    = offset + len;
		fault_info.user_tag     = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			/* drop the map lock: pre-faulting can block */
			vm_map_unlock_read(map);

			/* fault in the range one page at a time */
			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/* keep the object alive for the request; drop its lock */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
				    task_pid(current_task()), start, kr);
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
			    task_pid(current_task()), start, KERN_SUCCESS);
			return KERN_SUCCESS;
		}

		/* look up next entry */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
			    task_pid(current_task()), start, KERN_INVALID_ADDRESS);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
	    task_pid(current_task()), start, KERN_SUCCESS);
	return KERN_SUCCESS;
}
16634 
16635 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16636 vm_map_entry_is_reusable(
16637 	vm_map_entry_t entry)
16638 {
16639 	/* Only user map entries */
16640 
16641 	vm_object_t object;
16642 
16643 	if (entry->is_sub_map) {
16644 		return FALSE;
16645 	}
16646 
16647 	switch (VME_ALIAS(entry)) {
16648 	case VM_MEMORY_MALLOC:
16649 	case VM_MEMORY_MALLOC_SMALL:
16650 	case VM_MEMORY_MALLOC_LARGE:
16651 	case VM_MEMORY_REALLOC:
16652 	case VM_MEMORY_MALLOC_TINY:
16653 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16654 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16655 		/*
16656 		 * This is a malloc() memory region: check if it's still
16657 		 * in its original state and can be re-used for more
16658 		 * malloc() allocations.
16659 		 */
16660 		break;
16661 	default:
16662 		/*
16663 		 * Not a malloc() memory region: let the caller decide if
16664 		 * it's re-usable.
16665 		 */
16666 		return TRUE;
16667 	}
16668 
16669 	if (/*entry->is_shared ||*/
16670 		entry->is_sub_map ||
16671 		entry->in_transition ||
16672 		entry->protection != VM_PROT_DEFAULT ||
16673 		entry->max_protection != VM_PROT_ALL ||
16674 		entry->inheritance != VM_INHERIT_DEFAULT ||
16675 		entry->no_cache ||
16676 		entry->vme_permanent ||
16677 		entry->superpage_size != FALSE ||
16678 		entry->zero_wired_pages ||
16679 		entry->wired_count != 0 ||
16680 		entry->user_wired_count != 0) {
16681 		return FALSE;
16682 	}
16683 
16684 	object = VME_OBJECT(entry);
16685 	if (object == VM_OBJECT_NULL) {
16686 		return TRUE;
16687 	}
16688 	if (
16689 #if 0
16690 		/*
16691 		 * Let's proceed even if the VM object is potentially
16692 		 * shared.
16693 		 * We check for this later when processing the actual
16694 		 * VM pages, so the contents will be safe if shared.
16695 		 *
16696 		 * But we can still mark this memory region as "reusable" to
16697 		 * acknowledge that the caller did let us know that the memory
16698 		 * could be re-used and should not be penalized for holding
16699 		 * on to it.  This allows its "resident size" to not include
16700 		 * the reusable range.
16701 		 */
16702 		object->ref_count == 1 &&
16703 #endif
16704 		object->vo_copy == VM_OBJECT_NULL &&
16705 		object->shadow == VM_OBJECT_NULL &&
16706 		object->internal &&
16707 		object->purgable == VM_PURGABLE_DENY &&
16708 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16709 		!object->code_signed) {
16710 		return TRUE;
16711 	}
16712 	return FALSE;
16713 }
16714 
/*
 * vm_map_reuse_pages:
 *
 * Implement madvise(MADV_FREE_REUSE)-style semantics: mark the pages
 * of [start, end) as in use again after a prior "reusable" hint.
 * The whole range must be mapped by entries that pass
 * vm_map_entry_is_reusable(); otherwise KERN_INVALID_ADDRESS.
 */
static kern_return_t
vm_map_reuse_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;

	/*
	 * The MADV_REUSE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a
		 * portion of a native page.
		 */
		return KERN_SUCCESS;
	}

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reuse_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reuse_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
		} else {
			start_offset = 0;
		}
		/* clamp to the end of the entry or the requested range */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object != VM_OBJECT_NULL) {
			/* clear the "reusable" state on the object's pages */
			vm_object_lock(object);
			vm_object_reuse_pages(object, start_offset, end_offset,
			    TRUE);
			vm_object_unlock(object);
		}

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reuse_pages_success++;
	return KERN_SUCCESS;
}
16807 
16808 
/*
 * vm_map_reusable_pages:
 *
 * Mark the pages in [start, end) as "reusable": the VM may reclaim
 * their contents at any time (via vm_object_deactivate_pages with
 * reusable_pages == TRUE) without them counting against the task's
 * resident footprint in the usual way.
 *
 * Fails with KERN_INVALID_ADDRESS on holes or non-reusable entries,
 * and with KERN_PROTECTION_FAILURE for non-writable entries (their
 * contents can't legally be discarded).  No-op success for
 * sub-page-size maps (TODO4K).
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;
	vm_map_offset_t                 pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* kill_pages: 1 = safe to discard, -1 = don't (see below) */
		int kill_pages = 0;
		boolean_t reusable_no_write = FALSE;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
#if __arm64e__
		    && !entry->used_for_tpro
#endif
		    ) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		/* clip to the entry's end, then rebase into object offsets */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}

		if (entry->protection & VM_PROT_EXECUTE) {
			/*
			 * Executable mappings might be write-protected by
			 * hardware, so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		if (entry->vme_xnu_user_debug) {
			/*
			 * User debug pages might be write-protected by hardware,
			 * so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		vm_object_lock(object);
		/*
		 * Only deactivate when the object's pages can't be visible
		 * through another mapping/copy: either we hold the only
		 * reference, or the object is non-symmetric-copy with no
		 * copy object; and it must not be shadowed or subject to
		 * "iokit_acct"-style virtual-size accounting.
		 */
		if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->vo_copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (os_ref_get_count_raw(&object->ref_count) != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    reusable_no_write,
			    map->pmap,
			    pmap_offset);
		} else {
			/* shared/complex object: only record the event */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16974 
16975 
16976 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16977 vm_map_can_reuse(
16978 	vm_map_t        map,
16979 	vm_map_offset_t start,
16980 	vm_map_offset_t end)
16981 {
16982 	vm_map_entry_t                  entry;
16983 
16984 	/*
16985 	 * The MADV_REUSABLE operation doesn't require any changes to the
16986 	 * vm_map_entry_t's, so the read lock is sufficient.
16987 	 */
16988 
16989 	vm_map_lock_read(map);
16990 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16991 
16992 	/*
16993 	 * The madvise semantics require that the address range be fully
16994 	 * allocated with no holes.  Otherwise, we're required to return
16995 	 * an error.
16996 	 */
16997 
16998 	if (!vm_map_range_check(map, start, end, &entry)) {
16999 		vm_map_unlock_read(map);
17000 		vm_page_stats_reusable.can_reuse_failure++;
17001 		return KERN_INVALID_ADDRESS;
17002 	}
17003 
17004 	/*
17005 	 * Examine each vm_map_entry_t in the range.
17006 	 */
17007 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17008 	    entry = entry->vme_next) {
17009 		/*
17010 		 * Sanity check on the VM map entry.
17011 		 */
17012 		if (!vm_map_entry_is_reusable(entry)) {
17013 			vm_map_unlock_read(map);
17014 			vm_page_stats_reusable.can_reuse_failure++;
17015 			return KERN_INVALID_ADDRESS;
17016 		}
17017 	}
17018 
17019 	vm_map_unlock_read(map);
17020 	vm_page_stats_reusable.can_reuse_success++;
17021 	return KERN_SUCCESS;
17022 }
17023 
17024 
17025 #if MACH_ASSERT
/*
 * vm_map_pageout:
 *
 * Push the internal VM objects backing [start, end) to the pageout
 * subsystem via vm_object_pageout().  One level of submap is
 * followed; entries with no object, non-internal objects, or
 * doubly-nested submaps are silently skipped.
 *
 * Compiled only under MACH_ASSERT (see surrounding #if).
 * Returns KERN_INVALID_ADDRESS if the range (or a submap's
 * corresponding range) has a hole, KERN_SUCCESS otherwise.
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t     object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t submap_entry;

			/* translate this entry's span into the submap */
			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			/* don't recurse into a submap within a submap */
			if (submap_entry->is_sub_map) {
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		object = VME_OBJECT(entry);
		/* only internal (anonymous) memory is paged out here */
		if (object == VM_OBJECT_NULL || !object->internal) {
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
17114 #endif /* MACH_ASSERT */
17115 
17116 /*
17117  * This function determines if the zero operation can be run on the
17118  * respective entry. Additional checks on the object are in
17119  * vm_object_zero_preflight.
17120  */
17121 static kern_return_t
vm_map_zero_entry_preflight(vm_map_entry_t entry)17122 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17123 {
17124 	/*
17125 	 * Zeroing is restricted to writable non-executable entries and non-JIT
17126 	 * regions.
17127 	 */
17128 	if (!(entry->protection & VM_PROT_WRITE) ||
17129 	    (entry->protection & VM_PROT_EXECUTE) ||
17130 	    entry->used_for_jit ||
17131 	    entry->vme_xnu_user_debug) {
17132 		return KERN_PROTECTION_FAILURE;
17133 	}
17134 
17135 	/*
17136 	 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17137 	 * allowed for submaps.
17138 	 */
17139 	if (entry->needs_copy || entry->is_sub_map) {
17140 		return KERN_NO_ACCESS;
17141 	}
17142 
17143 	return KERN_SUCCESS;
17144 }
17145 
17146 /*
17147  * This function translates entry's start and end to offsets in the object
17148  */
17149 static void
vm_map_get_bounds_in_object(vm_map_entry_t entry,vm_map_offset_t start,vm_map_offset_t end,vm_map_offset_t * start_offset,vm_map_offset_t * end_offset)17150 vm_map_get_bounds_in_object(
17151 	vm_map_entry_t      entry,
17152 	vm_map_offset_t     start,
17153 	vm_map_offset_t     end,
17154 	vm_map_offset_t    *start_offset,
17155 	vm_map_offset_t    *end_offset)
17156 {
17157 	if (entry->vme_start < start) {
17158 		*start_offset = start - entry->vme_start;
17159 	} else {
17160 		*start_offset = 0;
17161 	}
17162 	*end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17163 	*start_offset += VME_OFFSET(entry);
17164 	*end_offset += VME_OFFSET(entry);
17165 }
17166 
17167 /*
17168  * This function iterates through the entries in the requested range
17169  * and zeroes any resident pages in the corresponding objects. Compressed
17170  * pages are dropped instead of being faulted in and zeroed.
17171  */
17172 static kern_return_t
vm_map_zero(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)17173 vm_map_zero(
17174 	vm_map_t        map,
17175 	vm_map_offset_t start,
17176 	vm_map_offset_t end)
17177 {
17178 	vm_map_entry_t                  entry;
17179 	vm_map_offset_t                 cur = start;
17180 	kern_return_t                   ret;
17181 
17182 	/*
17183 	 * This operation isn't supported where the map page size is less than
17184 	 * the hardware page size. Caller will need to handle error and
17185 	 * explicitly zero memory if needed.
17186 	 */
17187 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17188 		return KERN_NO_ACCESS;
17189 	}
17190 
17191 	/*
17192 	 * The MADV_ZERO operation doesn't require any changes to the
17193 	 * vm_map_entry_t's, so the read lock is sufficient.
17194 	 */
17195 	vm_map_lock_read(map);
17196 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17197 
17198 	/*
17199 	 * The madvise semantics require that the address range be fully
17200 	 * allocated with no holes. Otherwise, we're required to return
17201 	 * an error. This check needs to be redone if the map has changed.
17202 	 */
17203 	if (!vm_map_range_check(map, cur, end, &entry)) {
17204 		vm_map_unlock_read(map);
17205 		return KERN_INVALID_ADDRESS;
17206 	}
17207 
17208 	/*
17209 	 * Examine each vm_map_entry_t in the range.
17210 	 */
17211 	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17212 		vm_map_offset_t cur_offset;
17213 		vm_map_offset_t end_offset;
17214 		unsigned int last_timestamp = map->timestamp;
17215 		vm_object_t object = VME_OBJECT(entry);
17216 
17217 		ret = vm_map_zero_entry_preflight(entry);
17218 		if (ret != KERN_SUCCESS) {
17219 			vm_map_unlock_read(map);
17220 			return ret;
17221 		}
17222 
17223 		if (object == VM_OBJECT_NULL) {
17224 			entry = entry->vme_next;
17225 			continue;
17226 		}
17227 
17228 		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17229 		vm_object_lock(object);
17230 		/*
17231 		 * Take a reference on the object as vm_object_zero will drop the object
17232 		 * lock when it encounters a busy page.
17233 		 */
17234 		vm_object_reference_locked(object);
17235 		vm_map_unlock_read(map);
17236 
17237 		ret = vm_object_zero(object, cur_offset, end_offset);
17238 		vm_object_unlock(object);
17239 		vm_object_deallocate(object);
17240 		if (ret != KERN_SUCCESS) {
17241 			return ret;
17242 		}
17243 		/*
17244 		 * Update cur as vm_object_zero has succeeded.
17245 		 */
17246 		cur += (end_offset - cur_offset);
17247 		if (cur == end) {
17248 			return KERN_SUCCESS;
17249 		}
17250 
17251 		/*
17252 		 * If the map timestamp has changed, restart by relooking up cur in the
17253 		 * map
17254 		 */
17255 		vm_map_lock_read(map);
17256 		if (last_timestamp != map->timestamp) {
17257 			/*
17258 			 * Relookup cur in the map
17259 			 */
17260 			if (!vm_map_range_check(map, cur, end, &entry)) {
17261 				vm_map_unlock_read(map);
17262 				return KERN_INVALID_ADDRESS;
17263 			}
17264 			continue;
17265 		}
17266 		/*
17267 		 * If the map hasn't changed proceed with the next entry
17268 		 */
17269 		entry = entry->vme_next;
17270 	}
17271 
17272 	vm_map_unlock_read(map);
17273 	return KERN_SUCCESS;
17274 }
17275 
17276 
17277 /*
17278  *	Routine:	vm_map_entry_insert
17279  *
17280  *	Description:	This routine inserts a new vm_entry in a locked map.
17281  */
17282 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)17283 vm_map_entry_insert(
17284 	vm_map_t                map,
17285 	vm_map_entry_t          insp_entry,
17286 	vm_map_offset_t         start,
17287 	vm_map_offset_t         end,
17288 	vm_object_t             object,
17289 	vm_object_offset_t      offset,
17290 	vm_map_kernel_flags_t   vmk_flags,
17291 	boolean_t               needs_copy,
17292 	vm_prot_t               cur_protection,
17293 	vm_prot_t               max_protection,
17294 	vm_inherit_t            inheritance,
17295 	boolean_t               clear_map_aligned)
17296 {
17297 	vm_map_entry_t  new_entry;
17298 	boolean_t map_aligned = FALSE;
17299 
17300 	assert(insp_entry != (vm_map_entry_t)0);
17301 	vm_map_lock_assert_exclusive(map);
17302 
17303 	__assert_only vm_object_offset_t      end_offset = 0;
17304 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17305 
17306 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17307 		map_aligned = TRUE;
17308 	}
17309 	if (clear_map_aligned &&
17310 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17311 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17312 		map_aligned = FALSE;
17313 	}
17314 	if (map_aligned) {
17315 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17316 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17317 	} else {
17318 		assert(page_aligned(start));
17319 		assert(page_aligned(end));
17320 	}
17321 	assert(start < end);
17322 
17323 	new_entry = vm_map_entry_create(map);
17324 
17325 	new_entry->vme_start = start;
17326 	new_entry->vme_end = end;
17327 
17328 	if (vmk_flags.vmkf_submap) {
17329 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17330 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17331 	} else {
17332 		VME_OBJECT_SET(new_entry, object, false, 0);
17333 	}
17334 	VME_OFFSET_SET(new_entry, offset);
17335 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17336 
17337 	new_entry->map_aligned = map_aligned;
17338 	new_entry->needs_copy = needs_copy;
17339 	new_entry->inheritance = inheritance;
17340 	new_entry->protection = cur_protection;
17341 	new_entry->max_protection = max_protection;
17342 	/*
17343 	 * submap: "use_pmap" means "nested".
17344 	 * default: false.
17345 	 *
17346 	 * object: "use_pmap" means "use pmap accounting" for footprint.
17347 	 * default: true.
17348 	 */
17349 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
17350 	new_entry->no_cache = vmk_flags.vmf_no_cache;
17351 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
17352 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17353 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17354 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17355 
17356 	if (vmk_flags.vmkf_map_jit) {
17357 		if (!(map->jit_entry_exists) ||
17358 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17359 			new_entry->used_for_jit = TRUE;
17360 			map->jit_entry_exists = TRUE;
17361 		}
17362 	}
17363 
17364 	/*
17365 	 *	Insert the new entry into the list.
17366 	 */
17367 
17368 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17369 	map->size += end - start;
17370 
17371 	/*
17372 	 *	Update the free space hint and the lookup hint.
17373 	 */
17374 
17375 	SAVE_HINT_MAP_WRITE(map, new_entry);
17376 	return new_entry;
17377 }
17378 
17379 /*
17380  *	Routine:	vm_map_remap_extract
17381  *
17382  *	Description:	This routine returns a vm_entry list from a map.
17383  */
17384 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)17385 vm_map_remap_extract(
17386 	vm_map_t                map,
17387 	vm_map_offset_t         addr,
17388 	vm_map_size_t           size,
17389 	boolean_t               copy,
17390 	vm_map_copy_t           map_copy,
17391 	vm_prot_t               *cur_protection,   /* IN/OUT */
17392 	vm_prot_t               *max_protection,   /* IN/OUT */
17393 	/* What, no behavior? */
17394 	vm_inherit_t            inheritance,
17395 	vm_map_kernel_flags_t   vmk_flags)
17396 {
17397 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17398 	kern_return_t           result;
17399 	vm_map_size_t           mapped_size;
17400 	vm_map_size_t           tmp_size;
17401 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17402 	vm_map_entry_t          new_entry;
17403 	vm_object_offset_t      offset;
17404 	vm_map_offset_t         map_address;
17405 	vm_map_offset_t         src_start;     /* start of entry to map */
17406 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17407 	vm_object_t             object;
17408 	vm_map_version_t        version;
17409 	boolean_t               src_needs_copy;
17410 	boolean_t               new_entry_needs_copy;
17411 	vm_map_entry_t          saved_src_entry;
17412 	boolean_t               src_entry_was_wired;
17413 	vm_prot_t               max_prot_for_prot_copy;
17414 	vm_map_offset_t         effective_page_mask;
17415 	bool                    pageable, same_map;
17416 	boolean_t               vm_remap_legacy;
17417 	vm_prot_t               required_cur_prot, required_max_prot;
17418 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17419 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17420 
17421 	pageable = vmk_flags.vmkf_copy_pageable;
17422 	same_map = vmk_flags.vmkf_copy_same_map;
17423 
17424 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17425 
17426 	assert(map != VM_MAP_NULL);
17427 	assert(size != 0);
17428 	assert(size == vm_map_round_page(size, effective_page_mask));
17429 	assert(inheritance == VM_INHERIT_NONE ||
17430 	    inheritance == VM_INHERIT_COPY ||
17431 	    inheritance == VM_INHERIT_SHARE);
17432 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17433 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17434 	assert((*cur_protection & *max_protection) == *cur_protection);
17435 
17436 	/*
17437 	 *	Compute start and end of region.
17438 	 */
17439 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17440 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17441 
17442 	/*
17443 	 *	Initialize map_header.
17444 	 */
17445 	map_header->nentries = 0;
17446 	map_header->entries_pageable = pageable;
17447 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17448 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17449 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17450 	vm_map_store_init(map_header);
17451 
17452 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17453 		/*
17454 		 * Special case for vm_map_protect(VM_PROT_COPY):
17455 		 * we want to set the new mappings' max protection to the
17456 		 * specified *max_protection...
17457 		 */
17458 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17459 		/* ... but we want to use the vm_remap() legacy mode */
17460 		vmk_flags.vmkf_remap_legacy_mode = true;
17461 		*max_protection = VM_PROT_NONE;
17462 		*cur_protection = VM_PROT_NONE;
17463 	} else {
17464 		max_prot_for_prot_copy = VM_PROT_NONE;
17465 	}
17466 
17467 	if (vmk_flags.vmkf_remap_legacy_mode) {
17468 		/*
17469 		 * vm_remap() legacy mode:
17470 		 * Extract all memory regions in the specified range and
17471 		 * collect the strictest set of protections allowed on the
17472 		 * entire range, so the caller knows what they can do with
17473 		 * the remapped range.
17474 		 * We start with VM_PROT_ALL and we'll remove the protections
17475 		 * missing from each memory region.
17476 		 */
17477 		vm_remap_legacy = TRUE;
17478 		*cur_protection = VM_PROT_ALL;
17479 		*max_protection = VM_PROT_ALL;
17480 		required_cur_prot = VM_PROT_NONE;
17481 		required_max_prot = VM_PROT_NONE;
17482 	} else {
17483 		/*
17484 		 * vm_remap_new() mode:
17485 		 * Extract all memory regions in the specified range and
17486 		 * ensure that they have at least the protections specified
17487 		 * by the caller via *cur_protection and *max_protection.
17488 		 * The resulting mapping should have these protections.
17489 		 */
17490 		vm_remap_legacy = FALSE;
17491 		if (copy) {
17492 			required_cur_prot = VM_PROT_NONE;
17493 			required_max_prot = VM_PROT_READ;
17494 		} else {
17495 			required_cur_prot = *cur_protection;
17496 			required_max_prot = *max_protection;
17497 		}
17498 	}
17499 
17500 	map_address = 0;
17501 	mapped_size = 0;
17502 	result = KERN_SUCCESS;
17503 
17504 	/*
17505 	 *	The specified source virtual space might correspond to
17506 	 *	multiple map entries, need to loop on them.
17507 	 */
17508 	vm_map_lock(map);
17509 
17510 	if (map->pmap == kernel_pmap) {
17511 		map_copy->is_kernel_range = true;
17512 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17513 #if CONFIG_MAP_RANGES
17514 	} else if (map->uses_user_ranges) {
17515 		map_copy->is_user_range = true;
17516 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17517 #endif /* CONFIG_MAP_RANGES */
17518 	}
17519 
17520 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17521 		/*
17522 		 * This address space uses sub-pages so the range might
17523 		 * not be re-mappable in an address space with larger
17524 		 * pages. Re-assemble any broken-up VM map entries to
17525 		 * improve our chances of making it work.
17526 		 */
17527 		vm_map_simplify_range(map, src_start, src_end);
17528 	}
17529 	while (mapped_size != size) {
17530 		vm_map_size_t   entry_size;
17531 
17532 		/*
17533 		 *	Find the beginning of the region.
17534 		 */
17535 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17536 			result = KERN_INVALID_ADDRESS;
17537 			break;
17538 		}
17539 
17540 		if (src_start < src_entry->vme_start ||
17541 		    (mapped_size && src_start != src_entry->vme_start)) {
17542 			result = KERN_INVALID_ADDRESS;
17543 			break;
17544 		}
17545 
17546 		tmp_size = size - mapped_size;
17547 		if (src_end > src_entry->vme_end) {
17548 			tmp_size -= (src_end - src_entry->vme_end);
17549 		}
17550 
17551 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17552 		    src_entry->vme_start);
17553 
17554 		if (src_entry->is_sub_map &&
17555 		    vmk_flags.vmkf_copy_single_object) {
17556 			vm_map_t submap;
17557 			vm_map_offset_t submap_start;
17558 			vm_map_size_t submap_size;
17559 			boolean_t submap_needs_copy;
17560 
17561 			/*
17562 			 * No check for "required protection" on "src_entry"
17563 			 * because the protections that matter are the ones
17564 			 * on the submap's VM map entry, which will be checked
17565 			 * during the call to vm_map_remap_extract() below.
17566 			 */
17567 			object = VM_OBJECT_NULL;
17568 
17569 			submap_size = src_entry->vme_end - src_start;
17570 			if (submap_size > size) {
17571 				submap_size = size;
17572 			}
17573 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17574 			submap = VME_SUBMAP(src_entry);
17575 			if (copy) {
17576 				/*
17577 				 * The caller wants a copy-on-write re-mapping,
17578 				 * so let's extract from the submap accordingly.
17579 				 */
17580 				submap_needs_copy = TRUE;
17581 			} else if (src_entry->needs_copy) {
17582 				/*
17583 				 * The caller wants a shared re-mapping but the
17584 				 * submap is mapped with "needs_copy", so its
17585 				 * contents can't be shared as is. Extract the
17586 				 * contents of the submap as "copy-on-write".
17587 				 * The re-mapping won't be shared with the
17588 				 * original mapping but this is equivalent to
17589 				 * what happened with the original "remap from
17590 				 * submap" code.
17591 				 * The shared region is mapped "needs_copy", for
17592 				 * example.
17593 				 */
17594 				submap_needs_copy = TRUE;
17595 			} else {
17596 				/*
17597 				 * The caller wants a shared re-mapping and
17598 				 * this mapping can be shared (no "needs_copy"),
17599 				 * so let's extract from the submap accordingly.
17600 				 * Kernel submaps are mapped without
17601 				 * "needs_copy", for example.
17602 				 */
17603 				submap_needs_copy = FALSE;
17604 			}
17605 			vm_map_reference(submap);
17606 			vm_map_unlock(map);
17607 			src_entry = NULL;
17608 			if (vm_remap_legacy) {
17609 				*cur_protection = VM_PROT_NONE;
17610 				*max_protection = VM_PROT_NONE;
17611 			}
17612 
17613 			DTRACE_VM7(remap_submap_recurse,
17614 			    vm_map_t, map,
17615 			    vm_map_offset_t, addr,
17616 			    vm_map_size_t, size,
17617 			    boolean_t, copy,
17618 			    vm_map_offset_t, submap_start,
17619 			    vm_map_size_t, submap_size,
17620 			    boolean_t, submap_needs_copy);
17621 
17622 			result = vm_map_remap_extract(submap,
17623 			    submap_start,
17624 			    submap_size,
17625 			    submap_needs_copy,
17626 			    map_copy,
17627 			    cur_protection,
17628 			    max_protection,
17629 			    inheritance,
17630 			    vmk_flags);
17631 			vm_map_deallocate(submap);
17632 
17633 			if (result == KERN_SUCCESS &&
17634 			    submap_needs_copy &&
17635 			    !copy) {
17636 				/*
17637 				 * We were asked for a "shared"
17638 				 * re-mapping but had to ask for a
17639 				 * "copy-on-write" remapping of the
17640 				 * submap's mapping to honor the
17641 				 * submap's "needs_copy".
17642 				 * We now need to resolve that
17643 				 * pending "copy-on-write" to
17644 				 * get something we can share.
17645 				 */
17646 				vm_map_entry_t copy_entry;
17647 				vm_object_offset_t copy_offset;
17648 				vm_map_size_t copy_size;
17649 				vm_object_t copy_object;
17650 				copy_entry = vm_map_copy_first_entry(map_copy);
17651 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17652 				copy_object = VME_OBJECT(copy_entry);
17653 				copy_offset = VME_OFFSET(copy_entry);
17654 				if (copy_object == VM_OBJECT_NULL) {
17655 					assert(copy_offset == 0);
17656 					assert(!copy_entry->needs_copy);
17657 					if (copy_entry->max_protection == VM_PROT_NONE) {
17658 						assert(copy_entry->protection == VM_PROT_NONE);
17659 						/* nothing to share */
17660 					} else {
17661 						assert(copy_offset == 0);
17662 						copy_object = vm_object_allocate(copy_size);
17663 						VME_OFFSET_SET(copy_entry, 0);
17664 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17665 						assert(copy_entry->use_pmap);
17666 					}
17667 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17668 					/* already shareable */
17669 					assert(!copy_entry->needs_copy);
17670 				} else if (copy_entry->needs_copy ||
17671 				    copy_object->shadowed ||
17672 				    (copy_object->internal &&
17673 				    !copy_object->true_share &&
17674 				    !copy_entry->is_shared &&
17675 				    copy_object->vo_size > copy_size)) {
17676 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17677 					assert(copy_entry->use_pmap);
17678 					if (copy_entry->needs_copy) {
17679 						/* already write-protected */
17680 					} else {
17681 						vm_prot_t prot;
17682 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17683 						vm_object_pmap_protect(copy_object,
17684 						    copy_offset,
17685 						    copy_size,
17686 						    PMAP_NULL,
17687 						    PAGE_SIZE,
17688 						    0,
17689 						    prot);
17690 					}
17691 					copy_entry->needs_copy = FALSE;
17692 				}
17693 				copy_object = VME_OBJECT(copy_entry);
17694 				copy_offset = VME_OFFSET(copy_entry);
17695 				if (copy_object &&
17696 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17697 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17698 					copy_object->true_share = TRUE;
17699 				}
17700 			}
17701 
17702 			return result;
17703 		}
17704 
17705 		if (src_entry->is_sub_map) {
17706 			/* protections for submap mapping are irrelevant here */
17707 		} else if (((src_entry->protection & required_cur_prot) !=
17708 		    required_cur_prot) ||
17709 		    ((src_entry->max_protection & required_max_prot) !=
17710 		    required_max_prot)) {
17711 			if (vmk_flags.vmkf_copy_single_object &&
17712 			    mapped_size != 0) {
17713 				/*
17714 				 * Single object extraction.
17715 				 * We can't extract more with the required
17716 				 * protection but we've extracted some, so
17717 				 * stop there and declare success.
17718 				 * The caller should check the size of
17719 				 * the copy entry we've extracted.
17720 				 */
17721 				result = KERN_SUCCESS;
17722 			} else {
17723 				/*
17724 				 * VM range extraction.
17725 				 * Required proctection is not available
17726 				 * for this part of the range: fail.
17727 				 */
17728 				result = KERN_PROTECTION_FAILURE;
17729 			}
17730 			break;
17731 		}
17732 
17733 		if (src_entry->is_sub_map) {
17734 			vm_map_t submap;
17735 			vm_map_offset_t submap_start;
17736 			vm_map_size_t submap_size;
17737 			vm_map_copy_t submap_copy;
17738 			vm_prot_t submap_curprot, submap_maxprot;
17739 			boolean_t submap_needs_copy;
17740 
17741 			/*
17742 			 * No check for "required protection" on "src_entry"
17743 			 * because the protections that matter are the ones
17744 			 * on the submap's VM map entry, which will be checked
17745 			 * during the call to vm_map_copy_extract() below.
17746 			 */
17747 			object = VM_OBJECT_NULL;
17748 			submap_copy = VM_MAP_COPY_NULL;
17749 
17750 			/* find equivalent range in the submap */
17751 			submap = VME_SUBMAP(src_entry);
17752 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17753 			submap_size = tmp_size;
17754 			if (copy) {
17755 				/*
17756 				 * The caller wants a copy-on-write re-mapping,
17757 				 * so let's extract from the submap accordingly.
17758 				 */
17759 				submap_needs_copy = TRUE;
17760 			} else if (src_entry->needs_copy) {
17761 				/*
17762 				 * The caller wants a shared re-mapping but the
17763 				 * submap is mapped with "needs_copy", so its
17764 				 * contents can't be shared as is. Extract the
17765 				 * contents of the submap as "copy-on-write".
17766 				 * The re-mapping won't be shared with the
17767 				 * original mapping but this is equivalent to
17768 				 * what happened with the original "remap from
17769 				 * submap" code.
17770 				 * The shared region is mapped "needs_copy", for
17771 				 * example.
17772 				 */
17773 				submap_needs_copy = TRUE;
17774 			} else {
17775 				/*
17776 				 * The caller wants a shared re-mapping and
17777 				 * this mapping can be shared (no "needs_copy"),
17778 				 * so let's extract from the submap accordingly.
17779 				 * Kernel submaps are mapped without
17780 				 * "needs_copy", for example.
17781 				 */
17782 				submap_needs_copy = FALSE;
17783 			}
17784 			/* extra ref to keep submap alive */
17785 			vm_map_reference(submap);
17786 
17787 			DTRACE_VM7(remap_submap_recurse,
17788 			    vm_map_t, map,
17789 			    vm_map_offset_t, addr,
17790 			    vm_map_size_t, size,
17791 			    boolean_t, copy,
17792 			    vm_map_offset_t, submap_start,
17793 			    vm_map_size_t, submap_size,
17794 			    boolean_t, submap_needs_copy);
17795 
17796 			/*
17797 			 * The map can be safely unlocked since we
17798 			 * already hold a reference on the submap.
17799 			 *
17800 			 * No timestamp since we don't care if the map
17801 			 * gets modified while we're down in the submap.
17802 			 * We'll resume the extraction at src_start + tmp_size
17803 			 * anyway.
17804 			 */
17805 			vm_map_unlock(map);
17806 			src_entry = NULL; /* not valid once map is unlocked */
17807 
17808 			if (vm_remap_legacy) {
17809 				submap_curprot = VM_PROT_NONE;
17810 				submap_maxprot = VM_PROT_NONE;
17811 				if (max_prot_for_prot_copy) {
17812 					submap_maxprot = max_prot_for_prot_copy;
17813 				}
17814 			} else {
17815 				assert(!max_prot_for_prot_copy);
17816 				submap_curprot = *cur_protection;
17817 				submap_maxprot = *max_protection;
17818 			}
17819 			result = vm_map_copy_extract(submap,
17820 			    submap_start,
17821 			    submap_size,
17822 			    submap_needs_copy,
17823 			    &submap_copy,
17824 			    &submap_curprot,
17825 			    &submap_maxprot,
17826 			    inheritance,
17827 			    vmk_flags);
17828 
17829 			/* release extra ref on submap */
17830 			vm_map_deallocate(submap);
17831 			submap = VM_MAP_NULL;
17832 
17833 			if (result != KERN_SUCCESS) {
17834 				vm_map_lock(map);
17835 				break;
17836 			}
17837 
17838 			/* transfer submap_copy entries to map_header */
17839 			while (vm_map_copy_first_entry(submap_copy) !=
17840 			    vm_map_copy_to_entry(submap_copy)) {
17841 				vm_map_entry_t copy_entry;
17842 				vm_map_size_t copy_entry_size;
17843 
17844 				copy_entry = vm_map_copy_first_entry(submap_copy);
17845 
17846 				/*
17847 				 * Prevent kernel_object from being exposed to
17848 				 * user space.
17849 				 */
17850 				if (__improbable(copy_entry->vme_kernel_object)) {
17851 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17852 					    proc_selfpid(),
17853 					    (get_bsdtask_info(current_task())
17854 					    ? proc_name_address(get_bsdtask_info(current_task()))
17855 					    : "?"));
17856 					DTRACE_VM(extract_kernel_only);
17857 					result = KERN_INVALID_RIGHT;
17858 					vm_map_copy_discard(submap_copy);
17859 					submap_copy = VM_MAP_COPY_NULL;
17860 					vm_map_lock(map);
17861 					break;
17862 				}
17863 
17864 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17865 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17866 				copy_entry->vme_start = map_address;
17867 				copy_entry->vme_end = map_address + copy_entry_size;
17868 				map_address += copy_entry_size;
17869 				mapped_size += copy_entry_size;
17870 				src_start += copy_entry_size;
17871 				assert(src_start <= src_end);
17872 				_vm_map_store_entry_link(map_header,
17873 				    map_header->links.prev,
17874 				    copy_entry);
17875 			}
17876 			/* done with submap_copy */
17877 			vm_map_copy_discard(submap_copy);
17878 
17879 			if (vm_remap_legacy) {
17880 				*cur_protection &= submap_curprot;
17881 				*max_protection &= submap_maxprot;
17882 			}
17883 
17884 			/* re-acquire the map lock and continue to next entry */
17885 			vm_map_lock(map);
17886 			continue;
17887 		} else {
17888 			object = VME_OBJECT(src_entry);
17889 
17890 			/*
17891 			 * Prevent kernel_object from being exposed to
17892 			 * user space.
17893 			 */
17894 			if (__improbable(is_kernel_object(object))) {
17895 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17896 				    proc_selfpid(),
17897 				    (get_bsdtask_info(current_task())
17898 				    ? proc_name_address(get_bsdtask_info(current_task()))
17899 				    : "?"));
17900 				DTRACE_VM(extract_kernel_only);
17901 				result = KERN_INVALID_RIGHT;
17902 				break;
17903 			}
17904 
17905 			if (src_entry->iokit_acct) {
17906 				/*
17907 				 * This entry uses "IOKit accounting".
17908 				 */
17909 			} else if (object != VM_OBJECT_NULL &&
17910 			    object->internal &&
17911 			    (object->purgable != VM_PURGABLE_DENY ||
17912 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17913 				/*
17914 				 * Purgeable objects have their own accounting:
17915 				 * no pmap accounting for them.
17916 				 */
17917 				assertf(!src_entry->use_pmap,
17918 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17919 				    map,
17920 				    src_entry,
17921 				    (uint64_t)src_entry->vme_start,
17922 				    (uint64_t)src_entry->vme_end,
17923 				    src_entry->protection,
17924 				    src_entry->max_protection,
17925 				    VME_ALIAS(src_entry));
17926 			} else {
17927 				/*
17928 				 * Not IOKit or purgeable:
17929 				 * must be accounted by pmap stats.
17930 				 */
17931 				assertf(src_entry->use_pmap,
17932 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17933 				    map,
17934 				    src_entry,
17935 				    (uint64_t)src_entry->vme_start,
17936 				    (uint64_t)src_entry->vme_end,
17937 				    src_entry->protection,
17938 				    src_entry->max_protection,
17939 				    VME_ALIAS(src_entry));
17940 			}
17941 
17942 			if (object == VM_OBJECT_NULL) {
17943 				assert(!src_entry->needs_copy);
17944 				if (src_entry->max_protection == VM_PROT_NONE) {
17945 					assert(src_entry->protection == VM_PROT_NONE);
17946 					/*
17947 					 * No VM object and no permissions:
17948 					 * this must be a reserved range with
17949 					 * nothing to share or copy.
17950 					 * There could also be all sorts of
17951 					 * pmap shenanigans within that reserved
17952 					 * range, so let's just copy the map
17953 					 * entry as is to remap a similar
17954 					 * reserved range.
17955 					 */
17956 					offset = 0; /* no object => no offset */
17957 					goto copy_src_entry;
17958 				}
17959 				object = vm_object_allocate(entry_size);
17960 				VME_OFFSET_SET(src_entry, 0);
17961 				VME_OBJECT_SET(src_entry, object, false, 0);
17962 				assert(src_entry->use_pmap);
17963 				assert(!map->mapped_in_other_pmaps);
17964 			} else if (src_entry->wired_count ||
17965 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17966 				/*
17967 				 * A wired memory region should not have
17968 				 * any pending copy-on-write and needs to
17969 				 * keep pointing at the VM object that
17970 				 * contains the wired pages.
17971 				 * If we're sharing this memory (copy=false),
17972 				 * we'll share this VM object.
17973 				 * If we're copying this memory (copy=true),
17974 				 * we'll call vm_object_copy_slowly() below
17975 				 * and use the new VM object for the remapping.
17976 				 *
17977 				 * Or, we are already using an asymmetric
17978 				 * copy, and therefore we already have
17979 				 * the right object.
17980 				 */
17981 				assert(!src_entry->needs_copy);
17982 			} else if (src_entry->needs_copy || object->shadowed ||
17983 			    (object->internal && !object->true_share &&
17984 			    !src_entry->is_shared &&
17985 			    object->vo_size > entry_size)) {
17986 				bool is_writable;
17987 
17988 				VME_OBJECT_SHADOW(src_entry, entry_size,
17989 				    vm_map_always_shadow(map));
17990 				assert(src_entry->use_pmap);
17991 
17992 				is_writable = false;
17993 				if (src_entry->protection & VM_PROT_WRITE) {
17994 					is_writable = true;
17995 #if __arm64e__
17996 				} else if (src_entry->used_for_tpro) {
17997 					is_writable = true;
17998 #endif /* __arm64e__ */
17999 				}
18000 				if (!src_entry->needs_copy && is_writable) {
18001 					vm_prot_t prot;
18002 
18003 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18004 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18005 						    __FUNCTION__,
18006 						    map, map->pmap,
18007 						    src_entry,
18008 						    (uint64_t)src_entry->vme_start,
18009 						    (uint64_t)src_entry->vme_end,
18010 						    src_entry->protection);
18011 					}
18012 
18013 					prot = src_entry->protection & ~VM_PROT_WRITE;
18014 
18015 					if (override_nx(map,
18016 					    VME_ALIAS(src_entry))
18017 					    && prot) {
18018 						prot |= VM_PROT_EXECUTE;
18019 					}
18020 
18021 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18022 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18023 						    __FUNCTION__,
18024 						    map, map->pmap,
18025 						    src_entry,
18026 						    (uint64_t)src_entry->vme_start,
18027 						    (uint64_t)src_entry->vme_end,
18028 						    prot);
18029 					}
18030 
18031 					if (map->mapped_in_other_pmaps) {
18032 						vm_object_pmap_protect(
18033 							VME_OBJECT(src_entry),
18034 							VME_OFFSET(src_entry),
18035 							entry_size,
18036 							PMAP_NULL,
18037 							PAGE_SIZE,
18038 							src_entry->vme_start,
18039 							prot);
18040 #if MACH_ASSERT
18041 					} else if (__improbable(map->pmap == PMAP_NULL)) {
18042 						/*
18043 						 * Some VM tests (in vm_tests.c)
18044 						 * sometimes want to use a VM
18045 						 * map without a pmap.
18046 						 * Otherwise, this should never
18047 						 * happen.
18048 						 */
18049 						if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18050 							panic("null pmap");
18051 						}
18052 #endif /* MACH_ASSERT */
18053 					} else {
18054 						pmap_protect(vm_map_pmap(map),
18055 						    src_entry->vme_start,
18056 						    src_entry->vme_end,
18057 						    prot);
18058 					}
18059 				}
18060 
18061 				object = VME_OBJECT(src_entry);
18062 				src_entry->needs_copy = FALSE;
18063 			}
18064 
18065 
18066 			vm_object_lock(object);
18067 			vm_object_reference_locked(object); /* object ref. for new entry */
18068 			assert(!src_entry->needs_copy);
18069 			if (object->copy_strategy ==
18070 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
18071 				/*
18072 				 * If we want to share this object (copy==0),
18073 				 * it needs to be COPY_DELAY.
18074 				 * If we want to copy this object (copy==1),
18075 				 * we can't just set "needs_copy" on our side
18076 				 * and expect the other side to do the same
18077 				 * (symmetrically), so we can't let the object
18078 				 * stay COPY_SYMMETRIC.
18079 				 * So we always switch from COPY_SYMMETRIC to
18080 				 * COPY_DELAY.
18081 				 */
18082 				object->copy_strategy =
18083 				    MEMORY_OBJECT_COPY_DELAY;
18084 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18085 			}
18086 			vm_object_unlock(object);
18087 		}
18088 
18089 		offset = (VME_OFFSET(src_entry) +
18090 		    (src_start - src_entry->vme_start));
18091 
18092 copy_src_entry:
18093 		new_entry = _vm_map_entry_create(map_header);
18094 		vm_map_entry_copy(map, new_entry, src_entry);
18095 		if (new_entry->is_sub_map) {
18096 			/* clr address space specifics */
18097 			new_entry->use_pmap = FALSE;
18098 		} else if (copy) {
18099 			/*
18100 			 * We're dealing with a copy-on-write operation,
18101 			 * so the resulting mapping should not inherit the
18102 			 * original mapping's accounting settings.
18103 			 * "use_pmap" should be reset to its default (TRUE)
18104 			 * so that the new mapping gets accounted for in
18105 			 * the task's memory footprint.
18106 			 */
18107 			new_entry->use_pmap = TRUE;
18108 		}
18109 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
18110 		assert(!new_entry->iokit_acct);
18111 
18112 		new_entry->map_aligned = FALSE;
18113 
18114 		new_entry->vme_start = map_address;
18115 		new_entry->vme_end = map_address + tmp_size;
18116 		assert(new_entry->vme_start < new_entry->vme_end);
18117 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
18118 			/* security: keep "permanent" and "csm_associated" */
18119 			new_entry->vme_permanent = src_entry->vme_permanent;
18120 			new_entry->csm_associated = src_entry->csm_associated;
18121 			/*
18122 			 * Remapping for vm_map_protect(VM_PROT_COPY)
18123 			 * to convert a read-only mapping into a
18124 			 * copy-on-write version of itself but
18125 			 * with write access:
18126 			 * keep the original inheritance but let's not
18127 			 * add VM_PROT_WRITE to the max protection yet
18128 			 * since we want to do more security checks against
18129 			 * the target map.
18130 			 */
18131 			new_entry->inheritance = src_entry->inheritance;
18132 			new_entry->protection &= max_prot_for_prot_copy;
18133 
18134 #ifdef __arm64e__
18135 			/*
18136 			 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18137 			 * region to be explicitly writable without TPRO is only permitted
18138 			 * if TPRO enforcement has been overridden.
18139 			 *
18140 			 * In this case we ensure any entries reset the TPRO state
18141 			 * and we permit the region to be downgraded from permanent.
18142 			 */
18143 			if (new_entry->used_for_tpro) {
18144 				if (vmk_flags.vmkf_tpro_enforcement_override) {
18145 					new_entry->used_for_tpro = FALSE;
18146 					new_entry->vme_permanent = FALSE;
18147 				} else {
18148 					result = KERN_PROTECTION_FAILURE;
18149 					vm_object_deallocate(object);
18150 					vm_map_entry_dispose(new_entry);
18151 					new_entry = VM_MAP_ENTRY_NULL;
18152 					break;
18153 				}
18154 			}
18155 #endif
18156 		} else {
18157 			new_entry->inheritance = inheritance;
18158 			if (!vm_remap_legacy) {
18159 				new_entry->protection = *cur_protection;
18160 				new_entry->max_protection = *max_protection;
18161 			}
18162 		}
18163 
18164 		VME_OFFSET_SET(new_entry, offset);
18165 
18166 		/*
18167 		 * The new region has to be copied now if required.
18168 		 */
18169 RestartCopy:
18170 		if (!copy) {
18171 			if (src_entry->used_for_jit == TRUE) {
18172 				if (same_map) {
18173 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18174 					/*
18175 					 * Cannot allow an entry describing a JIT
18176 					 * region to be shared across address spaces.
18177 					 */
18178 					result = KERN_INVALID_ARGUMENT;
18179 					vm_object_deallocate(object);
18180 					vm_map_entry_dispose(new_entry);
18181 					new_entry = VM_MAP_ENTRY_NULL;
18182 					break;
18183 				}
18184 			}
18185 
18186 			if (!src_entry->is_sub_map &&
18187 			    VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18188 				/* no accessible memory; nothing to share */
18189 				assert(src_entry->protection == VM_PROT_NONE);
18190 				assert(src_entry->max_protection == VM_PROT_NONE);
18191 				src_entry->is_shared = FALSE;
18192 			} else {
18193 				src_entry->is_shared = TRUE;
18194 			}
18195 			if (!new_entry->is_sub_map &&
18196 			    VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18197 				/* no accessible memory; nothing to share */
18198 				assert(new_entry->protection == VM_PROT_NONE);
18199 				assert(new_entry->max_protection == VM_PROT_NONE);
18200 				new_entry->is_shared = FALSE;
18201 			} else {
18202 				new_entry->is_shared = TRUE;
18203 			}
18204 			if (!(new_entry->is_sub_map)) {
18205 				new_entry->needs_copy = FALSE;
18206 			}
18207 		} else if (src_entry->is_sub_map) {
18208 			/* make this a COW sub_map if not already */
18209 			assert(new_entry->wired_count == 0);
18210 			new_entry->needs_copy = TRUE;
18211 			object = VM_OBJECT_NULL;
18212 		} else if (src_entry->wired_count == 0 &&
18213 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18214 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
18215 		    VME_OFFSET(new_entry),
18216 		    (new_entry->vme_end -
18217 		    new_entry->vme_start),
18218 		    &src_needs_copy,
18219 		    &new_entry_needs_copy)) {
18220 			new_entry->needs_copy = new_entry_needs_copy;
18221 			new_entry->is_shared = FALSE;
18222 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18223 
18224 			/*
18225 			 * Handle copy_on_write semantics.
18226 			 */
18227 			if (src_needs_copy && !src_entry->needs_copy) {
18228 				vm_prot_t prot;
18229 
18230 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18231 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18232 					    __FUNCTION__,
18233 					    map, map->pmap, src_entry,
18234 					    (uint64_t)src_entry->vme_start,
18235 					    (uint64_t)src_entry->vme_end,
18236 					    src_entry->protection);
18237 				}
18238 
18239 				prot = src_entry->protection & ~VM_PROT_WRITE;
18240 
18241 				if (override_nx(map,
18242 				    VME_ALIAS(src_entry))
18243 				    && prot) {
18244 					prot |= VM_PROT_EXECUTE;
18245 				}
18246 
18247 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18248 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18249 					    __FUNCTION__,
18250 					    map, map->pmap, src_entry,
18251 					    (uint64_t)src_entry->vme_start,
18252 					    (uint64_t)src_entry->vme_end,
18253 					    prot);
18254 				}
18255 
18256 				vm_object_pmap_protect(object,
18257 				    offset,
18258 				    entry_size,
18259 				    ((src_entry->is_shared
18260 				    || map->mapped_in_other_pmaps) ?
18261 				    PMAP_NULL : map->pmap),
18262 				    VM_MAP_PAGE_SIZE(map),
18263 				    src_entry->vme_start,
18264 				    prot);
18265 
18266 				assert(src_entry->wired_count == 0);
18267 				src_entry->needs_copy = TRUE;
18268 			}
18269 			/*
18270 			 * Throw away the old object reference of the new entry.
18271 			 */
18272 			vm_object_deallocate(object);
18273 		} else {
18274 			new_entry->is_shared = FALSE;
18275 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18276 
18277 			src_entry_was_wired = (src_entry->wired_count > 0);
18278 			saved_src_entry = src_entry;
18279 			src_entry = VM_MAP_ENTRY_NULL;
18280 
18281 			/*
18282 			 * The map can be safely unlocked since we
18283 			 * already hold a reference on the object.
18284 			 *
18285 			 * Record the timestamp of the map for later
18286 			 * verification, and unlock the map.
18287 			 */
18288 			version.main_timestamp = map->timestamp;
18289 			vm_map_unlock(map);     /* Increments timestamp once! */
18290 
18291 			/*
18292 			 * Perform the copy.
18293 			 */
18294 			if (src_entry_was_wired > 0 ||
18295 			    (debug4k_no_cow_copyin &&
18296 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18297 				vm_object_lock(object);
18298 				result = vm_object_copy_slowly(
18299 					object,
18300 					offset,
18301 					(new_entry->vme_end -
18302 					new_entry->vme_start),
18303 					THREAD_UNINT,
18304 					&new_copy_object);
18305 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18306 				saved_used_for_jit = new_entry->used_for_jit;
18307 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18308 				new_entry->used_for_jit = saved_used_for_jit;
18309 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18310 				new_entry->needs_copy = FALSE;
18311 			} else {
18312 				vm_object_offset_t new_offset;
18313 
18314 				new_offset = VME_OFFSET(new_entry);
18315 				result = vm_object_copy_strategically(
18316 					object,
18317 					offset,
18318 					(new_entry->vme_end -
18319 					new_entry->vme_start),
18320 					false, /* forking */
18321 					&new_copy_object,
18322 					&new_offset,
18323 					&new_entry_needs_copy);
18324 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18325 				saved_used_for_jit = new_entry->used_for_jit;
18326 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18327 				new_entry->used_for_jit = saved_used_for_jit;
18328 				if (new_offset != VME_OFFSET(new_entry)) {
18329 					VME_OFFSET_SET(new_entry, new_offset);
18330 				}
18331 
18332 				new_entry->needs_copy = new_entry_needs_copy;
18333 			}
18334 
18335 			/*
18336 			 * Throw away the old object reference of the new entry.
18337 			 */
18338 			vm_object_deallocate(object);
18339 
18340 			if (result != KERN_SUCCESS &&
18341 			    result != KERN_MEMORY_RESTART_COPY) {
18342 				vm_map_entry_dispose(new_entry);
18343 				vm_map_lock(map);
18344 				break;
18345 			}
18346 
18347 			/*
18348 			 * Verify that the map has not substantially
18349 			 * changed while the copy was being made.
18350 			 */
18351 
18352 			vm_map_lock(map);
18353 			if (version.main_timestamp + 1 != map->timestamp) {
18354 				/*
18355 				 * Simple version comparison failed.
18356 				 *
18357 				 * Retry the lookup and verify that the
18358 				 * same object/offset are still present.
18359 				 */
18360 				saved_src_entry = VM_MAP_ENTRY_NULL;
18361 				vm_object_deallocate(VME_OBJECT(new_entry));
18362 				vm_map_entry_dispose(new_entry);
18363 				if (result == KERN_MEMORY_RESTART_COPY) {
18364 					result = KERN_SUCCESS;
18365 				}
18366 				continue;
18367 			}
18368 			/* map hasn't changed: src_entry is still valid */
18369 			src_entry = saved_src_entry;
18370 			saved_src_entry = VM_MAP_ENTRY_NULL;
18371 
18372 			if (result == KERN_MEMORY_RESTART_COPY) {
18373 				vm_object_reference(object);
18374 				goto RestartCopy;
18375 			}
18376 		}
18377 
18378 		_vm_map_store_entry_link(map_header,
18379 		    map_header->links.prev, new_entry);
18380 
18381 		/* protections for submap mapping are irrelevant here */
18382 		if (vm_remap_legacy && !src_entry->is_sub_map) {
18383 			*cur_protection &= src_entry->protection;
18384 			*max_protection &= src_entry->max_protection;
18385 		}
18386 
18387 		map_address += tmp_size;
18388 		mapped_size += tmp_size;
18389 		src_start += tmp_size;
18390 
18391 		if (vmk_flags.vmkf_copy_single_object) {
18392 			if (mapped_size != size) {
18393 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18394 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18395 				if (src_entry->vme_next != vm_map_to_entry(map) &&
18396 				    src_entry->vme_next->vme_object_value ==
18397 				    src_entry->vme_object_value) {
18398 					/* XXX TODO4K */
18399 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
18400 				}
18401 			}
18402 			break;
18403 		}
18404 	} /* end while */
18405 
18406 	vm_map_unlock(map);
18407 	if (result != KERN_SUCCESS) {
18408 		/*
18409 		 * Free all allocated elements.
18410 		 */
18411 		for (src_entry = map_header->links.next;
18412 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18413 		    src_entry = new_entry) {
18414 			new_entry = src_entry->vme_next;
18415 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18416 			if (src_entry->is_sub_map) {
18417 				vm_map_deallocate(VME_SUBMAP(src_entry));
18418 			} else {
18419 				vm_object_deallocate(VME_OBJECT(src_entry));
18420 			}
18421 			vm_map_entry_dispose(src_entry);
18422 		}
18423 	}
18424 	return result;
18425 }
18426 
18427 bool
vm_map_is_exotic(vm_map_t map)18428 vm_map_is_exotic(
18429 	vm_map_t map)
18430 {
18431 	return VM_MAP_IS_EXOTIC(map);
18432 }
18433 
18434 bool
vm_map_is_alien(vm_map_t map)18435 vm_map_is_alien(
18436 	vm_map_t map)
18437 {
18438 	return VM_MAP_IS_ALIEN(map);
18439 }
18440 
18441 #if XNU_TARGET_OS_OSX
/*
 * vm_map_mark_alien:
 *	Set the map's "is_alien" flag, taking the exclusive map lock
 *	to serialize the update with other map mutations.
 *	NOTE(review): the semantics of "alien" are defined by the
 *	flag's consumers (VM_MAP_IS_ALIEN()), not visible here.
 *	Only compiled for XNU_TARGET_OS_OSX (see enclosing #if).
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
18450 
/*
 * vm_map_single_jit:
 *	Set the map's "single_jit" flag under the exclusive map lock.
 *	NOTE(review): presumably restricts the map to a single JIT
 *	region -- confirm against the flag's consumers; only the
 *	flag-setting is visible here.
 *	Only compiled for XNU_TARGET_OS_OSX (see enclosing #if).
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
18459 #endif /* XNU_TARGET_OS_OSX */
18460 
18461 
18462 
18463 /*
18464  * Callers of this function must call vm_map_copy_require on
18465  * previously created vm_map_copy_t or pass a newly created
18466  * one to ensure that it hasn't been forged.
18467  */
18468 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)18469 vm_map_copy_to_physcopy(
18470 	vm_map_copy_t   copy_map,
18471 	vm_map_t        target_map)
18472 {
18473 	vm_map_size_t           size;
18474 	vm_map_entry_t          entry;
18475 	vm_map_entry_t          new_entry;
18476 	vm_object_t             new_object;
18477 	unsigned int            pmap_flags;
18478 	pmap_t                  new_pmap;
18479 	vm_map_t                new_map;
18480 	vm_map_address_t        src_start, src_end, src_cur;
18481 	vm_map_address_t        dst_start, dst_end, dst_cur;
18482 	kern_return_t           kr;
18483 	void                    *kbuf;
18484 
18485 	/*
18486 	 * Perform the equivalent of vm_allocate() and memcpy().
18487 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18488 	 */
18489 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18490 
18491 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18492 
18493 	/* create a new pmap to map "copy_map" */
18494 	pmap_flags = 0;
18495 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18496 #if PMAP_CREATE_FORCE_4K_PAGES
18497 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18498 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18499 	pmap_flags |= PMAP_CREATE_64BIT;
18500 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18501 	if (new_pmap == NULL) {
18502 		return KERN_RESOURCE_SHORTAGE;
18503 	}
18504 
18505 	/* allocate new VM object */
18506 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18507 	new_object = vm_object_allocate(size);
18508 	assert(new_object);
18509 
18510 	/* allocate new VM map entry */
18511 	new_entry = vm_map_copy_entry_create(copy_map);
18512 	assert(new_entry);
18513 
18514 	/* finish initializing new VM map entry */
18515 	new_entry->protection = VM_PROT_DEFAULT;
18516 	new_entry->max_protection = VM_PROT_DEFAULT;
18517 	new_entry->use_pmap = TRUE;
18518 
18519 	/* make new VM map entry point to new VM object */
18520 	new_entry->vme_start = 0;
18521 	new_entry->vme_end = size;
18522 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18523 	VME_OFFSET_SET(new_entry, 0);
18524 
18525 	/* create a new pageable VM map to map "copy_map" */
18526 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18527 	    VM_MAP_CREATE_PAGEABLE);
18528 	assert(new_map);
18529 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18530 
18531 	/* map "copy_map" in the new VM map */
18532 	src_start = 0;
18533 	kr = vm_map_copyout_internal(
18534 		new_map,
18535 		&src_start,
18536 		copy_map,
18537 		copy_map->size,
18538 		FALSE, /* consume_on_success */
18539 		VM_PROT_DEFAULT,
18540 		VM_PROT_DEFAULT,
18541 		VM_INHERIT_DEFAULT);
18542 	assert(kr == KERN_SUCCESS);
18543 	src_end = src_start + copy_map->size;
18544 
18545 	/* map "new_object" in the new VM map */
18546 	vm_object_reference(new_object);
18547 	dst_start = 0;
18548 	kr = vm_map_enter(new_map,
18549 	    &dst_start,
18550 	    size,
18551 	    0,               /* mask */
18552 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18553 	    new_object,
18554 	    0,               /* offset */
18555 	    FALSE,               /* needs copy */
18556 	    VM_PROT_DEFAULT,
18557 	    VM_PROT_DEFAULT,
18558 	    VM_INHERIT_DEFAULT);
18559 	assert(kr == KERN_SUCCESS);
18560 	dst_end = dst_start + size;
18561 
18562 	/* get a kernel buffer */
18563 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18564 
18565 	/* physically copy "copy_map" mappings to new VM object */
18566 	for (src_cur = src_start, dst_cur = dst_start;
18567 	    src_cur < src_end;
18568 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18569 		vm_size_t bytes;
18570 
18571 		bytes = PAGE_SIZE;
18572 		if (src_cur + PAGE_SIZE > src_end) {
18573 			/* partial copy for last page */
18574 			bytes = src_end - src_cur;
18575 			assert(bytes > 0 && bytes < PAGE_SIZE);
18576 			/* rest of dst page should be zero-filled */
18577 		}
18578 		/* get bytes from src mapping */
18579 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18580 		if (kr != KERN_SUCCESS) {
18581 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18582 		}
18583 		/* put bytes in dst mapping */
18584 		assert(dst_cur < dst_end);
18585 		assert(dst_cur + bytes <= dst_end);
18586 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18587 		if (kr != KERN_SUCCESS) {
18588 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18589 		}
18590 	}
18591 
18592 	/* free kernel buffer */
18593 	kfree_data(kbuf, PAGE_SIZE);
18594 
18595 	/* destroy new map */
18596 	vm_map_destroy(new_map);
18597 	new_map = VM_MAP_NULL;
18598 
18599 	/* dispose of the old map entries in "copy_map" */
18600 	while (vm_map_copy_first_entry(copy_map) !=
18601 	    vm_map_copy_to_entry(copy_map)) {
18602 		entry = vm_map_copy_first_entry(copy_map);
18603 		vm_map_copy_entry_unlink(copy_map, entry);
18604 		if (entry->is_sub_map) {
18605 			vm_map_deallocate(VME_SUBMAP(entry));
18606 		} else {
18607 			vm_object_deallocate(VME_OBJECT(entry));
18608 		}
18609 		vm_map_copy_entry_dispose(entry);
18610 	}
18611 
18612 	/* change "copy_map"'s page_size to match "target_map" */
18613 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18614 	copy_map->offset = 0;
18615 	copy_map->size = size;
18616 
18617 	/* insert new map entry in "copy_map" */
18618 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18619 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18620 
18621 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18622 	return KERN_SUCCESS;
18623 }
18624 
void
vm_map_copy_adjust_get_target_copy_map(
	vm_map_copy_t   copy_map,
	vm_map_copy_t   *target_copy_map_p);
/*
 * vm_map_copy_adjust_get_target_copy_map:
 *
 * Ensure the caller has a "target_copy_map" that can be modified without
 * disturbing the original "copy_map".
 * If *target_copy_map_p is already set, it is left untouched.
 * Otherwise, a new VM_MAP_COPY_ENTRY_LIST copy is allocated, each entry of
 * "copy_map" is duplicated into it (taking an extra reference on each
 * entry's submap or VM object, since both copies now point to them), and
 * the new copy is returned through *target_copy_map_p.
 * Ownership of the new copy (and its extra references) transfers to the
 * caller, who must eventually discard it.
 */
void
vm_map_copy_adjust_get_target_copy_map(
	vm_map_copy_t   copy_map,
	vm_map_copy_t   *target_copy_map_p)
{
	vm_map_copy_t   target_copy_map;
	vm_map_entry_t  entry, target_entry;

	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
		/* the caller already has a "target_copy_map": use it */
		return;
	}

	/* the caller wants us to create a new copy of "copy_map" */
	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	target_copy_map = vm_map_copy_allocate(copy_map->type);
	/* mirror the source copy's offset, size and page geometry */
	target_copy_map->offset = copy_map->offset;
	target_copy_map->size = copy_map->size;
	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = entry->vme_next) {
		target_entry = vm_map_copy_entry_create(target_copy_map);
		vm_map_entry_copy_full(target_entry, entry);
		/*
		 * The duplicated entry points at the same backing submap
		 * or VM object: take an extra reference on it.
		 */
		if (target_entry->is_sub_map) {
			vm_map_reference(VME_SUBMAP(target_entry));
		} else {
			vm_object_reference(VME_OBJECT(target_entry));
		}
		/* append in order, preserving the source entry list order */
		vm_map_copy_entry_link(
			target_copy_map,
			vm_map_copy_last_entry(target_copy_map),
			target_entry);
	}
	entry = VM_MAP_ENTRY_NULL;
	*target_copy_map_p = target_copy_map;
}
18666 
18667 /*
18668  * Callers of this function must call vm_map_copy_require on
18669  * previously created vm_map_copy_t or pass a newly created
18670  * one to ensure that it hasn't been forged.
18671  */
18672 static void
vm_map_copy_trim(vm_map_copy_t copy_map,uint16_t new_page_shift,vm_map_offset_t trim_start,vm_map_offset_t trim_end)18673 vm_map_copy_trim(
18674 	vm_map_copy_t   copy_map,
18675 	uint16_t        new_page_shift,
18676 	vm_map_offset_t trim_start,
18677 	vm_map_offset_t trim_end)
18678 {
18679 	uint16_t        copy_page_shift;
18680 	vm_map_entry_t  entry, next_entry;
18681 
18682 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18683 	assert(copy_map->cpy_hdr.nentries > 0);
18684 
18685 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18686 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18687 
18688 	/* use the new page_shift to do the clipping */
18689 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18690 	copy_map->cpy_hdr.page_shift = new_page_shift;
18691 
18692 	for (entry = vm_map_copy_first_entry(copy_map);
18693 	    entry != vm_map_copy_to_entry(copy_map);
18694 	    entry = next_entry) {
18695 		next_entry = entry->vme_next;
18696 		if (entry->vme_end <= trim_start) {
18697 			/* entry fully before trim range: skip */
18698 			continue;
18699 		}
18700 		if (entry->vme_start >= trim_end) {
18701 			/* entry fully after trim range: done */
18702 			break;
18703 		}
18704 		/* clip entry if needed */
18705 		vm_map_copy_clip_start(copy_map, entry, trim_start);
18706 		vm_map_copy_clip_end(copy_map, entry, trim_end);
18707 		/* dispose of entry */
18708 		copy_map->size -= entry->vme_end - entry->vme_start;
18709 		vm_map_copy_entry_unlink(copy_map, entry);
18710 		if (entry->is_sub_map) {
18711 			vm_map_deallocate(VME_SUBMAP(entry));
18712 		} else {
18713 			vm_object_deallocate(VME_OBJECT(entry));
18714 		}
18715 		vm_map_copy_entry_dispose(entry);
18716 		entry = VM_MAP_ENTRY_NULL;
18717 	}
18718 
18719 	/* restore copy_map's original page_shift */
18720 	copy_map->cpy_hdr.page_shift = copy_page_shift;
18721 }
18722 
18723 /*
18724  * Make any necessary adjustments to "copy_map" to allow it to be
18725  * mapped into "target_map".
18726  * If no changes were necessary, "target_copy_map" points to the
18727  * untouched "copy_map".
18728  * If changes are necessary, changes will be made to "target_copy_map".
18729  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18730  * copy the original "copy_map" to it before applying the changes.
18731  * The caller should discard "target_copy_map" if it's not the same as
18732  * the original "copy_map".
18733  */
18734 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18735 kern_return_t
vm_map_copy_adjust_to_target(vm_map_copy_t src_copy_map,vm_map_offset_ut offset_u,vm_map_size_ut size_u,vm_map_t target_map,boolean_t copy,vm_map_copy_t * target_copy_map_p,vm_map_offset_t * overmap_start_p,vm_map_offset_t * overmap_end_p,vm_map_offset_t * trimmed_start_p)18736 vm_map_copy_adjust_to_target(
18737 	vm_map_copy_t           src_copy_map,
18738 	vm_map_offset_ut        offset_u,
18739 	vm_map_size_ut          size_u,
18740 	vm_map_t                target_map,
18741 	boolean_t               copy,
18742 	vm_map_copy_t           *target_copy_map_p,
18743 	vm_map_offset_t         *overmap_start_p,
18744 	vm_map_offset_t         *overmap_end_p,
18745 	vm_map_offset_t         *trimmed_start_p)
18746 {
18747 	vm_map_copy_t           copy_map, target_copy_map;
18748 	vm_map_size_t           target_size;
18749 	vm_map_size_t           src_copy_map_size;
18750 	vm_map_size_t           overmap_start, overmap_end;
18751 	int                     misalignments;
18752 	vm_map_entry_t          entry, target_entry;
18753 	vm_map_offset_t         addr_adjustment;
18754 	vm_map_offset_t         new_start, new_end;
18755 	int                     copy_page_mask, target_page_mask;
18756 	uint16_t                copy_page_shift, target_page_shift;
18757 	vm_map_offset_t         trimmed_end;
18758 	vm_map_size_t           map_size;
18759 	kern_return_t           kr;
18760 
18761 	/*
18762 	 * Sanitize any input parameters that are addr/size/prot/inherit
18763 	 */
18764 	kr = vm_map_copy_addr_size_sanitize(
18765 		target_map,
18766 		offset_u,
18767 		size_u,
18768 		VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18769 		&new_start,
18770 		&new_end,
18771 		&map_size);
18772 	if (__improbable(kr != KERN_SUCCESS)) {
18773 		return vm_sanitize_get_kr(kr);
18774 	}
18775 
18776 	/*
18777 	 * Assert that the vm_map_copy is coming from the right
18778 	 * zone and hasn't been forged
18779 	 */
18780 	vm_map_copy_require(src_copy_map);
18781 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18782 
18783 	/*
18784 	 * Start working with "src_copy_map" but we'll switch
18785 	 * to "target_copy_map" as soon as we start making adjustments.
18786 	 */
18787 	copy_map = src_copy_map;
18788 	src_copy_map_size = src_copy_map->size;
18789 
18790 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18791 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18792 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18793 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18794 
18795 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18796 
18797 	target_copy_map = *target_copy_map_p;
18798 	if (target_copy_map != VM_MAP_COPY_NULL) {
18799 		vm_map_copy_require(target_copy_map);
18800 	}
18801 
18802 	if (new_end > copy_map->size) {
18803 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18804 		return KERN_INVALID_ARGUMENT;
18805 	}
18806 
18807 	/* trim the end */
18808 	trimmed_end = 0;
18809 	new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18810 	if (new_end < copy_map->size) {
18811 		trimmed_end = src_copy_map_size - new_end;
18812 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18813 		/* get "target_copy_map" if needed and adjust it */
18814 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18815 		    &target_copy_map);
18816 		copy_map = target_copy_map;
18817 		vm_map_copy_trim(target_copy_map, target_page_shift,
18818 		    new_end, copy_map->size);
18819 	}
18820 
18821 	/* trim the start */
18822 	new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18823 	if (new_start != 0) {
18824 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18825 		/* get "target_copy_map" if needed and adjust it */
18826 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18827 		    &target_copy_map);
18828 		copy_map = target_copy_map;
18829 		vm_map_copy_trim(target_copy_map, target_page_shift,
18830 		    0, new_start);
18831 	}
18832 	*trimmed_start_p = new_start;
18833 
18834 	/* target_size starts with what's left after trimming */
18835 	target_size = copy_map->size;
18836 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18837 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18838 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
18839 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18840 
18841 	/* check for misalignments but don't adjust yet */
18842 	misalignments = 0;
18843 	overmap_start = 0;
18844 	overmap_end = 0;
18845 	if (copy_page_shift < target_page_shift) {
18846 		/*
18847 		 * Remapping from 4K to 16K: check the VM object alignments
18848 		 * throughout the range.
18849 		 * If the start and end of the range are mis-aligned, we can
18850 		 * over-map to re-align, and adjust the "overmap" start/end
18851 		 * and "target_size" of the range accordingly.
18852 		 * If there is any mis-alignment within the range:
18853 		 *     if "copy":
18854 		 *         we can do immediate-copy instead of copy-on-write,
18855 		 *     else:
18856 		 *         no way to remap and share; fail.
18857 		 */
18858 		for (entry = vm_map_copy_first_entry(copy_map);
18859 		    entry != vm_map_copy_to_entry(copy_map);
18860 		    entry = entry->vme_next) {
18861 			vm_object_offset_t object_offset_start, object_offset_end;
18862 
18863 			object_offset_start = VME_OFFSET(entry);
18864 			object_offset_end = object_offset_start;
18865 			object_offset_end += entry->vme_end - entry->vme_start;
18866 			if (object_offset_start & target_page_mask) {
18867 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18868 					overmap_start++;
18869 				} else {
18870 					misalignments++;
18871 				}
18872 			}
18873 			if (object_offset_end & target_page_mask) {
18874 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18875 					overmap_end++;
18876 				} else {
18877 					misalignments++;
18878 				}
18879 			}
18880 		}
18881 	}
18882 	entry = VM_MAP_ENTRY_NULL;
18883 
18884 	/* decide how to deal with misalignments */
18885 	assert(overmap_start <= 1);
18886 	assert(overmap_end <= 1);
18887 	if (!overmap_start && !overmap_end && !misalignments) {
18888 		/* copy_map is properly aligned for target_map ... */
18889 		if (*trimmed_start_p) {
18890 			/* ... but we trimmed it, so still need to adjust */
18891 		} else {
18892 			/* ... and we didn't trim anything: we're done */
18893 			if (target_copy_map == VM_MAP_COPY_NULL) {
18894 				target_copy_map = copy_map;
18895 			}
18896 			*target_copy_map_p = target_copy_map;
18897 			*overmap_start_p = 0;
18898 			*overmap_end_p = 0;
18899 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18900 			return KERN_SUCCESS;
18901 		}
18902 	} else if (misalignments && !copy) {
18903 		/* can't "share" if misaligned */
18904 		DEBUG4K_ADJUST("unsupported sharing\n");
18905 #if MACH_ASSERT
18906 		if (debug4k_panic_on_misaligned_sharing) {
18907 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18908 		}
18909 #endif /* MACH_ASSERT */
18910 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18911 		return KERN_NOT_SUPPORTED;
18912 	} else {
18913 		/* can't virtual-copy if misaligned (but can physical-copy) */
18914 		DEBUG4K_ADJUST("mis-aligned copying\n");
18915 	}
18916 
18917 	/* get a "target_copy_map" if needed and switch to it */
18918 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18919 	copy_map = target_copy_map;
18920 
18921 	if (misalignments && copy) {
18922 		vm_map_size_t target_copy_map_size;
18923 
18924 		/*
18925 		 * Can't do copy-on-write with misaligned mappings.
18926 		 * Replace the mappings with a physical copy of the original
18927 		 * mappings' contents.
18928 		 */
18929 		target_copy_map_size = target_copy_map->size;
18930 		kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18931 		if (kr != KERN_SUCCESS) {
18932 			return kr;
18933 		}
18934 		*target_copy_map_p = target_copy_map;
18935 		*overmap_start_p = 0;
18936 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
18937 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18938 		return KERN_SUCCESS;
18939 	}
18940 
18941 	/* apply the adjustments */
18942 	misalignments = 0;
18943 	overmap_start = 0;
18944 	overmap_end = 0;
18945 	/* remove copy_map->offset, so that everything starts at offset 0 */
18946 	addr_adjustment = copy_map->offset;
18947 	/* also remove whatever we trimmed from the start */
18948 	addr_adjustment += *trimmed_start_p;
18949 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
18950 	    target_entry != vm_map_copy_to_entry(target_copy_map);
18951 	    target_entry = target_entry->vme_next) {
18952 		vm_object_offset_t object_offset_start, object_offset_end;
18953 
18954 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18955 		object_offset_start = VME_OFFSET(target_entry);
18956 		if (object_offset_start & target_page_mask) {
18957 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18958 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18959 				/*
18960 				 * start of 1st entry is mis-aligned:
18961 				 * re-adjust by over-mapping.
18962 				 */
18963 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18964 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18965 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18966 			} else {
18967 				misalignments++;
18968 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18969 				assert(copy);
18970 			}
18971 		}
18972 
18973 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18974 			target_size += overmap_start;
18975 		} else {
18976 			target_entry->vme_start += overmap_start;
18977 		}
18978 		target_entry->vme_end += overmap_start;
18979 
18980 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18981 		if (object_offset_end & target_page_mask) {
18982 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18983 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18984 				/*
18985 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
18986 				 */
18987 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18988 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18989 				target_entry->vme_end += overmap_end;
18990 				target_size += overmap_end;
18991 			} else {
18992 				misalignments++;
18993 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18994 				assert(copy);
18995 			}
18996 		}
18997 		target_entry->vme_start -= addr_adjustment;
18998 		target_entry->vme_end -= addr_adjustment;
18999 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19000 	}
19001 
19002 	target_copy_map->size = target_size;
19003 	target_copy_map->offset += overmap_start;
19004 	target_copy_map->offset -= addr_adjustment;
19005 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
19006 
19007 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19008 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19009 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19010 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19011 
19012 	*target_copy_map_p = target_copy_map;
19013 	*overmap_start_p = overmap_start;
19014 	*overmap_end_p = overmap_end;
19015 
19016 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19017 	return KERN_SUCCESS;
19018 }
19019 
/*
 * vm_map_range_physical_size:
 *
 * Compute, in *phys_size, the size that the range [start, start+size) of
 * "map" would occupy once aligned to the kernel's physical page size.
 * For maps already using the native PAGE_SIZE, this is just the range
 * rounded out to the map's page boundaries.  For maps with a different
 * page size (e.g. 4K process on a 16K kernel), the range is extracted
 * into a transient vm_map_copy and adjusted to the kernel_map's page
 * geometry to account for over-mapping at either end.
 * Returns KERN_SUCCESS with *phys_size set, or an error with
 * *phys_size = 0.
 */
kern_return_t
vm_map_range_physical_size(
	vm_map_t         map,
	vm_map_address_t start,
	mach_vm_size_t   size,
	mach_vm_size_t * phys_size)
{
	kern_return_t   kr;
	vm_map_copy_t   copy_map, target_copy_map;
	vm_map_offset_t adjusted_start, adjusted_end;
	vm_map_size_t   adjusted_size;
	vm_prot_t       cur_prot, max_prot;
	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
	vm_map_kernel_flags_t vmk_flags;

	if (size == 0) {
		/* empty range: physical size is trivially 0 */
		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
		*phys_size = 0;
		return KERN_SUCCESS;
	}

	/* round the range out to the map's own page boundaries */
	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
	if (__improbable(os_add_overflow(start, size, &end) ||
	    adjusted_end <= adjusted_start)) {
		/* wraparound */
		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
		*phys_size = 0;
		return KERN_INVALID_ARGUMENT;
	}
	if (__improbable(vm_map_range_overflows(map, start, size))) {
		*phys_size = 0;
		return KERN_INVALID_ADDRESS;
	}
	assert(adjusted_end > adjusted_start);
	adjusted_size = adjusted_end - adjusted_start;
	*phys_size = adjusted_size;
	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
		/* map already uses the native page size: no adjustment needed */
		return KERN_SUCCESS;
	}
	if (start == 0) {
		/*
		 * Range starts at address 0: just re-round with the native
		 * PAGE_MASK instead of extracting a copy of the range.
		 */
		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
		if (__improbable(adjusted_end <= adjusted_start)) {
			/* wraparound */
			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
			*phys_size = 0;
			return KERN_INVALID_ARGUMENT;
		}
		assert(adjusted_end > adjusted_start);
		adjusted_size = adjusted_end - adjusted_start;
		*phys_size = adjusted_size;
		return KERN_SUCCESS;
	}

	/* extract the range (shared, not copied) to measure its mappings */
	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_copy_pageable = TRUE;
	vmk_flags.vmkf_copy_same_map = TRUE;
	assert(adjusted_size != 0);
	cur_prot = VM_PROT_NONE; /* legacy mode */
	max_prot = VM_PROT_NONE; /* legacy mode */
	vmk_flags.vmkf_remap_legacy_mode = true;
	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
	    FALSE /* copy */,
	    &copy_map,
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
		return kr;
	}
	assert(copy_map != VM_MAP_COPY_NULL);
	target_copy_map = copy_map;
	DEBUG4K_ADJUST("adjusting...\n");
	/* adjust to the kernel_map's (native) page size to get the real footprint */
	kr = vm_map_copy_adjust_to_target(
		copy_map,
		start - adjusted_start, /* offset */
		size, /* size */
		kernel_map,
		FALSE,                          /* copy */
		&target_copy_map,
		&overmap_start,
		&overmap_end,
		&trimmed_start);
	if (kr == KERN_SUCCESS) {
		if (target_copy_map->size != *phys_size) {
			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
		}
		*phys_size = target_copy_map->size;
	} else {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
	}
	/*
	 * Discard the transient copy.  NOTE(review): if
	 * vm_map_copy_adjust_to_target replaced "target_copy_map" with a new
	 * copy, discarding only "copy_map" here relies on the adjusted copy
	 * being the same object in the non-copy path — confirm against
	 * vm_map_copy_adjust_to_target's contract.
	 */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return kr;
}
19121 
/*
 * vm_map_remap_sanitize:
 *
 * Validate and unwrap all of vm_map_remap()'s unsafe ("_ut") parameters
 * into trusted values: inheritance, current/max protections, alignment
 * mask, source address range (memory_address/memory_end/memory_size) and
 * target address.  Returns KERN_SUCCESS with all out-parameters set, or
 * the first sanitization error encountered (out-parameters from later
 * stages are then left unset).
 */
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vm_map_remap_sanitize(
	vm_map_t                src_map,
	vm_map_t                target_map,
	vm_map_address_ut       address_u,
	vm_map_size_ut          size_u,
	vm_map_offset_ut        mask_u,
	vm_map_offset_ut        memory_address_u,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_address_t       *target_addr,
	vm_map_address_t       *mask,
	vm_map_offset_t        *memory_address,
	vm_map_offset_t        *memory_end,
	vm_map_size_t          *memory_size,
	vm_prot_t              *cur_protection,
	vm_prot_t              *max_protection,
	vm_inherit_t           *inheritance)
{
	kern_return_t           result;
	vm_sanitize_flags_t     vm_sanitize_flags;

	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
	    inheritance);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	/* cur and max protections are validated together against target_map */
	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
	    VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
	    cur_protection, max_protection);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	/*
	 * If the user is requesting that we return the address of the
	 * first byte of the data (rather than the base of the page),
	 * then we use different rounding semantics: specifically,
	 * we assume that (memory_address, size) describes a region
	 * all of whose pages we must cover, rather than a base to be truncated
	 * down and a size to be added to that base.  So we figure out
	 * the highest page that the requested region includes and make
	 * sure that the size will cover it.
	 *
	 * The key example we're worried about it is of the form:
	 *
	 *              memory_address = 0x1ff0, size = 0x20
	 *
	 * With the old semantics, we round down the memory_address to 0x1000
	 * and round up the size to 0x1000, resulting in our covering *only*
	 * page 0x1000.  With the new semantics, we'd realize that the region covers
	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
	 * 0x1000 and page 0x2000 in the region we remap.
	 *
	 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
	 */
	vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
	if (!vmk_flags.vmf_return_data_addr) {
		vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
	}

	/* source range is validated against src_map's page geometry */
	result = vm_sanitize_addr_size(memory_address_u, size_u,
	    VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
	    vm_sanitize_flags, memory_address, memory_end,
	    memory_size);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	/* target address only needs page alignment for target_map */
	*target_addr = vm_sanitize_addr(target_map, address_u);
	return KERN_SUCCESS;
}
19203 
19204 /*
19205  *	Routine:	vm_remap
19206  *
19207  *			Map portion of a task's address space.
19208  *			Mapped region must not overlap more than
19209  *			one vm memory object. Protections and
19210  *			inheritance attributes remain the same
19211  *			as in the original task and are	out parameters.
19212  *			Source and Target task can be identical
19213  *			Other attributes are identical as for vm_map()
19214  */
19215 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_ut * address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_ut memory_address_u,boolean_t copy,vm_prot_ut * cur_protection_u,vm_prot_ut * max_protection_u,vm_inherit_ut inheritance_u)19216 vm_map_remap(
19217 	vm_map_t                target_map,
19218 	vm_map_address_ut      *address_u,
19219 	vm_map_size_ut          size_u,
19220 	vm_map_offset_ut        mask_u,
19221 	vm_map_kernel_flags_t   vmk_flags,
19222 	vm_map_t                src_map,
19223 	vm_map_offset_ut        memory_address_u,
19224 	boolean_t               copy,
19225 	vm_prot_ut             *cur_protection_u, /* IN/OUT */
19226 	vm_prot_ut             *max_protection_u, /* IN/OUT */
19227 	vm_inherit_ut           inheritance_u)
19228 {
19229 	vm_map_address_t        target_addr, mask;
19230 	vm_map_size_t           target_size;
19231 	vm_map_offset_t         memory_address, memory_end;
19232 	vm_map_size_t           memory_size;
19233 	vm_prot_t               cur_protection, max_protection;
19234 	vm_inherit_t            inheritance;
19235 	kern_return_t           result;
19236 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
19237 	vm_map_copy_t           copy_map;
19238 	vm_map_offset_t         offset_in_mapping;
19239 	vm_map_size_t           src_page_mask, target_page_mask;
19240 	vm_map_size_t           initial_size;
19241 	VM_MAP_ZAP_DECLARE(zap_list);
19242 
19243 	if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19244 		return KERN_INVALID_ARGUMENT;
19245 	}
19246 	src_page_mask    = VM_MAP_PAGE_MASK(src_map);
19247 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
19248 
19249 	if (src_page_mask != target_page_mask) {
19250 		if (copy) {
19251 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19252 		} else {
19253 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19254 		}
19255 	}
19256 
19257 	/*
19258 	 * Sanitize any input parameters that are addr/size/prot/inherit
19259 	 */
19260 	result = vm_map_remap_sanitize(src_map,
19261 	    target_map,
19262 	    *address_u,
19263 	    size_u,
19264 	    mask_u,
19265 	    memory_address_u,
19266 	    *cur_protection_u,
19267 	    *max_protection_u,
19268 	    inheritance_u,
19269 	    vmk_flags,
19270 	    &target_addr,
19271 	    &mask,
19272 	    &memory_address,
19273 	    &memory_end,
19274 	    &memory_size,
19275 	    &cur_protection,
19276 	    &max_protection,
19277 	    &inheritance);
19278 	if (__improbable(result != KERN_SUCCESS)) {
19279 		return vm_sanitize_get_kr(result);
19280 	}
19281 
19282 	if (vmk_flags.vmf_return_data_addr) {
19283 		/*
19284 		 * This is safe to unwrap now that the quantities
19285 		 * have been validated and rounded up normally.
19286 		 */
19287 		offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19288 		    memory_address_u);
19289 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19290 	} else {
19291 		/*
19292 		 * IMPORTANT:
19293 		 * This legacy code path is broken: for the range mentioned
19294 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19295 		 * two 4k pages, it yields [ memory_address = 0x1000,
19296 		 * size = 0x1000 ], which covers only the first 4k page.
19297 		 * BUT some code unfortunately depends on this bug, so we
19298 		 * can't fix it without breaking something.
19299 		 * New code should get automatically opted in the new
19300 		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
19301 		 */
19302 		offset_in_mapping = 0;
19303 		initial_size = memory_size;
19304 	}
19305 
19306 	if (vmk_flags.vmf_resilient_media) {
19307 		/* must be copy-on-write to be "media resilient" */
19308 		if (!copy) {
19309 			return KERN_INVALID_ARGUMENT;
19310 		}
19311 	}
19312 
19313 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19314 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19315 
19316 	assert(memory_size != 0);
19317 	result = vm_map_copy_extract(src_map,
19318 	    memory_address,
19319 	    memory_size,
19320 	    copy, &copy_map,
19321 	    &cur_protection, /* IN/OUT */
19322 	    &max_protection, /* IN/OUT */
19323 	    inheritance,
19324 	    vmk_flags);
19325 	if (result != KERN_SUCCESS) {
19326 		return result;
19327 	}
19328 	assert(copy_map != VM_MAP_COPY_NULL);
19329 
19330 	/*
19331 	 * Handle the policy for vm map ranges
19332 	 *
19333 	 * If the maps differ, the target_map policy applies like for vm_map()
19334 	 * For same mapping remaps, we preserve the range.
19335 	 */
19336 	if (vmk_flags.vmkf_copy_same_map) {
19337 		vmk_flags.vmkf_range_id = copy_map->orig_range;
19338 	} else {
19339 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19340 	}
19341 
19342 	target_size = memory_size;
19343 	if (src_page_mask != target_page_mask) {
19344 		vm_map_copy_t   target_copy_map;
19345 		vm_map_offset_t overmap_start = 0;
19346 		vm_map_offset_t overmap_end   = 0;
19347 		vm_map_offset_t trimmed_start = 0;
19348 
19349 		target_copy_map = copy_map; /* can modify "copy_map" itself */
19350 		DEBUG4K_ADJUST("adjusting...\n");
19351 		result = vm_map_copy_adjust_to_target(
19352 			copy_map,
19353 			offset_in_mapping, /* offset */
19354 			initial_size,
19355 			target_map,
19356 			copy,
19357 			&target_copy_map,
19358 			&overmap_start,
19359 			&overmap_end,
19360 			&trimmed_start);
19361 		if (result != KERN_SUCCESS) {
19362 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19363 			vm_map_copy_discard(copy_map);
19364 			return result;
19365 		}
19366 		if (trimmed_start == 0) {
19367 			/* nothing trimmed: no adjustment needed */
19368 		} else if (trimmed_start >= offset_in_mapping) {
19369 			/* trimmed more than offset_in_mapping: nothing left */
19370 			assert(overmap_start == 0);
19371 			assert(overmap_end == 0);
19372 			offset_in_mapping = 0;
19373 		} else {
19374 			/* trimmed some of offset_in_mapping: adjust */
19375 			assert(overmap_start == 0);
19376 			assert(overmap_end == 0);
19377 			offset_in_mapping -= trimmed_start;
19378 		}
19379 		offset_in_mapping += overmap_start;
19380 		target_size = target_copy_map->size;
19381 	}
19382 
19383 	/*
19384 	 * Allocate/check a range of free virtual address
19385 	 * space for the target
19386 	 */
19387 	target_size = vm_map_round_page(target_size, target_page_mask);
19388 
19389 	if (target_size == 0) {
19390 		vm_map_copy_discard(copy_map);
19391 		return KERN_INVALID_ARGUMENT;
19392 	}
19393 
19394 	vm_map_lock(target_map);
19395 
19396 	if (!vmk_flags.vmf_fixed) {
19397 		result = vm_map_locate_space_anywhere(target_map, target_size,
19398 		    mask, vmk_flags, &target_addr, &insp_entry);
19399 	} else {
19400 		/*
19401 		 * vm_map_locate_space_fixed will reject overflowing
19402 		 * target_addr + target_size values
19403 		 */
19404 		result = vm_map_locate_space_fixed(target_map, target_addr,
19405 		    target_size, mask, vmk_flags, &insp_entry, &zap_list);
19406 
19407 		if (result == KERN_MEMORY_PRESENT) {
19408 			assert(!vmk_flags.vmkf_already);
19409 			insp_entry = VM_MAP_ENTRY_NULL;
19410 			result = KERN_NO_SPACE;
19411 		}
19412 	}
19413 
19414 	if (result == KERN_SUCCESS) {
19415 		while (vm_map_copy_first_entry(copy_map) !=
19416 		    vm_map_copy_to_entry(copy_map)) {
19417 			vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19418 
19419 			vm_map_copy_entry_unlink(copy_map, entry);
19420 
19421 			if (vmk_flags.vmkf_remap_prot_copy) {
19422 				/*
19423 				 * This vm_map_remap() is for a
19424 				 * vm_protect(VM_PROT_COPY), so the caller
19425 				 * expects to be allowed to add write access
19426 				 * to this new mapping.  This is done by
19427 				 * adding VM_PROT_WRITE to each entry's
19428 				 * max_protection... unless some security
19429 				 * settings disallow it.
19430 				 */
19431 				bool allow_write = false;
19432 				if (entry->vme_permanent) {
19433 					/* immutable mapping... */
19434 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
19435 					    developer_mode_state()) {
19436 						/*
19437 						 * ... but executable and
19438 						 * possibly being debugged,
19439 						 * so let's allow it to become
19440 						 * writable, for breakpoints
19441 						 * and dtrace probes, for
19442 						 * example.
19443 						 */
19444 						allow_write = true;
19445 					} else {
19446 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19447 						    proc_selfpid(),
19448 						    (get_bsdtask_info(current_task())
19449 						    ? proc_name_address(get_bsdtask_info(current_task()))
19450 						    : "?"),
19451 						    (uint64_t)memory_address,
19452 						    (uint64_t)memory_size,
19453 						    entry->protection,
19454 						    entry->max_protection,
19455 						    developer_mode_state());
19456 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19457 						    vm_map_entry_t, entry,
19458 						    vm_map_offset_t, entry->vme_start,
19459 						    vm_map_offset_t, entry->vme_end,
19460 						    vm_prot_t, entry->protection,
19461 						    vm_prot_t, entry->max_protection,
19462 						    int, VME_ALIAS(entry));
19463 					}
19464 				} else {
19465 					allow_write = true;
19466 				}
19467 
19468 				/*
19469 				 * VM_PROT_COPY: allow this mapping to become
19470 				 * writable, unless it was "permanent".
19471 				 */
19472 				if (allow_write) {
19473 					entry->max_protection |= VM_PROT_WRITE;
19474 				}
19475 			}
19476 			if (vmk_flags.vmf_resilient_codesign) {
19477 				/* no codesigning -> read-only access */
19478 				entry->max_protection = VM_PROT_READ;
19479 				entry->protection = VM_PROT_READ;
19480 				entry->vme_resilient_codesign = TRUE;
19481 			}
19482 			entry->vme_start += target_addr;
19483 			entry->vme_end += target_addr;
19484 			assert(!entry->map_aligned);
19485 			if (vmk_flags.vmf_resilient_media &&
19486 			    !entry->is_sub_map &&
19487 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19488 			    VME_OBJECT(entry)->internal)) {
19489 				entry->vme_resilient_media = TRUE;
19490 			}
19491 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19492 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19493 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19494 			vm_map_store_entry_link(target_map, insp_entry, entry,
19495 			    vmk_flags);
19496 			insp_entry = entry;
19497 		}
19498 	}
19499 
19500 	if (vmk_flags.vmf_resilient_codesign) {
19501 		cur_protection = VM_PROT_READ;
19502 		max_protection = VM_PROT_READ;
19503 	}
19504 
19505 	if (result == KERN_SUCCESS) {
19506 		target_map->size += target_size;
19507 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19508 	}
19509 	vm_map_unlock(target_map);
19510 
19511 	vm_map_zap_dispose(&zap_list);
19512 
19513 	if (result == KERN_SUCCESS && target_map->wiring_required) {
19514 		result = vm_map_wire_nested(target_map, target_addr,
19515 		    target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19516 		    TRUE, PMAP_NULL, 0, NULL);
19517 	}
19518 
19519 	if (result == KERN_SUCCESS) {
19520 #if KASAN
19521 		if (target_map->pmap == kernel_pmap) {
19522 			kasan_notify_address(target_addr, target_size);
19523 		}
19524 #endif
19525 		/*
19526 		 * If requested, return the address of the data pointed to by the
19527 		 * request, rather than the base of the resulting page.
19528 		 */
19529 		if (vmk_flags.vmf_return_data_addr) {
19530 			target_addr += offset_in_mapping;
19531 		}
19532 
19533 		/*
19534 		 * Update OUT parameters.
19535 		 */
19536 		*address_u = vm_sanitize_wrap_addr(target_addr);
19537 
19538 		*cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19539 		*max_protection_u = vm_sanitize_wrap_prot(max_protection);
19540 	}
19541 
19542 	if (src_page_mask != target_page_mask) {
19543 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19544 	}
19545 	vm_map_copy_discard(copy_map);
19546 	copy_map = VM_MAP_COPY_NULL;
19547 
19548 	return result;
19549 }
19550 
19551 /*
19552  *	vm_map_switch:
19553  *
19554  *	Set the address map for the current thread to the specified map
19555  */
19556 
19557 vm_map_t
vm_map_switch(vm_map_t map)19558 vm_map_switch(
19559 	vm_map_t        map)
19560 {
19561 	thread_t        thread = current_thread();
19562 	vm_map_t        oldmap = thread->map;
19563 
19564 
19565 	/*
19566 	 *	Deactivate the current map and activate the requested map
19567 	 */
19568 	mp_disable_preemption();
19569 	PMAP_SWITCH_USER(thread, map, cpu_number());
19570 	mp_enable_preemption();
19571 	return oldmap;
19572 }
19573 
19574 static __attribute__((always_inline, warn_unused_result))
19575 kern_return_t
vm_map_rw_user_sanitize(vm_map_t map,vm_map_address_ut addr_u,vm_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_address_t * addr,vm_map_address_t * end,vm_map_size_t * size)19576 vm_map_rw_user_sanitize(
19577 	vm_map_t                map,
19578 	vm_map_address_ut       addr_u,
19579 	vm_size_ut              size_u,
19580 	vm_sanitize_caller_t    vm_sanitize_caller,
19581 	vm_map_address_t       *addr,
19582 	vm_map_address_t       *end,
19583 	vm_map_size_t          *size)
19584 {
19585 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19586 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
19587 
19588 
19589 	return vm_sanitize_addr_size(addr_u, size_u,
19590 	           vm_sanitize_caller, map,
19591 	           flags,
19592 	           addr, end, size);
19593 }
19594 
19595 /*
19596  *	Routine:	vm_map_write_user
19597  *
19598  *	Description:
19599  *		Copy out data from a kernel space into space in the
19600  *		destination map. The space must already exist in the
19601  *		destination map.
19602  *		NOTE:  This routine should only be called by threads
19603  *		which can block on a page fault. i.e. kernel mode user
19604  *		threads.
19605  *
19606  */
19607 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_ut dst_addr_u,vm_size_ut size_u)19608 vm_map_write_user(
19609 	vm_map_t                map,
19610 	void                   *src_p,
19611 	vm_map_address_ut       dst_addr_u,
19612 	vm_size_ut              size_u)
19613 {
19614 	kern_return_t    kr;
19615 	vm_map_address_t dst_addr, dst_end;
19616 	vm_map_size_t    size;
19617 
19618 	/*
19619 	 * src_p isn't validated: [src_p, src_p + size_u)
19620 	 * is trusted kernel input.
19621 	 *
19622 	 * dst_addr_u and size_u are untrusted and need to be sanitized.
19623 	 */
19624 	kr = vm_map_rw_user_sanitize(map,
19625 	    dst_addr_u,
19626 	    size_u,
19627 	    VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19628 	    &dst_addr,
19629 	    &dst_end,
19630 	    &size);
19631 	if (__improbable(kr != KERN_SUCCESS)) {
19632 		return vm_sanitize_get_kr(kr);
19633 	}
19634 
19635 	if (current_map() == map) {
19636 		if (copyout(src_p, dst_addr, size)) {
19637 			kr = KERN_INVALID_ADDRESS;
19638 		}
19639 	} else {
19640 		vm_map_t        oldmap;
19641 
19642 		/* take on the identity of the target map while doing */
19643 		/* the transfer */
19644 
19645 		vm_map_reference(map);
19646 		oldmap = vm_map_switch(map);
19647 		if (copyout(src_p, dst_addr, size)) {
19648 			kr = KERN_INVALID_ADDRESS;
19649 		}
19650 		vm_map_switch(oldmap);
19651 		vm_map_deallocate(map);
19652 	}
19653 	return kr;
19654 }
19655 
19656 /*
19657  *	Routine:	vm_map_read_user
19658  *
19659  *	Description:
19660  *		Copy in data from a user space source map into the
19661  *		kernel map. The space must already exist in the
19662  *		kernel map.
19663  *		NOTE:  This routine should only be called by threads
19664  *		which can block on a page fault. i.e. kernel mode user
19665  *		threads.
19666  *
19667  */
19668 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_ut src_addr_u,void * dst_p,vm_size_ut size_u)19669 vm_map_read_user(
19670 	vm_map_t                map,
19671 	vm_map_address_ut       src_addr_u,
19672 	void                   *dst_p,
19673 	vm_size_ut              size_u)
19674 {
19675 	kern_return_t    kr;
19676 	vm_map_address_t src_addr, src_end;
19677 	vm_map_size_t    size;
19678 
19679 	/*
19680 	 * dst_p isn't validated: [dst_p, dst_p + size_u)
19681 	 * is trusted kernel input.
19682 	 *
19683 	 * src_addr_u and size_u are untrusted and need to be sanitized.
19684 	 */
19685 	kr = vm_map_rw_user_sanitize(map,
19686 	    src_addr_u,
19687 	    size_u,
19688 	    VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19689 	    &src_addr,
19690 	    &src_end,
19691 	    &size);
19692 	if (__improbable(kr != KERN_SUCCESS)) {
19693 		return vm_sanitize_get_kr(kr);
19694 	}
19695 
19696 	if (current_map() == map) {
19697 		if (copyin(src_addr, dst_p, size)) {
19698 			kr = KERN_INVALID_ADDRESS;
19699 		}
19700 	} else {
19701 		vm_map_t        oldmap;
19702 
19703 		/* take on the identity of the target map while doing */
19704 		/* the transfer */
19705 
19706 		vm_map_reference(map);
19707 		oldmap = vm_map_switch(map);
19708 		if (copyin(src_addr, dst_p, size)) {
19709 			kr = KERN_INVALID_ADDRESS;
19710 		}
19711 		vm_map_switch(oldmap);
19712 		vm_map_deallocate(map);
19713 	}
19714 	return kr;
19715 }
19716 
19717 
19718 static __attribute__((always_inline, warn_unused_result))
19719 kern_return_t
vm_map_check_protection_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut protection_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_prot_t * protection)19720 vm_map_check_protection_sanitize(
19721 	vm_map_t                map,
19722 	vm_map_offset_ut        start_u,
19723 	vm_map_offset_ut        end_u,
19724 	vm_prot_ut              protection_u,
19725 	vm_sanitize_caller_t    vm_sanitize_caller,
19726 	vm_map_offset_t        *start,
19727 	vm_map_offset_t        *end,
19728 	vm_prot_t              *protection)
19729 {
19730 	kern_return_t           kr;
19731 	vm_map_size_t           size;
19732 
19733 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19734 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19735 	    &size);
19736 	if (__improbable(kr != KERN_SUCCESS)) {
19737 		return kr;
19738 	}
19739 
19740 	/*
19741 	 * Given that the protection is used only for comparisons below
19742 	 * no sanitization is being applied on it.
19743 	 */
19744 	*protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19745 
19746 	return KERN_SUCCESS;
19747 }
19748 
19749 /*
19750  *	vm_map_check_protection:
19751  *
19752  *	Assert that the target map allows the specified
19753  *	privilege on the entire address region given.
19754  *	The entire region must be allocated.
19755  */
19756 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut protection_u,vm_sanitize_caller_t vm_sanitize_caller)19757 vm_map_check_protection(
19758 	vm_map_t                map,
19759 	vm_map_offset_ut        start_u,
19760 	vm_map_offset_ut        end_u,
19761 	vm_prot_ut              protection_u,
19762 	vm_sanitize_caller_t    vm_sanitize_caller)
19763 {
19764 	vm_map_entry_t entry;
19765 	vm_map_entry_t tmp_entry;
19766 	vm_map_offset_t start;
19767 	vm_map_offset_t end;
19768 	vm_prot_t protection;
19769 	kern_return_t kr;
19770 
19771 	kr = vm_map_check_protection_sanitize(map,
19772 	    start_u,
19773 	    end_u,
19774 	    protection_u,
19775 	    vm_sanitize_caller,
19776 	    &start,
19777 	    &end,
19778 	    &protection);
19779 	if (__improbable(kr != KERN_SUCCESS)) {
19780 		kr = vm_sanitize_get_kr(kr);
19781 		if (kr == KERN_SUCCESS) {
19782 			return true;
19783 		}
19784 		return false;
19785 	}
19786 
19787 	vm_map_lock(map);
19788 
19789 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
19790 		vm_map_unlock(map);
19791 		return false;
19792 	}
19793 
19794 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19795 		vm_map_unlock(map);
19796 		return false;
19797 	}
19798 
19799 	entry = tmp_entry;
19800 
19801 	while (start < end) {
19802 		if (entry == vm_map_to_entry(map)) {
19803 			vm_map_unlock(map);
19804 			return false;
19805 		}
19806 
19807 		/*
19808 		 *	No holes allowed!
19809 		 */
19810 
19811 		if (start < entry->vme_start) {
19812 			vm_map_unlock(map);
19813 			return false;
19814 		}
19815 
19816 		/*
19817 		 * Check protection associated with entry.
19818 		 */
19819 
19820 		if ((entry->protection & protection) != protection) {
19821 			vm_map_unlock(map);
19822 			return false;
19823 		}
19824 
19825 		/* go to next entry */
19826 
19827 		start = entry->vme_end;
19828 		entry = entry->vme_next;
19829 	}
19830 	vm_map_unlock(map);
19831 	return true;
19832 }
19833 
/*
 *	vm_map_purgable_control:
 *
 *	Get or set the purgeability state of the object mapped at
 *	"address_u" in "map", or purge all volatile purgeable objects
 *	(VM_PURGABLE_PURGE_ALL).  The map read lock is dropped before
 *	the state change is applied under the object lock.
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_ut        address_u,
	vm_purgable_t           control,
	int                    *state)
{
	vm_map_offset_t         address;
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* global operation: no address/object lookup needed */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/*
	 * Reject state values with bits outside the known masks.
	 * NOTE(review): the second comparison below is always false
	 * ((x & MASK) can never exceed MASK); deeper state validation
	 * appears to happen in vm_object_purgable_control() — confirm
	 * whether a different bound was intended here.
	 */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	address = vm_sanitize_addr(map, address_u);

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* take the object lock before dropping the map lock below */
	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/* the object lock keeps "object" stable after the map is unlocked */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		/* record who made this object volatile, for debugging */
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19941 
/*
 *	vm_map_footprint_query_page_info:
 *
 *	Compute the VM_PAGE_QUERY_PAGE_* disposition of the page at
 *	"curr_s_offset" within "map_entry", for footprint reporting.
 *	Combines the pmap's view of the page with the ownership and
 *	purgeability accounting rules of the mapped object.
 *	Caller must hold the map lock; the map must not be using
 *	corpse footprint data.
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
//		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    !object->internal &&
	    object->vo_ledger_tag &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		/* owned external object: wired pages count in footprint */
		assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * External object owned by this task: report the first
			 * "#wired" pages as "resident" (to show that they
			 * contribute to the footprint) but not "dirty"
			 * (to avoid double-counting with the fake "owned"
			 * region we'll report at the end of the address space
			 * to account for all (mapped or not) owned memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    object->internal &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    object->internal &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		/* NOTE(review): always true here — the enclosing branch already requires object->internal */
		if (object->internal) {
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page;  the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		disposition = 0;
	} else {
		/* default case: trust the pmap's per-page bits */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
20126 
20127 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_ut offset_u,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)20128 vm_map_page_info(
20129 	vm_map_t                map,
20130 	vm_map_offset_ut        offset_u,
20131 	vm_page_info_flavor_t   flavor,
20132 	vm_page_info_t          info,
20133 	mach_msg_type_number_t  *count)
20134 {
20135 	return vm_map_page_range_info_internal(map,
20136 	           offset_u, /* start of range */
20137 	           vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */
20138 	           (int)-1, /* effective_page_shift: unspecified */
20139 	           flavor,
20140 	           info,
20141 	           count);
20142 }
20143 
20144 static __attribute__((always_inline, warn_unused_result))
20145 kern_return_t
vm_map_page_range_info_sanitize(vm_map_t map,vm_map_offset_ut start_offset_u,vm_map_offset_ut end_offset_u,vm_map_offset_t effective_page_mask,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_offset_t * offset_in_page)20146 vm_map_page_range_info_sanitize(
20147 	vm_map_t                map,
20148 	vm_map_offset_ut        start_offset_u,
20149 	vm_map_offset_ut        end_offset_u,
20150 	vm_map_offset_t         effective_page_mask,
20151 	vm_map_offset_t        *start,
20152 	vm_map_offset_t        *end,
20153 	vm_map_offset_t        *offset_in_page)
20154 {
20155 	kern_return_t           retval;
20156 	vm_map_size_t           size;
20157 
20158 	/*
20159 	 * Perform validation against map's mask but don't align start/end,
20160 	 * as we need for those to be aligned wrt effective_page_mask
20161 	 */
20162 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20163 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20164 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20165 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20166 	    end, &size);
20167 	if (retval != KERN_SUCCESS) {
20168 		return retval;
20169 	}
20170 
20171 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20172 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20173 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20174 	    end, &size);
20175 	if (retval != KERN_SUCCESS) {
20176 		return retval;
20177 	}
20178 
20179 	*offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20180 	    start_offset_u);
20181 
20182 	return KERN_SUCCESS;
20183 }
20184 
/*
 *	vm_map_page_range_info_internal:
 *
 *	Fill "info" with one record per "effective page" in the range
 *	[start_offset_u, end_offset_u) of "map", describing each page's
 *	disposition (present / paged out / dirty / referenced / ...),
 *	an approximate ref count, a hashed object id, and the shadow
 *	depth at which the page was found.
 *
 *	effective_page_shift: page size to report at, or -1 to derive it
 *	from the caller's map.
 *	flavor/count: only VM_PAGE_INFO_BASIC is supported; "*count" must
 *	match VM_PAGE_INFO_BASIC_COUNT (off-by-one tolerated for binary
 *	compatibility).
 *
 *	Recurses into submaps and walks VM object shadow chains.  The map
 *	is only held for read, and is dropped/retaken around submap
 *	recursion and object inspection.
 */
kern_return_t
vm_map_page_range_info_internal(
	vm_map_t                map,
	vm_map_offset_ut        start_offset_u,
	vm_map_offset_ut        end_offset_u,
	int                     effective_page_shift,
	vm_page_info_flavor_t   flavor,
	vm_page_info_t          info,
	mach_msg_type_number_t  *count)
{
	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
	vm_page_t               m = VM_PAGE_NULL;
	kern_return_t           retval = KERN_SUCCESS;
	int                     disposition = 0;
	int                     ref_count = 0;
	int                     depth = 0, info_idx = 0;
	vm_page_info_basic_t    basic_info = 0;
	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
	boolean_t               do_region_footprint;
	ledger_amount_t         ledger_resident, ledger_compressed;
	int                     effective_page_size;
	vm_map_offset_t         effective_page_mask;

	/* validate the requested flavor and the caller's buffer size */
	switch (flavor) {
	case VM_PAGE_INFO_BASIC:
		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
			/*
			 * The "vm_page_info_basic_data" structure was not
			 * properly padded, so allow the size to be off by
			 * one to maintain backwards binary compatibility...
			 */
			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
				return KERN_INVALID_ARGUMENT;
			}
		}
		break;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	/* page shift unspecified: derive it from the caller's map */
	if (effective_page_shift == -1) {
		effective_page_shift = vm_self_region_page_shift_safely(map);
		if (effective_page_shift == -1) {
			return KERN_INVALID_ARGUMENT;
		}
	}
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;


	retval = vm_map_page_range_info_sanitize(map,
	    start_offset_u,
	    end_offset_u,
	    effective_page_mask,
	    &start,
	    &end,
	    &offset_in_page);
	if (retval != KERN_SUCCESS) {
		return vm_sanitize_get_kr(retval);
	}

	assert((end - start) <= MAX_PAGE_RANGE_QUERY);

	do_region_footprint = task_self_region_footprint();
	disposition = 0;
	ref_count = 0;
	depth = 0;
	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/

	vm_map_lock_read(map);

	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);

	/* one iteration per map entry / hole / fake footprint page */
	for (curr_s_offset = start; curr_s_offset < end;) {
		/*
		 * New lookup needs reset of these variables.
		 */
		curr_object = object = VM_OBJECT_NULL;
		offset_in_object = 0;
		ref_count = 0;
		depth = 0;

		if (do_region_footprint &&
		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
			/*
			 * Request for "footprint" info about a page beyond
			 * the end of address space: this must be for
			 * the fake region vm_map_region_recurse_64()
			 * reported to account for non-volatile purgeable
			 * memory owned by this task.
			 */
			disposition = 0;

			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
			    (unsigned) ledger_compressed) {
				/*
				 * We haven't reported all the "non-volatile
				 * compressed" pages yet, so report this fake
				 * page as "compressed".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
			} else {
				/*
				 * We've reported all the non-volatile
				 * compressed page but not all the non-volatile
				 * pages , so report this fake page as
				 * "resident dirty".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
				disposition |= VM_PAGE_QUERY_PAGE_REF;
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		/*
		 * First, find the map entry covering "curr_s_offset", going down
		 * submaps if necessary.
		 */
		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
			/* no entry -> no object -> no page */

			if (curr_s_offset < vm_map_min(map)) {
				/*
				 * Illegal address that falls below map min.
				 */
				curr_e_offset = MIN(end, vm_map_min(map));
			} else if (curr_s_offset >= vm_map_max(map)) {
				/*
				 * Illegal address that falls on/after map max.
				 */
				curr_e_offset = end;
			} else if (map_entry == vm_map_to_entry(map)) {
				/*
				 * Hit a hole.
				 */
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Empty map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					/*
					 * Hole at start of the map.
					 */
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			} else {
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Hole at the end of the map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			}

			assert(curr_e_offset >= curr_s_offset);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			/* report the whole hole as zeroed (absent) records */
			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		/* compute offset from this map entry's start */
		offset_in_object = curr_s_offset - map_entry->vme_start;

		/* compute offset into this map entry's object (or submap) */
		offset_in_object += VME_OFFSET(map_entry);

		if (map_entry->is_sub_map) {
			/*
			 * Recurse into the submap for the portion of the
			 * range it covers; the map lock is dropped around
			 * the recursion, with a submap reference held.
			 */
			vm_map_t sub_map = VM_MAP_NULL;
			vm_page_info_t submap_info = 0;
			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;

			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;

			submap_s_offset = offset_in_object;
			submap_e_offset = submap_s_offset + range_len;

			sub_map = VME_SUBMAP(map_entry);

			vm_map_reference(sub_map);
			vm_map_unlock_read(map);

			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));

			retval = vm_map_page_range_info_internal(sub_map,
			    submap_s_offset,
			    submap_e_offset,
			    effective_page_shift,
			    VM_PAGE_INFO_BASIC,
			    (vm_page_info_t) submap_info,
			    count);

			assert(retval == KERN_SUCCESS);

			vm_map_lock_read(map);
			vm_map_deallocate(sub_map);

			/* Move the "info" index by the number of pages we inspected.*/
			info_idx += range_len >> effective_page_shift;

			/* Move our current offset by the size of the range we inspected.*/
			curr_s_offset += range_len;

			continue;
		}

		object = VME_OBJECT(map_entry);

		if (object == VM_OBJECT_NULL) {
			/*
			 * We don't have an object here and, hence,
			 * no pages to inspect. We'll fill up the
			 * info structure appropriately.
			 */

			curr_e_offset = MIN(map_entry->vme_end, end);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		if (do_region_footprint) {
			disposition = 0;
			if (map->has_corpse_footprint) {
				/*
				 * Query the page info data we saved
				 * while forking the corpse.
				 */
				vm_map_corpse_footprint_query_page_info(
					map,
					curr_s_offset,
					&disposition);
			} else {
				/*
				 * Query the live pmap for footprint info
				 * about this page.
				 */
				vm_map_footprint_query_page_info(
					map,
					map_entry,
					curr_s_offset,
					&disposition);
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		vm_object_reference(object);
		/*
		 * Shared mode -- so we can allow other readers
		 * to grab the lock too.
		 */
		vm_object_lock_shared(object);

		curr_e_offset = MIN(map_entry->vme_end, end);

		vm_map_unlock_read(map);

		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */

		curr_object = object;

		/* inspect each page of this entry's range within the object */
		for (; curr_s_offset < curr_e_offset;) {
			if (object == curr_object) {
				/* account for our object reference above. */
				ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
			} else {
				ref_count = os_ref_get_count_raw(&curr_object->ref_count);
			}

			curr_offset_in_object = offset_in_object;

			for (;;) {
				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));

				if (m != VM_PAGE_NULL) {
					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
					break;
				} else {
					if (curr_object->internal &&
					    curr_object->alive &&
					    !curr_object->terminating &&
					    curr_object->pager_ready) {
						if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
						    == VM_EXTERNAL_STATE_EXISTS) {
							/* the pager has that page */
							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
							break;
						}
					}

					/*
					 * Go down the VM object shadow chain until we find the page
					 * we're looking for.
					 */

					if (curr_object->shadow != VM_OBJECT_NULL) {
						vm_object_t shadow = VM_OBJECT_NULL;

						curr_offset_in_object += curr_object->vo_shadow_offset;
						shadow = curr_object->shadow;

						/* lock the shadow before dropping the current lock */
						vm_object_lock_shared(shadow);
						vm_object_unlock(curr_object);

						curr_object = shadow;
						depth++;
						continue;
					} else {
						break;
					}
				}
			}

			/* The ref_count is not strictly accurate, it measures the number   */
			/* of entities holding a ref on the object, they may not be mapping */
			/* the object or may not be mapping the section holding the         */
			/* target page but its still a ball park number and though an over- */
			/* count, it picks up the copy-on-write cases                       */

			/* We could also get a picture of page sharing from pmap_attributes */
			/* but this would under count as only faulted-in mappings would     */
			/* show up.							    */

			if ((curr_object == object) && curr_object->shadow) {
				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
			}

			if (!curr_object->internal) {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}

			if (m != VM_PAGE_NULL) {
				if (m->vmp_fictitious) {
					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
				} else {
					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
					}

					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_REF;
					}

					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
					}

					/*
					 * XXX TODO4K:
					 * when this routine deals with 4k
					 * pages, check the appropriate CS bit
					 * here.
					 */
					if (m->vmp_cs_validated) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
					}
					if (m->vmp_cs_tainted) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
					}
					if (m->vmp_cs_nx) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
					}
					if (m->vmp_reusable || curr_object->all_reusable) {
						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
					}
				}
			}

			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = ref_count;
				basic_info->object_id = (vm_object_id_t) (uintptr_t)
				    VM_KERNEL_ADDRHASH(curr_object);
				basic_info->offset =
				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
				basic_info->depth = depth;

				info_idx++;
				break;
			}

			disposition = 0;
			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.

			/*
			 * Move to next offset in the range and in our object.
			 */
			curr_s_offset += effective_page_size;
			offset_in_object += effective_page_size;
			curr_offset_in_object = offset_in_object;

			if (curr_object != object) {
				/* we ended a shadow walk: restart at the top object */
				vm_object_unlock(curr_object);

				curr_object = object;

				vm_object_lock_shared(curr_object);
			} else {
				vm_object_lock_yield_shared(curr_object);
			}
		}

		vm_object_unlock(curr_object);
		vm_object_deallocate(curr_object);

		vm_map_lock_read(map);
	}

	vm_map_unlock_read(map);
	return retval;
}
20650 
20651 static __attribute__((always_inline, warn_unused_result))
20652 kern_return_t
vm_map_msync_sanitize(vm_map_t map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_object_offset_t * address,vm_map_size_t * size)20653 vm_map_msync_sanitize(
20654 	vm_map_t                map,
20655 	vm_map_address_ut       address_u,
20656 	vm_map_size_ut          size_u,
20657 	vm_object_offset_t     *address,
20658 	vm_map_size_t          *size)
20659 {
20660 	vm_object_offset_t      end;
20661 
20662 	return vm_sanitize_addr_size(address_u, size_u,
20663 	           VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20664 	           map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
20665 	           address, &end, size);
20666 }
20667 
20668 /*
20669  *	vm_map_msync
20670  *
20671  *	Synchronises the memory range specified with its backing store
20672  *	image by either flushing or cleaning the contents to the appropriate
20673  *	memory manager engaging in a memory object synchronize dialog with
20674  *	the manager.  The client doesn't return until the manager issues
20675  *	m_o_s_completed message.  MIG Magically converts user task parameter
20676  *	to the task's address map.
20677  *
20678  *	interpretation of sync_flags
20679  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20680  *				  pages to manager.
20681  *
20682  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20683  *				- discard pages, write dirty or precious
20684  *				  pages back to memory manager.
20685  *
20686  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20687  *				- write dirty or precious pages back to
20688  *				  the memory manager.
20689  *
20690  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20691  *				  is a hole in the region, and we would
20692  *				  have returned KERN_SUCCESS, return
20693  *				  KERN_INVALID_ADDRESS instead.
20694  *
20695  *	NOTE
20696  *	The memory object attributes have not yet been implemented, this
20697  *	function will have to deal with the invalidate attribute
20698  *
20699  *	RETURNS
20700  *	KERN_INVALID_TASK		Bad task parameter
20701  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20702  *	KERN_SUCCESS			The usual.
20703  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20704  */
20705 
kern_return_t
vm_map_msync(
	vm_map_t                map,
	vm_map_address_ut       address_u,
	vm_map_size_ut          size_u,
	vm_sync_t               sync_flags)
{
	vm_map_entry_t          entry;
	vm_map_size_t           size, amount_left;
	vm_object_offset_t      address, offset;
	vm_object_offset_t      start_offset, end_offset;
	boolean_t               do_sync_req;
	boolean_t               had_hole = FALSE;
	vm_map_offset_t         pmap_offset;
	kern_return_t           kr;

	/* SYNC and ASYNC are mutually exclusive */
	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
		return KERN_INVALID_ARGUMENT;
	}

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_TASK;
	}

	kr = vm_map_msync_sanitize(map,
	    address_u,
	    size_u,
	    &address,
	    &size);
	/*
	 * NOTE(review): this debug print consumes "address"/"size" before
	 * the sanitize result is checked — presumably the sanitizer zeroes
	 * its outputs on failure; verify against vm_sanitize_addr_size().
	 */
	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
	}
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	amount_left = size;

	/* walk the range entry by entry; the map lock is retaken each pass */
	while (amount_left > 0) {
		vm_object_size_t        flush_size;
		vm_object_t             object;

		vm_map_lock(map);
		if (!vm_map_lookup_entry(map,
		    address,
		    &entry)) {
			vm_map_size_t   skip;

			/*
			 * hole in the address map.
			 */
			had_hole = TRUE;

			if (sync_flags & VM_SYNC_KILLPAGES) {
				/*
				 * For VM_SYNC_KILLPAGES, there should be
				 * no holes in the range, since we couldn't
				 * prevent someone else from allocating in
				 * that hole and we wouldn't want to "kill"
				 * their pages.
				 */
				vm_map_unlock(map);
				break;
			}

			/*
			 * Check for empty map.
			 */
			if (entry == vm_map_to_entry(map) &&
			    entry->vme_next == entry) {
				vm_map_unlock(map);
				break;
			}
			/*
			 * Check that we don't wrap and that
			 * we have at least one real map entry.
			 */
			if ((map->hdr.nentries == 0) ||
			    (entry->vme_next->vme_start < address)) {
				vm_map_unlock(map);
				break;
			}
			/*
			 * Move up to the next entry if needed
			 */
			skip = (entry->vme_next->vme_start - address);
			if (skip >= amount_left) {
				amount_left = 0;
			} else {
				amount_left -= skip;
			}
			address = entry->vme_next->vme_start;
			vm_map_unlock(map);
			continue;
		}

		offset = address - entry->vme_start;
		pmap_offset = address;

		/*
		 * do we have more to flush than is contained in this
		 * entry ?
		 */
		if (amount_left + entry->vme_start + offset > entry->vme_end) {
			flush_size = entry->vme_end -
			    (entry->vme_start + offset);
		} else {
			flush_size = amount_left;
		}
		amount_left -= flush_size;
		address += flush_size;

		if (entry->is_sub_map == TRUE) {
			/* recurse into the submap for this entry's range */
			vm_map_t        local_map;
			vm_map_offset_t local_offset;

			local_map = VME_SUBMAP(entry);
			local_offset = VME_OFFSET(entry);
			vm_map_reference(local_map);
			vm_map_unlock(map);
			if (vm_map_msync(
				    local_map,
				    local_offset,
				    flush_size,
				    sync_flags) == KERN_INVALID_ADDRESS) {
				had_hole = TRUE;
			}
			vm_map_deallocate(local_map);
			continue;
		}
		object = VME_OBJECT(entry);

		/*
		 * We can't sync this object if the object has not been
		 * created yet
		 */
		if (object == VM_OBJECT_NULL) {
			vm_map_unlock(map);
			continue;
		}
		offset += VME_OFFSET(entry);

		vm_object_lock(object);

		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
			int kill_pages = 0;

			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
				/*
				 * This is a destructive operation and so we
				 * err on the side of limiting the range of
				 * the operation.
				 */
				start_offset = vm_object_round_page(offset);
				end_offset = vm_object_trunc_page(offset + flush_size);

				if (end_offset <= start_offset) {
					vm_object_unlock(object);
					vm_map_unlock(map);
					continue;
				}

				pmap_offset += start_offset - offset;
			} else {
				start_offset = offset;
				end_offset = offset + flush_size;
			}

			if (sync_flags & VM_SYNC_KILLPAGES) {
				/*
				 * Only "kill" (kill_pages == 1) when no other
				 * entity could be relying on these pages;
				 * otherwise flag failure (kill_pages == -1).
				 */
				if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
				    ((object->copy_strategy !=
				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
				    (object->vo_copy == VM_OBJECT_NULL))) &&
				    (object->shadow == VM_OBJECT_NULL)) {
					if (os_ref_get_count_raw(&object->ref_count) != 1) {
						vm_page_stats_reusable.free_shared++;
					}
					kill_pages = 1;
				} else {
					kill_pages = -1;
				}
			}
			if (kill_pages != -1) {
				vm_object_deactivate_pages(
					object,
					start_offset,
					(vm_object_size_t) (end_offset - start_offset),
					kill_pages,
					FALSE, /* reusable_pages */
					FALSE, /* reusable_no_write */
					map->pmap,
					pmap_offset);
			}
			vm_object_unlock(object);
			vm_map_unlock(map);
			continue;
		}
		/*
		 * We can't sync this object if there isn't a pager.
		 * Don't bother to sync internal objects, since there can't
		 * be any "permanent" storage for these objects anyway.
		 */
		if ((object->pager == MEMORY_OBJECT_NULL) ||
		    (object->internal) || (object->private)) {
			vm_object_unlock(object);
			vm_map_unlock(map);
			continue;
		}
		/*
		 * keep reference on the object until syncing is done
		 */
		vm_object_reference_locked(object);
		vm_object_unlock(object);

		vm_map_unlock(map);

		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
			start_offset = vm_object_trunc_page(offset);
			end_offset = vm_object_round_page(offset + flush_size);
		} else {
			start_offset = offset;
			end_offset = offset + flush_size;
		}

		/* flush/clean the range to the pager (map is unlocked here) */
		do_sync_req = vm_object_sync(object,
		    start_offset,
		    (end_offset - start_offset),
		    sync_flags & VM_SYNC_INVALIDATE,
		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
		    sync_flags & VM_SYNC_SYNCHRONOUS);

		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
			/*
			 * clear out the clustering and read-ahead hints
			 */
			vm_object_lock(object);

			object->pages_created = 0;
			object->pages_used = 0;
			object->sequential = 0;
			object->last_alloc = 0;

			vm_object_unlock(object);
		}
		vm_object_deallocate(object);
	} /* while */

	/* for proper msync() behaviour */
	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
		return KERN_INVALID_ADDRESS;
	}

	return KERN_SUCCESS;
}/* vm_msync */
20962 
20963 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)20964 vm_named_entry_associate_vm_object(
20965 	vm_named_entry_t        named_entry,
20966 	vm_object_t             object,
20967 	vm_object_offset_t      offset,
20968 	vm_object_size_t        size,
20969 	vm_prot_t               prot)
20970 {
20971 	vm_map_copy_t copy;
20972 	vm_map_entry_t copy_entry;
20973 
20974 	assert(!named_entry->is_sub_map);
20975 	assert(!named_entry->is_copy);
20976 	assert(!named_entry->is_object);
20977 	assert(!named_entry->internal);
20978 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20979 
20980 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20981 	copy->offset = offset;
20982 	copy->size = size;
20983 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20984 
20985 	copy_entry = vm_map_copy_entry_create(copy);
20986 	copy_entry->protection = prot;
20987 	copy_entry->max_protection = prot;
20988 	copy_entry->use_pmap = TRUE;
20989 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20990 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20991 	VME_OBJECT_SET(copy_entry, object, false, 0);
20992 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20993 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20994 
20995 	named_entry->backing.copy = copy;
20996 	named_entry->is_object = TRUE;
20997 	if (object->internal) {
20998 		named_entry->internal = TRUE;
20999 	}
21000 
21001 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21002 	    named_entry, copy, object, offset, size, prot);
21003 }
21004 
21005 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)21006 vm_named_entry_to_vm_object(
21007 	vm_named_entry_t named_entry)
21008 {
21009 	vm_map_copy_t   copy;
21010 	vm_map_entry_t  copy_entry;
21011 	vm_object_t     object;
21012 
21013 	assert(!named_entry->is_sub_map);
21014 	assert(!named_entry->is_copy);
21015 	assert(named_entry->is_object);
21016 	copy = named_entry->backing.copy;
21017 	assert(copy != VM_MAP_COPY_NULL);
21018 	/*
21019 	 * Assert that the vm_map_copy is coming from the right
21020 	 * zone and hasn't been forged
21021 	 */
21022 	vm_map_copy_require(copy);
21023 	assert(copy->cpy_hdr.nentries == 1);
21024 	copy_entry = vm_map_copy_first_entry(copy);
21025 	object = VME_OBJECT(copy_entry);
21026 
21027 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21028 
21029 	return object;
21030 }
21031 
21032 /*
21033  *	Routine:	convert_port_entry_to_map
21034  *	Purpose:
21035  *		Convert from a port specifying an entry or a task
21036  *		to a map. Doesn't consume the port ref; produces a map ref,
21037  *		which may be null.  Unlike convert_port_to_map, the
21038  *		port may be task or a named entry backed.
21039  *	Conditions:
21040  *		Nothing locked.
21041  */
21042 
vm_map_t
convert_port_entry_to_map(
	ipc_port_t      port)
{
	vm_map_t map = VM_MAP_NULL;
	vm_named_entry_t named_entry;

	/* A null or dead port converts to no map at all. */
	if (!IP_VALID(port)) {
		return VM_MAP_NULL;
	}

	/* Anything that isn't a named entry takes the generic task-port path. */
	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
		return convert_port_to_map(port);
	}

	named_entry = mach_memory_entry_from_port(port);

	/*
	 * Only a writable, sub-map-backed named entry yields a map;
	 * all other named entries convert to VM_MAP_NULL.
	 */
	if ((named_entry->is_sub_map) &&
	    (named_entry->protection & VM_PROT_WRITE)) {
		map = named_entry->backing.map;
		if (map->pmap != PMAP_NULL) {
			/* Userspace must never be handed a kernel map. */
			if (map->pmap == kernel_pmap) {
				panic("userspace has access "
				    "to a kernel map %p", map);
			}
			/* Sanity-check that the pmap hasn't been forged. */
			pmap_require(map->pmap);
		}
		/* Produce the map reference promised by this routine. */
		vm_map_reference(map);
	}

	return map;
}
21075 
21076 /*
21077  * Export routines to other components for the things we access locally through
21078  * macros.
21079  */
21080 #undef current_map
vm_map_t
current_map(void)
{
	/* Out-of-line wrapper around the current_map_fast() macro
	 * (the macro itself is #undef'd just above). */
	return current_map_fast();
}
21086 
21087 /*
21088  *	vm_map_reference:
21089  *
21090  *	Takes a reference on the specified map.
21091  */
21092 void
vm_map_reference(vm_map_t map)21093 vm_map_reference(
21094 	vm_map_t        map)
21095 {
21096 	if (__probable(map != VM_MAP_NULL)) {
21097 		vm_map_require(map);
21098 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21099 	}
21100 }
21101 
21102 /*
21103  *	vm_map_deallocate:
21104  *
21105  *	Removes a reference from the specified map,
21106  *	destroying it if no references remain.
21107  *	The map should not be locked.
21108  */
21109 void
vm_map_deallocate(vm_map_t map)21110 vm_map_deallocate(
21111 	vm_map_t        map)
21112 {
21113 	if (__probable(map != VM_MAP_NULL)) {
21114 		vm_map_require(map);
21115 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21116 			vm_map_destroy(map);
21117 		}
21118 	}
21119 }
21120 
void
vm_map_inspect_deallocate(
	vm_map_inspect_t      map)
{
	/* An inspect right is just a flavored vm_map reference. */
	vm_map_deallocate((vm_map_t)map);
}
21127 
void
vm_map_read_deallocate(
	vm_map_read_t      map)
{
	/* A read right is just a flavored vm_map reference. */
	vm_map_deallocate((vm_map_t)map);
}
21134 
21135 
21136 void
vm_map_disable_NX(vm_map_t map)21137 vm_map_disable_NX(vm_map_t map)
21138 {
21139 	if (map == NULL) {
21140 		return;
21141 	}
21142 	if (map->pmap == NULL) {
21143 		return;
21144 	}
21145 
21146 	pmap_disable_NX(map->pmap);
21147 }
21148 
21149 void
vm_map_disallow_data_exec(vm_map_t map)21150 vm_map_disallow_data_exec(vm_map_t map)
21151 {
21152 	if (map == NULL) {
21153 		return;
21154 	}
21155 
21156 	map->map_disallow_data_exec = TRUE;
21157 }
21158 
21159 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21160  * more descriptive.
21161  */
void
vm_map_set_32bit(vm_map_t map)
{
	/* Clamp the map's addressable range to the 32-bit maximum. */
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
21171 
21172 
void
vm_map_set_64bit(vm_map_t map)
{
	/* Extend the map's addressable range to the 64-bit maximum. */
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
21182 
21183 /*
21184  * Expand the maximum size of an existing map to 64GB.
21185  */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* ~0 requests the largest offset; vm_map_set_max_addr clamps it to
	 * what the pmap layer supports for the JUMBO option. */
	vm_map_set_max_addr(map, ~0, false);
#else /* arm64 */
	(void) map;
#endif
}
21195 
21196 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21197 /*
21198  * Expand the maximum size of an existing map to the maximum supported.
21199  */
void
vm_map_set_extra_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* Like vm_map_set_jumbo, but with the extra-jumbo pmap option. */
	vm_map_set_max_addr(map, ~0, true);
#else /* arm64 */
	(void) map;
#endif
}
21209 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21210 
21211 /*
21212  * This map has a JIT entitlement
21213  */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	/* JIT entitlement is tracked at the pmap level on arm64. */
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;
#endif
}
21223 
21224 /*
21225  * Get status of this maps TPRO flag
21226  */
boolean_t
vm_map_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO availability is a pmap-level property on arm64e. */
	return pmap_get_tpro(map->pmap);
#else /* arm64e */
	(void) map;
	return FALSE;
#endif
}
21237 
21238 /*
21239  * This map has TPRO enabled
21240  */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* Enable TPRO at the pmap level; no-op on other architectures. */
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	(void) map;
#endif
}
21250 
21251 /*
21252  * Does this map have TPRO enforcement enabled
21253  */
boolean_t
vm_map_tpro_enforcement(vm_map_t map)
{
	/* Plain read of the per-map enforcement flag; no lock taken. */
	return map->tpro_enforcement;
}
21259 
21260 /*
21261  * Set TPRO enforcement for this map
21262  */
21263 void
vm_map_set_tpro_enforcement(vm_map_t map)21264 vm_map_set_tpro_enforcement(vm_map_t map)
21265 {
21266 	if (vm_map_tpro(map)) {
21267 		vm_map_lock(map);
21268 		map->tpro_enforcement = TRUE;
21269 		vm_map_unlock(map);
21270 	}
21271 }
21272 
21273 /*
21274  * Enable TPRO on the requested region
21275  *
21276  * Note:
21277  *     This routine is primarily intended to be called during/soon after map
21278  *     creation before the associated task has been released to run. It is only
21279  *     currently safe when we have no resident pages.
21280  */
boolean_t
vm_map_set_tpro_range(
	__unused vm_map_t map,
	__unused vm_map_address_t start,
	__unused vm_map_address_t end)
{
	/* No per-range work is currently required; report success. */
	return TRUE;
}
21289 
21290 /*
21291  * Expand the maximum size of an existing map.
21292  */
void
vm_map_set_max_addr(
	vm_map_t map,
	vm_map_offset_t new_max_offset,
	__unused bool extra_jumbo)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;
	unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	if (extra_jumbo) {
		option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	/* Ask the pmap layer how far this map is allowed to grow. */
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	/* Clamp the request to what the pmap layer supports. */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	/*
	 * Disable the following chunk of code that extends the "holes" list
	 * to accommodate a larger VM map.
	 * In `vm_map_create_options()`, we now set the end of the "holes" list to
	 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
	 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
	 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
	 * The "holes" list does not need to be adjusted.
	 */
#if 0
	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}
#endif

	vm_map_unlock(map);
#else
	(void)map;
	(void)new_max_offset;
#endif
}
21368 
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
	/* Default map ceiling for a new 32- or 64-bit address space. */
#if defined(__arm64__)
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
21378 
/*
 * Report how many independent ASLR slide sections this platform uses and
 * how large each section is (0 means no sectioning).
 */
void
vm_map_get_max_aslr_slide_section(
	vm_map_t                map __unused,
	int64_t                 *max_sections,
	int64_t                 *section_size)
{
#if defined(__arm64__)
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	*max_sections = 1;
	*section_size = 0;
#endif
}
21393 
/* Maximum number of pages the main ASLR slide may cover for this map. */
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 2^16 pages of slide for 64-bit maps, 2^8 for 32-bit maps. */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
21407 
/* Maximum number of pages the dynamic loader's ASLR slide may cover. */
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 2^16 pages of slide for 64-bit maps, 2^8 for 32-bit maps. */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
21420 
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	/* A map whose ceiling exceeds the 32-bit VA limit is a 64-bit map. */
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
21427 
/*
 * Returns TRUE if the map's minimum offset reserves at least
 * "pagezero_size" bytes of inaccessible low address space.
 */
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t        map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
21445 
21446 /*
21447  * Raise a VM map's maximun offset.
21448  */
21449 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)21450 vm_map_raise_max_offset(
21451 	vm_map_t        map,
21452 	vm_map_offset_t new_max_offset)
21453 {
21454 	kern_return_t   ret;
21455 
21456 	vm_map_lock(map);
21457 	ret = KERN_INVALID_ADDRESS;
21458 
21459 	if (new_max_offset >= map->max_offset) {
21460 		if (!vm_map_is_64bit(map)) {
21461 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21462 				map->max_offset = new_max_offset;
21463 				ret = KERN_SUCCESS;
21464 			}
21465 		} else {
21466 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21467 				map->max_offset = new_max_offset;
21468 				ret = KERN_SUCCESS;
21469 			}
21470 		}
21471 	}
21472 
21473 	vm_map_unlock(map);
21474 	return ret;
21475 }
21476 
21477 
21478 /*
21479  * Raise a VM map's minimum offset.
21480  * To strictly enforce "page zero" reservation.
21481  */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t        map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t  first_entry;

	/* Round up so the page-zero reservation covers whole map pages. */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimum offset.  It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* Keep the free-space ("holes") list consistent with the new floor. */
	if (map->holelistenabled) {
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
21532 
21533 /*
21534  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21535  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21536  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21537  * have to reach over to the BSD data structures.
21538  */
21539 
/* count of finite (non-RLIM_INFINITY) size limits installed, for telemetry */
uint64_t vm_map_set_size_limit_count = 0;
kern_return_t
vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_size_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_size_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		kr = KERN_FAILURE;
	} else if (new_size_limit == map->size_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_size_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		if (new_size_limit != RLIM_INFINITY) {
			vm_map_set_size_limit_count++;
		}
		map->size_limit = new_size_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
21570 
/* count of finite (non-RLIM_INFINITY) data limits installed, for telemetry */
uint64_t vm_map_set_data_limit_count = 0;
kern_return_t
vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_data_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_data_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		kr = KERN_FAILURE;
	} else if (new_data_limit == map->data_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_data_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		if (new_data_limit != RLIM_INFINITY) {
			vm_map_set_data_limit_count++;
		}
		map->data_limit = new_data_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
21601 
void
vm_map_set_user_wire_limit(vm_map_t     map,
    vm_size_t    limit)
{
	/* Cache the BSD RLIMIT_MEMLOCK value so the Mach VM side can check it. */
	vm_map_lock(map);
	map->user_wire_limit = limit;
	vm_map_unlock(map);
}
21610 
21611 
/* Enable/disable switch protection for this map. */
void
vm_map_switch_protect(vm_map_t     map,
    boolean_t    val)
{
	vm_map_lock(map);
	map->switch_protect = val;
	vm_map_unlock(map);
}
21620 
21621 extern int cs_process_enforcement_enable;
21622 boolean_t
vm_map_cs_enforcement(vm_map_t map)21623 vm_map_cs_enforcement(
21624 	vm_map_t map)
21625 {
21626 	if (cs_process_enforcement_enable) {
21627 		return TRUE;
21628 	}
21629 	return map->cs_enforcement;
21630 }
21631 
/* Allow writable+executable (invalid-code) memory in this map. */
kern_return_t
vm_map_cs_wx_enable(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
	/* KERN_NOT_SUPPORTED means no monitor policy applies; treat as success. */
	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
		return KERN_SUCCESS;
	}
	return ret;
#else
	/* The VM manages WX memory entirely on its own */
	return KERN_SUCCESS;
#endif
}
21647 
/* Ask the code-signing monitor (if any) to permit a JIT region in this map. */
kern_return_t
vm_map_csm_allow_jit(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	return csm_allow_jit_region(vm_map_pmap(map));
#else
	/* No code signing monitor to enforce JIT policy */
	return KERN_SUCCESS;
#endif
}
21659 
/* Record whether this map's task is being debugged (code-signing relaxation). */
void
vm_map_cs_debugged_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_debugged = val;
	vm_map_unlock(map);
}
21669 
/* Set the per-map CS enforcement flag and mirror it into the pmap. */
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_enforcement = val;
	/* Keep the pmap's view of enforcement in sync, under the map lock. */
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
21680 
21681 /*
21682  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21683  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21684  * bump both counters.
21685  */
21686 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)21687 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21688 {
21689 	pmap_t pmap = vm_map_pmap(map);
21690 
21691 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21692 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21693 }
21694 
21695 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)21696 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21697 {
21698 	pmap_t pmap = vm_map_pmap(map);
21699 
21700 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21701 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21702 }
21703 
21704 /* Add (generate) code signature for memory range */
21705 #if CONFIG_DYNAMIC_CODE_SIGNING
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Reject ranges that wrap around the address space. */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * NOTE(review): the map lock is dropped below but `entry` is still
	 * dereferenced in the loop (entry->vme_start, VME_OFFSET(entry));
	 * this appears to rely on the held object lock keeping the relevant
	 * state stable -- confirm against callers.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
21802 #endif
21803 
/*
 * Reap anonymous memory that nothing else references: delete every entry
 * whose internal object's only reference is this map entry, accumulating
 * the resident and compressed page counts that were reclaimed.
 * NOTE(review): the reclaimed_* counters are accumulated into, not reset
 * here -- callers are expected to initialize them.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  next_entry;
	kern_return_t   kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor before this entry may be deleted */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* dispose of the deleted entries outside the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
21838 
21839 
21840 #if DEVELOPMENT || DEBUG
21841 
/*
 * Remove all pmap mappings backing this map's entries, optionally unnesting
 * shared submap regions first so only this task's pmap is affected.
 * Returns the map's resident footprint (in map pages) sampled before the
 * disconnect.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* sample the resident footprint before disconnecting, for the return value */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip object-less and physically-contiguous entries */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
21891 
/*
 * Inject a decompression error into the compressor-pager slot backing the
 * given address (test support).  Returns KERN_MEMORY_ERROR if no object
 * backs the address, KERN_MEMORY_PRESENT if the object has no pager, or
 * the pager's result.
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/*
	 * NOTE(review): assumes the lookup returns with `object` locked
	 * (exclusive) and `real_map` locked -- matches the unlock path below.
	 */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	/* the lookup may have entered a submap; drop its lock too */
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
21929 
/* Iterate over a map's entries.  Invokes count_handler once with the number of
 * entries, then entry_handler for every entry.
 * returns: KERN_SUCCESS if iteration completed ok,
 *      error code if a callback returned an error
 *      KERN_FAILURE if entries were added/removed during the iteration so that
 *      the number of entries visited differs from the count reported first
 */
21936 static kern_return_t
21937 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21938     kern_return_t (^entry_handler)(void* entry))
21939 {
21940 	vm_map_lock_assert_held(map);
21941 	int nentries = map->hdr.nentries;
21942 	kern_return_t error = count_handler(nentries);
21943 	if (error) {
21944 		return error;
21945 	}
21946 
21947 	/* iterate until we loop back to the map, see get_vmmap_entries() */
21948 	vm_map_entry_t entry = vm_map_first_entry(map);
21949 	int count = 0;
21950 	while (entry != vm_map_to_entry(map)) {
21951 		error = entry_handler(entry);
21952 		if (error != KERN_SUCCESS) {
21953 			return error;
21954 		}
21955 		entry = entry->vme_next;
21956 		++count;
21957 		if (count > nentries) {
21958 			/* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
21959 			return KERN_FAILURE;
21960 		}
21961 	}
21962 	if (count < nentries) {
21963 		return KERN_FAILURE;
21964 	}
21965 	return KERN_SUCCESS;
21966 }
21967 
kern_return_t
vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
    kern_return_t (^entry_handler)(void* entry))
{
	/* Take the map lock (shared) around the locked iteration helper. */
	vm_map_lock_read(map);
	kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
	vm_map_unlock_read(map);
	return error;
}
21977 
21978 /*
21979  * Dump info about the entry into the given buffer.
21980  * return true on success, false if there was not enough space in the give buffer
21981  * argument size in: bytes free in the given buffer, out: bytes written
21982  */
21983 kern_return_t
vm_map_dump_entry_and_compressor_pager(void * pentry,char * buf,size_t * size)21984 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
21985 {
21986 	size_t insize = *size;
21987 	kern_return_t kr;
21988 	size_t offset = 0;
21989 
21990 	*size = 0;
21991 	if (sizeof(struct vm_map_entry_info) > insize) {
21992 		return KERN_NO_SPACE;
21993 	}
21994 
21995 	vm_map_entry_t entry = (vm_map_entry_t)pentry;
21996 	struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
21997 	out_entry->vmei_start = entry->vme_start;
21998 	out_entry->vmei_end = entry->vme_end;
21999 	out_entry->vmei_alias = VME_ALIAS(entry);
22000 	out_entry->vmei_offset = VME_OFFSET(entry);
22001 	out_entry->vmei_is_sub_map = entry->is_sub_map;
22002 	out_entry->vmei_protection = entry->protection;
22003 	offset += sizeof(struct vm_map_entry_info);
22004 
22005 	out_entry->vmei_slot_mapping_count = 0;
22006 	out_entry->vmei_is_compressor_pager = false;
22007 	*size = offset;
22008 	if (out_entry->vmei_is_sub_map) {
22009 		return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22010 	}
22011 	/* have a vm_object? */
22012 	vm_object_t object = VME_OBJECT(entry);
22013 	if (object == VM_OBJECT_NULL || !object->internal) {
22014 		return KERN_SUCCESS;
22015 	}
22016 	/* objects has a pager? */
22017 	memory_object_t pager = object->pager;
22018 	if (pager != MEMORY_OBJECT_NULL) {
22019 		return KERN_SUCCESS;
22020 	}
22021 	bool is_compressor = false;
22022 	unsigned int slot_mapping_count = 0;
22023 	size_t pager_info_size = insize - offset;
22024 	kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22025 	if (kr != KERN_SUCCESS) {
22026 		/* didn't have enough space for everything we want to write, caller needs to retry */
22027 		return kr;
22028 	}
22029 	offset += pager_info_size;
22030 	/* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22031 	 * is just for sanity sake */
22032 	out_entry->vmei_is_compressor_pager = is_compressor;
22033 	out_entry->vmei_slot_mapping_count = slot_mapping_count;
22034 	*size = offset;
22035 	return KERN_SUCCESS;
22036 }
22037 
22038 
22039 #endif
22040 
22041 
22042 #if CONFIG_FREEZE
22043 
22044 
22045 extern struct freezer_context freezer_context_global;
22046 AbsoluteTime c_freezer_last_yield_ts = 0;
22047 
22048 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22049 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22050 
/*
 * vm_map_freeze:
 *
 * Walk "task"'s VM map and compress the pages of eligible anonymous
 * objects (internal, non-phys-contiguous) into the freezer compressor,
 * stopping once "dirty_budget" pages have been paged out.
 *
 * When the freezer is backed by swap (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE),
 * the map is walked twice: a first "evaluation" pass only accounts
 * dirty private vs. dirty shared pages and can reject the freeze
 * (too much shared memory, or a private:shared ratio below
 * memorystatus_freeze_private_shared_pages_ratio) before any page is
 * compressed; if the evaluation passes, the task's purgeable memory is
 * purged and the walk restarts with 'evaluation_phase' == FALSE to do
 * the real work.  With an in-memory compressor there is no evaluation
 * pass and 'eval_only' is rejected.
 *
 * Out parameters: page counters (zeroed here; *wired_count is
 * accumulated below, the others are presumably filled by the freezer
 * machinery on other paths), *shared_count in MB, and on failure
 * *freezer_error_code identifies the reason.
 *
 * Returns: KERN_SUCCESS, KERN_NO_SPACE (compressor/swap exhausted),
 * KERN_INVALID_ARGUMENT (eval_only without swap) or KERN_FAILURE
 * (evaluation phase rejected the freeze).
 */
kern_return_t
vm_map_freeze(
	task_t       task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int          *freezer_error_code,
	boolean_t    eval_only)
{
	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t   kr = KERN_SUCCESS;
	boolean_t       evaluation_phase = TRUE;
	vm_object_t     cur_shared_object = NULL;
	int             cur_shared_obj_ref_cnt = 0;
	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	/* bail out early if there is no room left to compress anything */
	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		/* submap entries have no VM object of their own to freeze */
		if (entry2->is_sub_map) {
			continue;
		}

		/* only anonymous (internal), non-contiguous objects are eligible */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase ||
				    src_object->purgable != VM_PURGABLE_VOLATILE ||
				    os_ref_get_count_raw(&src_object->ref_count) != 1) {
					continue;
				}
				/* re-check the conditions under the object lock before purging */
				vm_object_lock(src_object);
				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
				    os_ref_get_count_raw(&src_object->ref_count) == 1) {
					purgeable_q_t old_queue;

					/* object should be on a purgeable queue */
					assert(src_object->objq.next != NULL &&
					    src_object->objq.prev != NULL);
					/* move object from its volatile queue to the nonvolatile queue */
					old_queue = vm_purgeable_object_remove(src_object);
					assert(old_queue);
					if (src_object->purgeable_when_ripe) {
						/* remove a token from that volatile queue */
						vm_page_lock_queues();
						vm_purgeable_token_delete_first(old_queue);
						vm_page_unlock_queues();
					}
					/* purge the object */
					vm_object_purge(src_object, 0);
				}
				vm_object_unlock(src_object);
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
				if (src_object != cur_shared_object) {
					/* resident (minus wired) + already-compressed pages */
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			/* evaluation pass only accounts; it never compresses */
			if (evaluation_phase == TRUE) {
				continue;
			}
		}

		/* actually push this object's pages into the compressor */
		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		/* stop as soon as the page-out budget is exhausted */
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* report shared footprint in MB */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* evaluation passed: reset counters and do the real freeze pass */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		/* purge this task's own purgeable memory before the real pass */
		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
22289 
22290 #endif
22291 
22292 /*
22293  * vm_map_entry_should_cow_for_true_share:
22294  *
22295  * Determines if the map entry should be clipped and setup for copy-on-write
22296  * to avoid applying "true_share" to a large VM object when only a subset is
22297  * targeted.
22298  *
22299  * For now, we target only the map entries created for the Objective C
22300  * Garbage Collector, which initially have the following properties:
22301  *	- alias == VM_MEMORY_MALLOC
22302  *      - wired_count == 0
22303  *      - !needs_copy
22304  * and a VM object with:
22305  *      - internal
22306  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22307  *      - !true_share
22308  *      - vo_size == ANON_CHUNK_SIZE
22309  *
22310  * Only non-kernel map entries.
22311  */
22312 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)22313 vm_map_entry_should_cow_for_true_share(
22314 	vm_map_entry_t  entry)
22315 {
22316 	vm_object_t     object;
22317 
22318 	if (entry->is_sub_map) {
22319 		/* entry does not point at a VM object */
22320 		return FALSE;
22321 	}
22322 
22323 	if (entry->needs_copy) {
22324 		/* already set for copy_on_write: done! */
22325 		return FALSE;
22326 	}
22327 
22328 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22329 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22330 		/* not a malloc heap or Obj-C Garbage Collector heap */
22331 		return FALSE;
22332 	}
22333 
22334 	if (entry->wired_count) {
22335 		/* wired: can't change the map entry... */
22336 		vm_counters.should_cow_but_wired++;
22337 		return FALSE;
22338 	}
22339 
22340 	object = VME_OBJECT(entry);
22341 
22342 	if (object == VM_OBJECT_NULL) {
22343 		/* no object yet... */
22344 		return FALSE;
22345 	}
22346 
22347 	if (!object->internal) {
22348 		/* not an internal object */
22349 		return FALSE;
22350 	}
22351 
22352 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22353 		/* not the default copy strategy */
22354 		return FALSE;
22355 	}
22356 
22357 	if (object->true_share) {
22358 		/* already true_share: too late to avoid it */
22359 		return FALSE;
22360 	}
22361 
22362 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22363 	    object->vo_size != ANON_CHUNK_SIZE) {
22364 		/* ... not an object created for the ObjC Garbage Collector */
22365 		return FALSE;
22366 	}
22367 
22368 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22369 	    object->vo_size != 2048 * 4096) {
22370 		/* ... not a "MALLOC_SMALL" heap */
22371 		return FALSE;
22372 	}
22373 
22374 	/*
22375 	 * All the criteria match: we have a large object being targeted for "true_share".
22376 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
22377 	 * try and avoid setting up the entire object for "true_share" by clipping the
22378 	 * targeted range and setting it up for copy-on-write.
22379 	 */
22380 	return TRUE;
22381 }
22382 
22383 uint64_t vm_map_range_overflows_count = 0;
22384 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22385 bool
vm_map_range_overflows(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size)22386 vm_map_range_overflows(
22387 	vm_map_t map,
22388 	vm_map_offset_t addr,
22389 	vm_map_size_t size)
22390 {
22391 	vm_map_offset_t start, end, sum;
22392 	vm_map_offset_t pgmask;
22393 
22394 	if (size == 0) {
22395 		/* empty range -> no overflow */
22396 		return false;
22397 	}
22398 	pgmask = vm_map_page_mask(map);
22399 	start = vm_map_trunc_page_mask(addr, pgmask);
22400 	end = vm_map_round_page_mask(addr + size, pgmask);
22401 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22402 		vm_map_range_overflows_count++;
22403 		if (vm_map_range_overflows_log) {
22404 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22405 			    proc_selfpid(),
22406 			    proc_best_name(current_proc()),
22407 			    (uint64_t)addr,
22408 			    (uint64_t)size,
22409 			    (uint64_t)pgmask);
22410 		}
22411 		DTRACE_VM4(vm_map_range_overflows,
22412 		    vm_map_t, map,
22413 		    uint32_t, pgmask,
22414 		    uint64_t, (uint64_t)addr,
22415 		    uint64_t, (uint64_t)size);
22416 		return true;
22417 	}
22418 	return false;
22419 }
22420 
/*
 * vm_map_round_page_mask:
 * Round "offset" up to the next page boundary described by "mask"
 * (mask == page_size - 1).
 */
vm_map_offset_t
vm_map_round_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_ROUND_PAGE(offset, mask);
}
22428 
/*
 * vm_map_trunc_page_mask:
 * Truncate "offset" down to the page boundary described by "mask"
 * (mask == page_size - 1).
 */
vm_map_offset_t
vm_map_trunc_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_TRUNC_PAGE(offset, mask);
}
22436 
/*
 * vm_map_page_aligned:
 * Returns TRUE when "offset" has none of the low-order bits in "mask"
 * set, i.e. it is aligned to the page size described by the mask.
 */
boolean_t
vm_map_page_aligned(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return ((offset) & mask) == 0;
}
22444 
/*
 * vm_map_page_shift:
 * Accessor for the map's page shift (log2 of its page size).
 */
int
vm_map_page_shift(
	vm_map_t map)
{
	return VM_MAP_PAGE_SHIFT(map);
}
22451 
/*
 * vm_map_page_size:
 * Accessor for the map's page size in bytes.
 */
int
vm_map_page_size(
	vm_map_t map)
{
	return VM_MAP_PAGE_SIZE(map);
}
22458 
/*
 * vm_map_page_mask:
 * Accessor for the map's page mask (page_size - 1).
 */
vm_map_offset_t
vm_map_page_mask(
	vm_map_t map)
{
	return VM_MAP_PAGE_MASK(map);
}
22465 
22466 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)22467 vm_map_set_page_shift(
22468 	vm_map_t        map,
22469 	int             pageshift)
22470 {
22471 	if (map->hdr.nentries != 0) {
22472 		/* too late to change page size */
22473 		return KERN_FAILURE;
22474 	}
22475 
22476 	map->hdr.page_shift = (uint16_t)pageshift;
22477 
22478 	return KERN_SUCCESS;
22479 }
22480 
/*
 * vm_map_query_volatile:
 *
 * Walk the (caller-locked) map and total up, for writable entries
 * backed by volatile or empty purgeable objects:
 *  - the virtual size covered,
 *  - resident and compressed page counts per the VM objects,
 *  - resident and compressed page counts per the pmap.
 * Sizes are returned in bytes through the out parameters.
 * The map stays locked across the call.  Always returns KERN_SUCCESS.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t        map,
	mach_vm_size_t  *volatile_virtual_size_p,
	mach_vm_size_t  *volatile_resident_size_p,
	mach_vm_size_t  *volatile_compressed_size_p,
	mach_vm_size_t  *volatile_pmap_size_p,
	mach_vm_size_t  *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t  volatile_virtual_size;
	mach_vm_size_t  volatile_resident_count;
	mach_vm_size_t  volatile_compressed_count;
	mach_vm_size_t  volatile_pmap_count;
	mach_vm_size_t  volatile_compressed_pmap_count;
	mach_vm_size_t  resident_count;
	vm_map_entry_t  entry;
	vm_object_t     object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;

		/* submaps carry no object of their own */
		if (entry->is_sub_map) {
			continue;
		}
		/* only writable mappings count toward volatile footprint */
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once.  We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		resident_count = object->resident_page_count;
		/*
		 * NOTE(review): VME_OFFSET(entry) is necessarily 0 here (the
		 * non-zero case was skipped just above), so this adjustment
		 * looks like a no-op — possibly leftover from an earlier
		 * version of the accounting; confirm before relying on it.
		 */
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
22570 
22571 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)22572 vm_map_sizes(vm_map_t map,
22573     vm_map_size_t * psize,
22574     vm_map_size_t * pfree,
22575     vm_map_size_t * plargest_free)
22576 {
22577 	vm_map_entry_t  entry;
22578 	vm_map_offset_t prev;
22579 	vm_map_size_t   free, total_free, largest_free;
22580 	boolean_t       end;
22581 
22582 	if (!map) {
22583 		*psize = *pfree = *plargest_free = 0;
22584 		return;
22585 	}
22586 	total_free = largest_free = 0;
22587 
22588 	vm_map_lock_read(map);
22589 	if (psize) {
22590 		*psize = map->max_offset - map->min_offset;
22591 	}
22592 
22593 	prev = map->min_offset;
22594 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22595 		end = (entry == vm_map_to_entry(map));
22596 
22597 		if (end) {
22598 			free = entry->vme_end   - prev;
22599 		} else {
22600 			free = entry->vme_start - prev;
22601 		}
22602 
22603 		total_free += free;
22604 		if (free > largest_free) {
22605 			largest_free = free;
22606 		}
22607 
22608 		if (end) {
22609 			break;
22610 		}
22611 		prev = entry->vme_end;
22612 	}
22613 	vm_map_unlock_read(map);
22614 	if (pfree) {
22615 		*pfree = total_free;
22616 	}
22617 	if (plargest_free) {
22618 		*plargest_free = largest_free;
22619 	}
22620 }
22621 
22622 #if VM_SCAN_FOR_SHADOW_CHAIN
/*
 * vm_map_shadow_max:
 *
 * Debug/statistics helper: walk every entry of "map" and return the
 * length of the deepest shadow-object chain found behind any entry.
 * Returns 0 for a NULL map.  Takes the map's read lock and walks each
 * shadow chain with hand-over-hand shared object locks.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             shadows, shadows_max;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		vm_object_lock_shared(object);
		/*
		 * Descend the shadow chain, locking the next object before
		 * unlocking the current one so the chain can't be torn down
		 * underneath us.
		 */
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
22667 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22668 
/*
 * vm_commit_pagezero_status:
 * Tell the pmap layer about the map's minimum offset so it can apply
 * its page-zero policy for this address space.
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
22674 
22675 #if __x86_64__
/*
 * vm_map_set_high_start:
 * (x86_64 only) Record the lowest address at which "high" allocations
 * in this map may start.
 */
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
22683 #endif /* __x86_64__ */
22684 
22685 #if CODE_SIGNING_MONITOR
22686 
/*
 * vm_map_entry_cs_associate:
 *
 * Associate the code-signing information for "entry"'s backing vnode
 * (if any) with the mapping, via the code-signing monitor.
 *
 * Fast-success cases: no pmap, submap entries, CS-exempt address
 * spaces, null objects, and non-executable mappings that aren't being
 * converted for debugging.  Debugger-style remaps
 * (vmkf_remap_prot_copy) are registered as "debug" regions; MAP_JIT
 * entries as JIT regions.  Otherwise the code walks down the entry's
 * shadow chain and through pager backing objects to find the VM object
 * backed by the code-signed vnode, then associates that vnode's CS
 * blobs with the mapping.
 *
 * On success the entry is marked csm_associated (vm_fault can skip its
 * own CS validation) and possibly permanent; on a real failure the
 * entry loses its execute permissions (unless overwriting an immutable
 * mapping).  Returns the monitor's status.
 */
kern_return_t
vm_map_entry_cs_associate(
	vm_map_t                map,
	vm_map_entry_t          entry,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	/*
	 * Check for a debug association mapping before we check for used_for_jit. This
	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
	 * since they are mapped with RW or RX permissions, which the page table monitor
	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
	 * violation when those USER_EXEC pages are mapped as RW.
	 *
	 * Since these pages switch between RW and RX through mprotect, they mimic what
	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
	 * on macOS systems, this works in our favor here and allows us to continue to
	 * support these legacy-programmed applications without sacrificing security on
	 * the page table or the code signing monitor. We don't need to explicitly check
	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
	 * created with RX, then the application must map it as RW in order to first write
	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
	 * Similarly, if the mapping was created as RW, and then switched to RX,
	 * vm_map_protect will again mark the entry as a copy, and both these cases
	 * lead to this if-statement being entered.
	 *
	 * For more information: rdar://115313336.
	 */
	if (vmk_flags.vmkf_remap_prot_copy) {
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);

		/*
		 * csm_associate_debug_region returns not supported when the code signing
		 * monitor is disabled. This is intentional, since cs_ret is checked towards
		 * the end of the function, and if it is not supported, then we still want the
		 * VM to perform code-signing enforcement on this entry. That said, if we don't
		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
		 * cases, which will cause a violation when attempted to be mapped as writable).
		 */
		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	/* MAP_JIT regions are registered with the monitor as JIT, not signed code */
	if (entry->used_for_jit) {
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/*
		 * go to the bottom of cs_object's shadow chain
		 * (hand-over-hand shared locking, accumulating the offset)
		 */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			cs_offset += cs_object->vo_shadow_offset;
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			/* anonymous memory or no pager: nothing to associate */
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 *      vnode_pager
		 *	apple_protect_pager
		 *      shared_region_pager
		 *	fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}
22921 
22922 #endif /* CODE_SIGNING_MONITOR */
22923 
22924 inline bool
vm_map_is_corpse_source(vm_map_t map)22925 vm_map_is_corpse_source(vm_map_t map)
22926 {
22927 	bool status = false;
22928 	if (map) {
22929 		vm_map_lock_read(map);
22930 		status = map->corpse_source;
22931 		vm_map_unlock_read(map);
22932 	}
22933 	return status;
22934 }
22935 
22936 inline void
vm_map_set_corpse_source(vm_map_t map)22937 vm_map_set_corpse_source(vm_map_t map)
22938 {
22939 	if (map) {
22940 		vm_map_lock(map);
22941 		map->corpse_source = true;
22942 		vm_map_unlock(map);
22943 	}
22944 }
22945 
22946 inline void
vm_map_unset_corpse_source(vm_map_t map)22947 vm_map_unset_corpse_source(vm_map_t map)
22948 {
22949 	if (map) {
22950 		vm_map_lock(map);
22951 		map->corpse_source = false;
22952 		vm_map_unlock(map);
22953 	}
22954 }
22955 /*
22956  * FORKED CORPSE FOOTPRINT
22957  *
22958  * A forked corpse gets a copy of the original VM map but its pmap is mostly
22959  * empty since it never ran and never got to fault in any pages.
22960  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22961  * a forked corpse would therefore return very little information.
22962  *
22963  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22964  * to vm_map_fork() to collect footprint information from the original VM map
22965  * and its pmap, and store it in the forked corpse's VM map.  That information
22966  * is stored in place of the VM map's "hole list" since we'll never need to
22967  * lookup for holes in the corpse's map.
22968  *
22969  * The corpse's footprint info looks like this:
22970  *
22971  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22972  * as follows:
22973  *                     +---------------------------------------+
22974  *            header-> | cf_size                               |
22975  *                     +-------------------+-------------------+
22976  *                     | cf_last_region    | cf_last_zeroes    |
22977  *                     +-------------------+-------------------+
22978  *           region1-> | cfr_vaddr                             |
22979  *                     +-------------------+-------------------+
22980  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
22981  *                     +---------------------------------------+
22982  *                     | d4 | d5 | ...                         |
22983  *                     +---------------------------------------+
22984  *                     | ...                                   |
22985  *                     +-------------------+-------------------+
22986  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
22987  *                     +-------------------+-------------------+
22988  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
22989  *                     +---------------------------------------+
22990  *                     | d0 | d1 ...                           |
22991  *                     +---------------------------------------+
22992  *                       ...
22993  *                     +---------------------------------------+
22994  *       last region-> | cfr_vaddr                             |
22995  *                     +---------------------------------------+
22996  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
22997  *                     +---------------------------------------+
22998  *                       ...
22999  *                     +---------------------------------------+
23000  *                     | dx | dy | dz | na | na | na | na | na |
23001  *                     +---------------------------------------+
23002  *
23003  * where:
23004  *      cf_size:	total size of the buffer (rounded to page size)
23005  *      cf_last_region:	offset in the buffer of the last "region" sub-header
23006  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
23007  *			of last region
23008  *	cfr_vaddr:	virtual address of the start of the covered "region"
23009  *	cfr_num_pages:	number of pages in the covered "region"
23010  *	d*:		disposition of the page at that virtual address
23011  * Regions in the buffer are word-aligned.
23012  *
23013  * We estimate the size of the buffer based on the number of memory regions
23014  * and the virtual size of the address space.  While copying each memory region
23015  * during vm_map_fork(), we also collect the footprint info for that region
23016  * and store it in the buffer, packing it as much as possible (coalescing
23017  * contiguous memory regions to avoid having too many region headers and
23018  * avoiding long streaks of "zero" page dispositions by splitting footprint
23019  * "regions", so the number of regions in the footprint buffer might not match
23020  * the number of memory regions in the address space.
23021  *
23022  * We also have to copy the original task's "nonvolatile" ledgers since that's
23023  * part of the footprint and will need to be reported to any tool asking for
23024  * the footprint information of the forked corpse.
23025  */
23026 
/* statistics about corpse footprint buffers, for debugging and tuning */
uint64_t vm_map_corpse_footprint_count = 0;     /* number of footprints collected */
uint64_t vm_map_corpse_footprint_size_avg = 0;  /* running average buffer size */
uint64_t vm_map_corpse_footprint_size_max = 0;  /* largest buffer size seen */
uint64_t vm_map_corpse_footprint_full = 0;      /* collections that ran out of buffer space */
uint64_t vm_map_corpse_footprint_no_buf = 0;    /* failed buffer allocations */
23032 
/*
 * Header at the start of the corpse footprint buffer (see layout diagram
 * above).  The "cfu" union is used as "cf_last_zeroes" while the footprint
 * is being collected and repurposed as "cf_hint_region" once the buffer is
 * only being looked up.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
23046 typedef uint8_t cf_disp_t;
23047 struct vm_map_corpse_footprint_region {
23048 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
23049 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
23050 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
23051 } __attribute__((packed));
23052 
23053 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)23054 vm_page_disposition_to_cf_disp(
23055 	int disposition)
23056 {
23057 	assert(sizeof(cf_disp_t) == 1);
23058 	/* relocate bits that don't fit in a "uint8_t" */
23059 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23060 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23061 	}
23062 	/* cast gets rid of extra bits */
23063 	return (cf_disp_t) disposition;
23064 }
23065 
23066 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)23067 vm_page_cf_disp_to_disposition(
23068 	cf_disp_t cf_disp)
23069 {
23070 	int disposition;
23071 
23072 	assert(sizeof(cf_disp_t) == 1);
23073 	disposition = (int) cf_disp;
23074 	/* move relocated bits back in place */
23075 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23076 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23077 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23078 	}
23079 	return disposition;
23080 }
23081 
/*
 * vm_map_corpse_footprint_new_region:
 *      closes the current footprint "region" and creates a new one
 *
 * The current (last) region is first trimmed of its trailing "zero"
 * dispositions; if that leaves it empty, it is reused in place instead
 * of appending a new one.
 *
 * Returns NULL if there's not enough space in the buffer for a new region.
 */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t       footprint_edge;
	uint32_t        new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	/* first address past the end of the footprint buffer */
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* current last region, located by its offset in the header */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region: past the last region's dispositions,
	 * rounded up to the buffer's word alignment */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
23144 
/*
 * vm_map_corpse_footprint_collect:
 *	collect footprint information for "old_entry" in "old_map" and
 *	stores it in "new_map"'s vmmap_footprint_info.
 *
 * On the first call for "new_map", allocates the pageable footprint
 * buffer (sized from the old map's entry count and virtual size, capped
 * at VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE, with a trailing guard page).
 * Both maps must be locked exclusively by the caller.
 *
 * Returns KERN_SUCCESS, KERN_NOT_SUPPORTED if the maps' roles are wrong,
 * a kmem_alloc() error, or KERN_RESOURCE_SHORTAGE if the buffer is full.
 */
kern_return_t
vm_map_corpse_footprint_collect(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_map_offset_t va;
	kern_return_t   kr;
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;
	cf_disp_t       *next_disp_p;
	uintptr_t       footprint_edge;
	uint32_t        num_pages_tmp;
	int             effective_page_size;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));

	va = old_entry->vme_start;

	vm_map_lock_assert_exclusive(old_map);
	vm_map_lock_assert_exclusive(new_map);

	assert(new_map->has_corpse_footprint);
	assert(!old_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    old_map->has_corpse_footprint) {
		/*
		 * This can only transfer footprint info from a
		 * map with a live pmap to a map with a corpse footprint.
		 */
		return KERN_NOT_SUPPORTED;
	}

	if (new_map->vmmap_corpse_footprint == NULL) {
		/* first entry for this corpse: allocate the footprint buffer */
		vm_offset_t     buf;
		vm_size_t       buf_size;

		buf = 0;
		buf_size = (sizeof(*footprint_header) +
		    (old_map->hdr.nentries
		    *
		    (sizeof(*footprint_region) +
		    +3))            /* potential alignment for each region */
		    +
		    ((old_map->size / effective_page_size)
		    *
		    sizeof(cf_disp_t)));      /* disposition for each page */
//		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
		buf_size = round_page(buf_size);

		/* limit buffer to 1 page to validate overflow detection */
//		buf_size = PAGE_SIZE;

		/* limit size to a somewhat sane amount */
#if XNU_TARGET_OS_OSX
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
#endif /* XNU_TARGET_OS_OSX */
		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
		}

		/*
		 * Allocate the pageable buffer (with a trailing guard page).
		 * It will be zero-filled on demand.
		 */
		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
		    VM_KERN_MEMORY_DIAG);
		if (kr != KERN_SUCCESS) {
			vm_map_corpse_footprint_no_buf++;
			return kr;
		}

		/* initialize header and 1st region */
		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
		new_map->vmmap_corpse_footprint = footprint_header;

		footprint_header->cf_size = buf_size;
		footprint_header->cf_last_region =
		    sizeof(*footprint_header);
		footprint_header->cf_last_zeroes = 0;

		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
		footprint_region->cfr_vaddr = 0;
		footprint_region->cfr_num_pages = 0;
	} else {
		/* retrieve header and last region */
		footprint_header = (struct vm_map_corpse_footprint_header *)
		    new_map->vmmap_corpse_footprint;
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
	}
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);

	/* does this entry start right after the last recorded page? */
	if ((footprint_region->cfr_vaddr +
	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
	    effective_page_size))
	    != old_entry->vme_start) {
		uint64_t num_pages_delta, num_pages_delta_size;
		uint32_t region_offset_delta_size;

		/*
		 * Not the next contiguous virtual address:
		 * start a new region or store "zero" dispositions for
		 * the missing pages?
		 */
		/* size of gap in actual page dispositions */
		num_pages_delta = ((old_entry->vme_start -
		    footprint_region->cfr_vaddr) / effective_page_size)
		    - footprint_region->cfr_num_pages;
		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
		/* size of gap as a new footprint region header */
		region_offset_delta_size =
		    (sizeof(*footprint_region) +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
		    sizeof(int)) -
		    ((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
//		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
		if (region_offset_delta_size < num_pages_delta_size ||
		    os_add3_overflow(footprint_region->cfr_num_pages,
		    (uint32_t) num_pages_delta,
		    1,
		    &num_pages_tmp)) {
			/*
			 * Storing data for this gap would take more space
			 * than inserting a new footprint region header:
			 * let's start a new region and save space. If it's a
			 * tie, let's avoid using a new region, since that
			 * would require more region hops to find the right
			 * range during lookups.
			 *
			 * If the current region's cfr_num_pages would overflow
			 * if we added "zero" page dispositions for the gap,
			 * no choice but to start a new region.
			 */
//			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(footprint_header);
			/* check that we're not going over the edge */
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			/* initialize new region as empty */
			footprint_region->cfr_vaddr = old_entry->vme_start;
			footprint_region->cfr_num_pages = 0;
		} else {
			/*
			 * Store "zero" page dispositions for the missing
			 * pages.
			 */
//			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
			for (; num_pages_delta > 0; num_pages_delta--) {
				next_disp_p = (cf_disp_t *)
				    ((uintptr_t) footprint_region +
				    sizeof(*footprint_region));
				next_disp_p += footprint_region->cfr_num_pages;
				/* check that we're not going over the edge */
				if ((uintptr_t)next_disp_p >= footprint_edge) {
					goto over_the_edge;
				}
				/* store "zero" disposition for this gap page */
				footprint_region->cfr_num_pages++;
				*next_disp_p = (cf_disp_t) 0;
				footprint_header->cf_last_zeroes++;
			}
		}
	}

	/* record one disposition per page of the entry */
	for (va = old_entry->vme_start;
	    va < old_entry->vme_end;
	    va += effective_page_size) {
		int             disposition;
		cf_disp_t       cf_disp;

		vm_map_footprint_query_page_info(old_map,
		    old_entry,
		    va,
		    &disposition);
		cf_disp = vm_page_disposition_to_cf_disp(disposition);

//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);

		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
			/*
			 * Ignore "zero" dispositions at start of
			 * region: just move start of region.
			 */
			footprint_region->cfr_vaddr += effective_page_size;
			continue;
		}

		/* would region's cfr_num_pages overflow? */
		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
		    &num_pages_tmp)) {
			/* overflow: create a new region */
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(
				footprint_header);
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			footprint_region->cfr_vaddr = va;
			footprint_region->cfr_num_pages = 0;
		}

		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
		    sizeof(*footprint_region));
		next_disp_p += footprint_region->cfr_num_pages;
		/* check that we're not going over the edge */
		if ((uintptr_t)next_disp_p >= footprint_edge) {
			goto over_the_edge;
		}
		/* store this disposition */
		*next_disp_p = cf_disp;
		footprint_region->cfr_num_pages++;

		if (cf_disp != 0) {
			/* non-zero disp: break the current zero streak */
			footprint_header->cf_last_zeroes = 0;
			/* done */
			continue;
		}

		/* zero disp: add to the current streak of zeroes */
		footprint_header->cf_last_zeroes++;
		if ((footprint_header->cf_last_zeroes +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
		    (sizeof(int) - 1),
		    sizeof(int))) <
		    (sizeof(*footprint_header))) {
			/*
			 * There are not enough trailing "zero" dispositions
			 * (+ the extra padding we would need for the previous
			 * region); creating a new region would not save space
			 * at this point, so let's keep this "zero" disposition
			 * in this region and reconsider later.
			 */
			continue;
		}
		/*
		 * Create a new region to avoid having too many consecutive
		 * "zero" dispositions.
		 */
		new_footprint_region =
		    vm_map_corpse_footprint_new_region(footprint_header);
		if (new_footprint_region == NULL) {
			goto over_the_edge;
		}
		footprint_region = new_footprint_region;
		/* initialize the new region as empty ... */
		footprint_region->cfr_num_pages = 0;
		/* ... and skip this "zero" disp */
		footprint_region->cfr_vaddr = va + effective_page_size;
	}

	return KERN_SUCCESS;

over_the_edge:
//	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
	vm_map_corpse_footprint_full++;
	return KERN_RESOURCE_SHORTAGE;
}
23424 
/*
 * vm_map_corpse_footprint_collect_done:
 *	completes the footprint collection by getting rid of any remaining
 *	trailing "zero" dispositions and trimming the unused part of the
 *	kernel buffer
 *
 * The unused tail of the buffer (including the original trailing guard
 * page) is deallocated and a new guard page is established right after
 * the trimmed data via vm_protect(VM_PROT_NONE).
 */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t       buf_size, actual_size;
	kern_return_t   kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* bytes actually used: header through the last region's dispositions */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

//	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update the global footprint-size statistics */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/* free [actual_size + PAGE_SIZE, buf_size + PAGE_SIZE):
		 * the unused tail plus the old trailing guard page */
		kr = vm_deallocate(kernel_map,
		    vm_sanitize_wrap_addr((vm_address_t)footprint_header +
		    actual_size + PAGE_SIZE), /* trailing guard page */
		    vm_sanitize_wrap_size(buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		/* re-establish a guard page right after the trimmed data */
		kr = vm_protect(kernel_map,
		    (vm_address_t)footprint_header + actual_size,
		    PAGE_SIZE,
		    FALSE,             /* set_maximum */
		    vm_sanitize_wrap_prot(VM_PROT_NONE));
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	footprint_header->cf_size = actual_size;
}
23501 
/*
 * vm_map_corpse_footprint_query_page_info:
 *	retrieves the disposition of the page at virtual address "vaddr"
 *	in the forked corpse's VM map
 *
 * Starts the search at the "cf_hint_region" cached in the footprint
 * header and scans forward region by region; the hint is updated (without
 * synchronization, see "racy" notes below) on each successful lookup.
 * Returns KERN_INVALID_ARGUMENT if "map" has no footprint buffer;
 * otherwise KERN_SUCCESS, with *disposition_p set to 0 when the page is
 * not covered by any footprint region.
 *
 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
 */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t        footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int             disp_idx;
	kern_return_t   kr;
	int             effective_page_size;
	cf_disp_t       cf_disp;

	if (!map->has_corpse_footprint) {
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	/* clamp the hint to a valid region offset before dereferencing it */
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset > footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* scan forward until we reach the region covering "va" (or run out) */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
//	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
23615 
23616 void
vm_map_corpse_footprint_destroy(vm_map_t map)23617 vm_map_corpse_footprint_destroy(
23618 	vm_map_t        map)
23619 {
23620 	if (map->has_corpse_footprint &&
23621 	    map->vmmap_corpse_footprint != 0) {
23622 		struct vm_map_corpse_footprint_header *footprint_header;
23623 		vm_size_t buf_size;
23624 		kern_return_t kr;
23625 
23626 		footprint_header = map->vmmap_corpse_footprint;
23627 		buf_size = footprint_header->cf_size;
23628 		kr = vm_deallocate(kernel_map,
23629 		    vm_sanitize_wrap_addr((vm_offset_t) map->vmmap_corpse_footprint),
23630 		    vm_sanitize_wrap_size(buf_size + PAGE_SIZE)); /* trailing guard page */
23631 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
23632 		map->vmmap_corpse_footprint = 0;
23633 		map->has_corpse_footprint = FALSE;
23634 	}
23635 }
23636 
23637 /*
23638  * vm_map_copy_footprint_ledgers:
23639  *	copies any ledger that's relevant to the memory footprint of "old_task"
23640  *	into the forked corpse's task ("new_task")
23641  */
23642 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)23643 vm_map_copy_footprint_ledgers(
23644 	task_t  old_task,
23645 	task_t  new_task)
23646 {
23647 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23648 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23649 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23650 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23651 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23652 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23653 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23654 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23655 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23656 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23657 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23658 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23659 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23660 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23661 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23662 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23663 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23664 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23665 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23666 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23667 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23668 }
23669 
23670 /*
23671  * vm_map_copy_ledger:
23672  *	copy a single ledger from "old_task" to "new_task"
23673  */
23674 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)23675 vm_map_copy_ledger(
23676 	task_t  old_task,
23677 	task_t  new_task,
23678 	int     ledger_entry)
23679 {
23680 	ledger_amount_t old_balance, new_balance, delta;
23681 
23682 	assert(new_task->map->has_corpse_footprint);
23683 	if (!new_task->map->has_corpse_footprint) {
23684 		return;
23685 	}
23686 
23687 	/* turn off sanity checks for the ledger we're about to mess with */
23688 	ledger_disable_panic_on_negative(new_task->ledger,
23689 	    ledger_entry);
23690 
23691 	/* adjust "new_task" to match "old_task" */
23692 	ledger_get_balance(old_task->ledger,
23693 	    ledger_entry,
23694 	    &old_balance);
23695 	ledger_get_balance(new_task->ledger,
23696 	    ledger_entry,
23697 	    &new_balance);
23698 	if (new_balance == old_balance) {
23699 		/* new == old: done */
23700 	} else if (new_balance > old_balance) {
23701 		/* new > old ==> new -= new - old */
23702 		delta = new_balance - old_balance;
23703 		ledger_debit(new_task->ledger,
23704 		    ledger_entry,
23705 		    delta);
23706 	} else {
23707 		/* new < old ==> new += old - new */
23708 		delta = old_balance - new_balance;
23709 		ledger_credit(new_task->ledger,
23710 		    ledger_entry,
23711 		    delta);
23712 	}
23713 }
23714 
/*
 * vm_map_get_pmap:
 * returns the pmap associated with the vm_map
 */
pmap_t
vm_map_get_pmap(vm_map_t map)
{
	/* exported accessor: simply forwards to vm_map_pmap() */
	return vm_map_pmap(map);
}
23724 
/*
 * vm_map_get_phys_page:
 *	returns the physical page number backing "addr" in "map",
 *	or 0 if no physical page could be found.
 *
 * Descends through submaps (hand-over-hand locking: lock the submap
 * before unlocking the parent) and through the VM object shadow chain;
 * for phys_contiguous objects with no resident page, triggers a fault
 * and retries.
 */
ppnum_t
vm_map_get_phys_page(
	vm_map_t                map,
	vm_offset_t             addr)
{
	vm_object_offset_t      offset;
	vm_object_t             object;
	vm_map_offset_t         map_offset;
	vm_map_entry_t          entry;
	ppnum_t                 phys_page = 0;

	map_offset = vm_map_trunc_page(addr, PAGE_MASK);

	vm_map_lock(map);
	while (vm_map_lookup_entry(map, map_offset, &entry)) {
		if (entry->is_sub_map) {
			/* descend into the submap, keeping a lock held at all times */
			vm_map_t        old_map;
			vm_map_lock(VME_SUBMAP(entry));
			old_map = map;
			map = VME_SUBMAP(entry);
			map_offset = (VME_OFFSET(entry) +
			    (map_offset - entry->vme_start));
			vm_map_unlock(old_map);
			continue;
		}
		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			vm_map_unlock(map);
			return (ppnum_t) 0;
		}
		if (VME_OBJECT(entry)->phys_contiguous) {
			/* These are  not standard pageable memory mappings */
			/* If they are not present in the object they will  */
			/* have to be picked up from the pager through the  */
			/* fault mechanism.  */
			if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
				/* need to call vm_fault */
				vm_map_unlock(map);
				vm_fault(map, map_offset, VM_PROT_NONE,
				    FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
				    THREAD_UNINT, NULL, 0);
				vm_map_lock(map);
				continue;
			}
			offset = (VME_OFFSET(entry) +
			    (map_offset - entry->vme_start));
			phys_page = (ppnum_t)
			    ((VME_OBJECT(entry)->vo_shadow_offset
			    + offset) >> PAGE_SHIFT);
			break;
		}
		offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
		object = VME_OBJECT(entry);
		vm_object_lock(object);
		/* walk down the object's shadow chain looking for a resident page */
		while (TRUE) {
			vm_page_t dst_page = vm_page_lookup(object, offset);
			if (dst_page == VM_PAGE_NULL) {
				if (object->shadow) {
					vm_object_t old_object;
					vm_object_lock(object->shadow);
					old_object = object;
					offset = offset + object->vo_shadow_offset;
					object = object->shadow;
					vm_object_unlock(old_object);
				} else {
					vm_object_unlock(object);
					break;
				}
			} else {
				phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
				vm_object_unlock(object);
				break;
			}
		}
		break;
	}

	vm_map_unlock(map);
	return phys_page;
}
23804 
23805 #if CONFIG_MAP_RANGES
/*
 * Bitmaps indexed by VM memory tag (VM_MEMORY_*), marking which tags are
 * steered into the "heap" (data) range and the "large file" range
 * respectively.  Populated once by vm_map_range_map_init().
 */
static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];

/* user range IDs must line up with the generic mach_vm_range numbering */
static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23811 
23812 /*
23813  * vm_map_range_map_init:
23814  *  initializes the VM range ID map to enable index lookup
23815  *  of user VM ranges based on VM tag from userspace.
23816  */
23817 static void
vm_map_range_map_init(void)23818 vm_map_range_map_init(void)
23819 {
23820 	/*
23821 	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23822 	 * - the former is malloc metadata which should be kept separate
23823 	 * - the latter has its own ranges
23824 	 */
23825 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23826 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23827 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23828 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23829 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23830 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23831 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23832 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23833 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23834 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23835 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23836 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23837 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23838 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23839 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23840 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23841 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
23842 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
23843 }
23844 
23845 static struct mach_vm_range
vm_map_range_random_uniform(vm_map_size_t req_size,vm_map_offset_t min_addr,vm_map_offset_t max_addr,vm_map_offset_t offmask)23846 vm_map_range_random_uniform(
23847 	vm_map_size_t           req_size,
23848 	vm_map_offset_t         min_addr,
23849 	vm_map_offset_t         max_addr,
23850 	vm_map_offset_t         offmask)
23851 {
23852 	vm_map_offset_t random_addr;
23853 	struct mach_vm_range alloc;
23854 
23855 	req_size = (req_size + offmask) & ~offmask;
23856 	min_addr = (min_addr + offmask) & ~offmask;
23857 	max_addr = max_addr & ~offmask;
23858 
23859 	read_random(&random_addr, sizeof(random_addr));
23860 	random_addr %= (max_addr - req_size - min_addr);
23861 	random_addr &= ~offmask;
23862 
23863 	alloc.min_address = min_addr + random_addr;
23864 	alloc.max_address = min_addr + random_addr + req_size;
23865 	return alloc;
23866 }
23867 
23868 static vm_map_offset_t
vm_map_range_offmask(void)23869 vm_map_range_offmask(void)
23870 {
23871 	uint32_t pte_depth;
23872 
23873 	/*
23874 	 * PTE optimizations
23875 	 *
23876 	 *
23877 	 * 16k pages systems
23878 	 * ~~~~~~~~~~~~~~~~~
23879 	 *
23880 	 * A single L1 (sub-)page covers the address space.
23881 	 * - L2 pages cover 64G,
23882 	 * - L3 pages cover 32M.
23883 	 *
23884 	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
23885 	 * As a result, we really only need to align the ranges to 32M to avoid
23886 	 * partial L3 pages.
23887 	 *
23888 	 * On macOS, the usage of L2 pages will increase, so as a result we will
23889 	 * want to align ranges to 64G in order to utilize them fully.
23890 	 *
23891 	 *
23892 	 * 4k pages systems
23893 	 * ~~~~~~~~~~~~~~~~
23894 	 *
23895 	 * A single L0 (sub-)page covers the address space.
23896 	 * - L1 pages cover 512G,
23897 	 * - L2 pages cover 1G,
23898 	 * - L3 pages cover 2M.
23899 	 *
23900 	 * The long tail of processes on a system will tend to have a VA usage
23901 	 * (ignoring the shared regions) in the 100s of MB order of magnitnude.
23902 	 * This is achievable with a single L1 and a few L2s without
23903 	 * randomization.
23904 	 *
23905 	 * However once randomization is introduced, the system will immediately
23906 	 * need several L1s and many more L2s. As a result:
23907 	 *
23908 	 * - on embedded devices, the cost of these extra pages isn't
23909 	 *   sustainable, and we just disable the feature entirely,
23910 	 *
23911 	 * - on macOS we align ranges to a 512G boundary so that the extra L1
23912 	 *   pages can be used to their full potential.
23913 	 */
23914 
23915 	/*
23916 	 * note, this function assumes _non exotic mappings_
23917 	 * which is why it uses the native kernel's PAGE_SHIFT.
23918 	 */
23919 #if XNU_PLATFORM_MacOSX
23920 	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
23921 #else /* !XNU_PLATFORM_MacOSX */
23922 	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
23923 #endif /* !XNU_PLATFORM_MacOSX */
23924 
23925 	if (pte_depth == 0) {
23926 		return 0;
23927 	}
23928 
23929 	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
23930 }
23931 
23932 /*
23933  * vm_map_range_configure:
23934  *	configures the user vm_map ranges by increasing the maximum VA range of
23935  *  the map and carving out a range at the end of VA space (searching backwards
23936  *  in the newly expanded map).
23937  */
23938 kern_return_t
vm_map_range_configure(vm_map_t map,__unused bool needs_extra_jumbo_va)23939 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
23940 {
23941 	const vm_map_offset_t offmask = vm_map_range_offmask();
23942 	struct mach_vm_range data_range;
23943 	vm_map_offset_t default_end;
23944 	kern_return_t kr;
23945 
23946 	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23947 		/*
23948 		 * No point doing vm ranges in a 32bit address space.
23949 		 */
23950 		return KERN_NOT_SUPPORTED;
23951 	}
23952 
23953 	/* Should not be applying ranges to kernel map or kernel map submaps */
23954 	assert(vm_map_pmap(map) != kernel_pmap);
23955 
23956 #if XNU_PLATFORM_MacOSX
23957 
23958 	/*
23959 	 * on macOS, the address space is a massive 47 bits (128T),
23960 	 * with several carve outs that processes can't use:
23961 	 * - the shared region
23962 	 * - the commpage region
23963 	 * - the GPU carve out (if applicable)
23964 	 *
23965 	 * and when nano-malloc is in use it desires memory at the 96T mark.
23966 	 *
23967 	 * However, their location is architecture dependent:
23968 	 * - On intel, the shared region and commpage are
23969 	 *   at the very end of the usable address space (above +127T),
23970 	 *   and there is no GPU carve out, and pthread wants to place
23971 	 *   threads at the 112T mark (0x70T).
23972 	 *
23973 	 * - On arm64, these are in the same spot as on embedded devices:
23974 	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
23975 	 *   o commpage region: [63G,  64G)
23976 	 *   o GPU carve out:   [64G, 448G)
23977 	 *
23978 	 * This is conveninent because the mappings at the end of the address
23979 	 * space (when they exist) are made by the kernel.
23980 	 *
23981 	 * The policy is to allocate a random 1T for the data heap
23982 	 * in the end of the address-space in the:
23983 	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23984 	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
23985 	 */
23986 
23987 	/* see NANOZONE_SIGNATURE in libmalloc */
23988 #if __x86_64__
23989 	default_end = 0x71ull << 40;
23990 #else
23991 	default_end = 0x61ull << 40;
23992 #endif
23993 	data_range  = vm_map_range_random_uniform(1ull << 40,
23994 	        default_end, 0x7full << 40, offmask);
23995 
23996 #else /* !XNU_PLATFORM_MacOSX */
23997 
23998 	/*
23999 	 * Embedded devices:
24000 	 *
24001 	 *   The default VA Size scales with the device physical memory.
24002 	 *
24003 	 *   Out of that:
24004 	 *   - the "zero" page typically uses 4G + some slide
24005 	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
24006 	 *
24007 	 *   Without the use of jumbo or any adjustment to the address space,
24008 	 *   a default VM map typically looks like this:
24009 	 *
24010 	 *       0G -->╒════════════╕
24011 	 *             │  pagezero  │
24012 	 *             │  + slide   │
24013 	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
24014 	 *             │            │
24015 	 *       6G -->├────────────┤
24016 	 *             │   shared   │
24017 	 *             │   region   │
24018 	 *      10G -->├────────────┤
24019 	 *             │            │
24020 	 *   max_va -->├────────────┤<-- vm_map_max(map)
24021 	 *             │            │
24022 	 *             ╎   jumbo    ╎
24023 	 *             ╎            ╎
24024 	 *             │            │
24025 	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24026 	 *             │  commpage  │
24027 	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24028 	 *             │            │
24029 	 *             ╎    GPU     ╎
24030 	 *             ╎  carveout  ╎
24031 	 *             │            │
24032 	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24033 	 *             │            │
24034 	 *             ╎            ╎
24035 	 *             ╎            ╎
24036 	 *             │            │
24037 	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24038 	 *
24039 	 *   When this drawing was made, "max_va" was smaller than
24040 	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24041 	 *   12G of address space for the zero-page, slide, files,
24042 	 *   binaries, heap ...
24043 	 *
24044 	 *   We will want to make a "heap/data" carve out inside
24045 	 *   the jumbo range of half of that usable space, assuming
24046 	 *   that this is less than a forth of the jumbo range.
24047 	 *
24048 	 *   The assert below intends to catch when max_va grows
24049 	 *   too large for this heuristic.
24050 	 */
24051 
24052 	vm_map_lock_read(map);
24053 	default_end = vm_map_max(map);
24054 	vm_map_unlock_read(map);
24055 
24056 	/*
24057 	 * Check that we're not already jumbo'd,
24058 	 * or our address space was somehow modified.
24059 	 *
24060 	 * If so we cannot guarantee that we can set up the ranges
24061 	 * safely without interfering with the existing map.
24062 	 */
24063 	if (default_end > vm_compute_max_offset(true)) {
24064 		return KERN_NO_SPACE;
24065 	}
24066 
24067 	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24068 		/*
24069 		 * an override boot-arg was set, disable user-ranges
24070 		 *
24071 		 * XXX: this is problematic because it means these boot-args
24072 		 *      no longer test the behavior changing the value
24073 		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
24074 		 */
24075 		return KERN_NOT_SUPPORTED;
24076 	}
24077 
24078 	/* expand the default VM space to 64GB */
24079 	vm_map_set_jumbo(map);
24080 
24081 	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24082 	data_range = vm_map_range_random_uniform(GiB(10),
24083 	    default_end + PAGE_SIZE, vm_map_max(map), offmask);
24084 
24085 #endif /* !XNU_PLATFORM_MacOSX */
24086 
24087 	/*
24088 	 * Poke holes so that ASAN or people listing regions
24089 	 * do not think this space is free.
24090 	 */
24091 
24092 	if (default_end != data_range.min_address) {
24093 		kr = vm_map_enter(map, &default_end,
24094 		    data_range.min_address - default_end,
24095 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24096 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24097 		assert(kr == KERN_SUCCESS);
24098 	}
24099 
24100 	if (data_range.max_address != vm_map_max(map)) {
24101 		vm_map_entry_t entry;
24102 		vm_size_t size;
24103 
24104 		/*
24105 		 * Extend the end of the hole to the next VM entry or the end of the map,
24106 		 * whichever comes first.
24107 		 */
24108 		vm_map_lock_read(map);
24109 		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24110 		if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24111 			size = vm_map_max(map) - data_range.max_address;
24112 		} else {
24113 			size = entry->vme_start - data_range.max_address;
24114 		}
24115 		vm_map_unlock_read(map);
24116 
24117 		kr = vm_map_enter(map, &data_range.max_address, size,
24118 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24119 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24120 		assert(kr == KERN_SUCCESS);
24121 	}
24122 
24123 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24124 	if (needs_extra_jumbo_va) {
24125 		/* This will grow the address space to MACH_VM_MAX_ADDRESS */
24126 		vm_map_set_extra_jumbo(map);
24127 	}
24128 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24129 
24130 	vm_map_lock(map);
24131 	map->default_range.min_address = vm_map_min(map);
24132 	map->default_range.max_address = default_end;
24133 	map->data_range = data_range;
24134 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24135 	/* If process has "extra jumbo" entitlement, enable large file range */
24136 	if (needs_extra_jumbo_va) {
24137 		map->large_file_range = vm_map_range_random_uniform(TiB(1),
24138 		    MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24139 	}
24140 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24141 	map->uses_user_ranges = true;
24142 	vm_map_unlock(map);
24143 
24144 	return KERN_SUCCESS;
24145 }
24146 
24147 /*
24148  * vm_map_range_fork:
24149  *	clones the array of ranges from old_map to new_map in support
24150  *  of a VM map fork.
24151  */
24152 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)24153 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24154 {
24155 	if (!old_map->uses_user_ranges) {
24156 		/* nothing to do */
24157 		return;
24158 	}
24159 
24160 	new_map->default_range = old_map->default_range;
24161 	new_map->data_range = old_map->data_range;
24162 
24163 	if (old_map->extra_ranges_count) {
24164 		vm_map_user_range_t otable, ntable;
24165 		uint16_t count;
24166 
24167 		otable = old_map->extra_ranges;
24168 		count  = old_map->extra_ranges_count;
24169 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24170 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
24171 		memcpy(ntable, otable,
24172 		    count * sizeof(struct vm_map_user_range));
24173 
24174 		new_map->extra_ranges_count = count;
24175 		new_map->extra_ranges = ntable;
24176 	}
24177 
24178 	new_map->uses_user_ranges = true;
24179 }
24180 
24181 /*
24182  * vm_map_get_user_range:
24183  *	copy the VM user range for the given VM map and range ID.
24184  */
24185 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)24186 vm_map_get_user_range(
24187 	vm_map_t                map,
24188 	vm_map_range_id_t       range_id,
24189 	mach_vm_range_t         range)
24190 {
24191 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
24192 		return KERN_INVALID_ARGUMENT;
24193 	}
24194 
24195 	switch (range_id) {
24196 	case UMEM_RANGE_ID_DEFAULT:
24197 		*range = map->default_range;
24198 		return KERN_SUCCESS;
24199 
24200 	case UMEM_RANGE_ID_HEAP:
24201 		*range = map->data_range;
24202 		return KERN_SUCCESS;
24203 
24204 	case UMEM_RANGE_ID_LARGE_FILE:
24205 		/*
24206 		 * Because this function tells a user-space process about the user
24207 		 * ranges in its VM map, this case communicates whether the large file
24208 		 * range is in use. Note that this is different from how the large file
24209 		 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24210 		 * VA policy and return either the large file range or data range,
24211 		 * depending on whether the large file range is enabled.
24212 		 */
24213 		if (map->large_file_range.min_address != map->large_file_range.max_address) {
24214 			/* large file range is configured and should be used */
24215 			*range = map->large_file_range;
24216 		} else {
24217 			return KERN_INVALID_ARGUMENT;
24218 		}
24219 		return KERN_SUCCESS;
24220 
24221 	default:
24222 		return KERN_INVALID_ARGUMENT;
24223 	}
24224 }
24225 
24226 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)24227 vm_map_user_range_resolve(
24228 	vm_map_t                map,
24229 	mach_vm_address_t       addr,
24230 	mach_vm_size_t          size,
24231 	mach_vm_range_t         range)
24232 {
24233 	struct mach_vm_range tmp;
24234 
24235 	vm_map_lock_assert_held(map);
24236 
24237 	static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24238 	static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24239 
24240 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
24241 		if (range) {
24242 			*range = map->default_range;
24243 		}
24244 		return UMEM_RANGE_ID_DEFAULT;
24245 	}
24246 
24247 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
24248 		if (range) {
24249 			*range = map->data_range;
24250 		}
24251 		return UMEM_RANGE_ID_HEAP;
24252 	}
24253 
24254 	if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24255 		if (range) {
24256 			*range = map->large_file_range;
24257 		}
24258 		return UMEM_RANGE_ID_LARGE_FILE;
24259 	}
24260 
24261 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
24262 		vm_map_user_range_t r = &map->extra_ranges[i];
24263 
24264 		tmp.min_address = r->vmur_min_address;
24265 		tmp.max_address = r->vmur_max_address;
24266 
24267 		if (mach_vm_range_contains(&tmp, addr, size)) {
24268 			if (range) {
24269 				*range = tmp;
24270 			}
24271 			return r->vmur_range_id;
24272 		}
24273 	}
24274 
24275 	if (range) {
24276 		range->min_address = range->max_address = 0;
24277 	}
24278 	return UMEM_RANGE_ID_DEFAULT;
24279 }
24280 #endif /* CONFIG_MAP_RANGES */
24281 
/*
 * vm_map_kernel_flags_update_range_id:
 *	Picks a default range ID for an allocation when the caller did not
 *	specify one: DATA for the kernel map, and (with CONFIG_MAP_RANGES)
 *	the large-file or heap user range based on the VM tag and size.
 */
void
vm_map_kernel_flags_update_range_id(
	vm_map_kernel_flags_t *vmkf,
	vm_map_t map,
	__unused vm_map_size_t size)
{
	if (map == kernel_map) {
		/* kernel allocations with no explicit range go to the data range */
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
		/* user maps: steer by VM tag (see vm_map_range_map_init) and size */
		if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
		    || size >= VM_LARGE_FILE_THRESHOLD) {
			/*
			 * if the map doesn't have the large file range configured,
			 * the range will get resolved to the heap range in `vm_map_get_range`
			 */
			vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
		} else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
			vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
		}
#endif /* CONFIG_MAP_RANGES */
	}
}
24308 
24309 /*
24310  * vm_map_entry_has_device_pager:
24311  * Check if the vm map entry specified by the virtual address has a device pager.
24312  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24313  */
24314 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)24315 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24316 {
24317 	vm_map_entry_t entry;
24318 	vm_object_t object;
24319 	boolean_t result;
24320 
24321 	if (map == NULL) {
24322 		return FALSE;
24323 	}
24324 
24325 	vm_map_lock(map);
24326 	while (TRUE) {
24327 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24328 			result = FALSE;
24329 			break;
24330 		}
24331 		if (entry->is_sub_map) {
24332 			// Check the submap
24333 			vm_map_t submap = VME_SUBMAP(entry);
24334 			assert(submap != NULL);
24335 			vm_map_lock(submap);
24336 			vm_map_unlock(map);
24337 			map = submap;
24338 			continue;
24339 		}
24340 		object = VME_OBJECT(entry);
24341 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24342 			result = TRUE;
24343 			break;
24344 		}
24345 		result = FALSE;
24346 		break;
24347 	}
24348 
24349 	vm_map_unlock(map);
24350 	return result;
24351 }
24352 
24353 
24354 #if MACH_ASSERT
24355 
/* panic policy knobs for imbalanced pmap ledgers (defined in pmap code) */
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * LEDGER_DRIFT: declares the drift counters kept for one ledger entry:
 * how many pmaps were over/under at destroy time, the cumulative
 * over/under amounts, and the worst single over/under seen.
 */
#define LEDGER_DRIFT(__LEDGER)                    \
	int             __LEDGER##_over;          \
	ledger_amount_t __LEDGER##_over_total;    \
	ledger_amount_t __LEDGER##_over_max;      \
	int             __LEDGER##_under;         \
	ledger_amount_t __LEDGER##_under_total;   \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator of ledger drift statistics, updated by
 * vm_map_pmap_check_ledgers() each time a pmap is checked.
 */
struct {
	uint64_t        num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint_total);
} pmap_ledgers_drift;
24405 
/*
 * vm_map_pmap_check_ledgers:
 *	Debug (MACH_ASSERT) check run when a pmap is destroyed: verifies that
 *	every task ledger entry has drained back to zero, logs any non-zero
 *	balance, accumulates drift statistics in pmap_ledgers_drift, and
 *	panics (or just prints, per pmap_ledgers_panic) on imbalance.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t          pmap,
	ledger_t        ledger,
	int             pid,
	char            *procname)
{
	ledger_amount_t bal;
	boolean_t       do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * Check one ledger entry: read its balance and panic-on-negative flag;
 * a non-zero balance is logged, folded into the drift stats, and marks
 * the pmap for panic when the entry demands it or the balance exceeds
 * the configured leeway.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	                   task_ledgers.__LEDGER,                       \
	                   &bal);                                       \
	ledger_get_panic_on_negative(ledger,                            \
	                             task_ledgers.__LEDGER,             \
	                             &panic_on_negative);               \
	if (bal != 0) {                                                 \
	        if (panic_on_negative ||                                \
	            (pmap_ledgers_panic &&                              \
	             pmap_ledgers_panic_leeway > 0 &&                   \
	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE;                                \
	        }                                                       \
	        printf("LEDGER BALANCE proc %d (%s) "                   \
	               "\"%s\" = %lld\n",                               \
	               pid, procname, #__LEDGER, bal);                  \
	        if (bal > 0) {                                          \
	                pmap_ledgers_drift.__LEDGER##_over++;           \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                }                                               \
	        } else if (bal < 0) {                                   \
	                pmap_ledgers_drift.__LEDGER##_under++;          \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                }                                               \
	        }                                                       \
	}                                                               \
MACRO_END

	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint_total);

	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
24501 
/*
 * vm_map_pmap_set_process:
 *	Debug (MACH_ASSERT) helper: tags the map's pmap with the owning
 *	process's pid and name for diagnostics.
 */
void
vm_map_pmap_set_process(
	vm_map_t map,
	int pid,
	char *procname)
{
	pmap_set_process(vm_map_pmap(map), pid, procname);
}
24510 
24511 #endif /* MACH_ASSERT */
24512