xref: /xnu-11215.81.4/osfmk/vm/vm_map.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 
91 #include <vm/cpm_internal.h>
92 #include <vm/memory_types.h>
93 #include <vm/vm_compressor_xnu.h>
94 #include <vm/vm_compressor_pager_internal.h>
95 #include <vm/vm_init_xnu.h>
96 #include <vm/vm_fault_internal.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object_internal.h>
99 #include <vm/vm_page_internal.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern_internal.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106 
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_memtag.h>
110 #include <vm/vm_protos_internal.h>
111 #include <vm/vm_purgeable_internal.h>
112 
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_shared_region_internal.h>
115 #include <vm/vm_map_store_internal.h>
116 #include <vm/vm_memory_entry_xnu.h>
117 #include <vm/memory_object_internal.h>
118 #include <vm/vm_memory_entry.h>
119 #include <vm/vm_sanitize_internal.h>
120 #if DEVELOPMENT || DEBUG
121 #include <vm/vm_compressor_info.h>
122 #endif /* DEVELOPMENT || DEBUG */
123 #include <san/kasan.h>
124 
125 #include <sys/resource.h>
126 #include <sys/random.h>
127 #include <sys/codesign.h>
128 #include <sys/code_signing.h>
129 #include <sys/mman.h>
130 #include <sys/reboot.h>
131 #include <sys/kdebug_triage.h>
132 #include <sys/reason.h>
133 
134 #include <libkern/section_keywords.h>
135 
136 #if DEVELOPMENT || DEBUG
137 extern int proc_selfcsflags(void);
138 int vm_log_xnu_user_debug = 0;
139 int panic_on_unsigned_execute = 0;
140 int panic_on_mlock_failure = 0;
141 #endif /* DEVELOPMENT || DEBUG */
142 
143 #if DEVELOPMENT || DEBUG
144 int debug4k_filter = 0;
145 char debug4k_proc_name[1024] = "";
146 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
147 int debug4k_panic_on_misaligned_sharing = 0;
148 const char *debug4k_category_name[] = {
149 	"error",        /* 0 */
150 	"life",         /* 1 */
151 	"load",         /* 2 */
152 	"fault",        /* 3 */
153 	"copy",         /* 4 */
154 	"share",        /* 5 */
155 	"adjust",       /* 6 */
156 	"pmap",         /* 7 */
157 	"mementry",     /* 8 */
158 	"iokit",        /* 9 */
159 	"upl",          /* 10 */
160 	"exc",          /* 11 */
161 	"vfs"           /* 12 */
162 };
163 #endif /* DEVELOPMENT || DEBUG */
164 int debug4k_no_cow_copyin = 0;
165 
166 
167 #if __arm64__
168 extern const int fourk_binary_compatibility_unsafe;
169 #endif /* __arm64__ */
170 extern int proc_selfpid(void);
171 extern char *proc_name_address(void *p);
172 extern const char *proc_best_name(struct proc *p);
173 
174 #if VM_MAP_DEBUG_APPLE_PROTECT
175 int vm_map_debug_apple_protect = 0;
176 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
177 #if VM_MAP_DEBUG_FOURK
178 int vm_map_debug_fourk = 0;
179 #endif /* VM_MAP_DEBUG_FOURK */
180 
181 #if DEBUG || DEVELOPMENT
182 static TUNABLE(bool, vm_map_executable_immutable,
183     "vm_map_executable_immutable", true);
184 #else
185 #define vm_map_executable_immutable true
186 #endif
187 
188 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
189 
190 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
191 /* Internal prototypes
192  */
193 
194 typedef struct vm_map_zap {
195 	vm_map_entry_t          vmz_head;
196 	vm_map_entry_t         *vmz_tail;
197 } *vm_map_zap_t;
198 
199 #define VM_MAP_ZAP_DECLARE(zap) \
200 	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
201 
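/*
 * Example (minimal sketch): a "zap" list collects the entries that
 * vm_map_delete() unlinks from a map while the map lock is held, so
 * they can be reclaimed once the lock has been dropped.  "flags" and
 * "guard" stand for the caller's vmr_flags_t and kmem_guard_t values.
 *
 *	VM_MAP_ZAP_DECLARE(zap_list);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, guard, &zap_list);
 *	vm_map_unlock(map);
 *	// entries chained on zap_list may now be freed safely
 */
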
202 extern kern_return_t vm_map_wire_external(
203 	vm_map_t                map,
204 	vm_map_offset_ut        start_u,
205 	vm_map_offset_ut        end_u,
206 	vm_prot_ut              prot_u,
207 	boolean_t               user_wire) __exported;
208 
209 #if XNU_PLATFORM_MacOSX
210 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
211 #else
212 static
213 #endif
214 kern_return_t vm_map_copyin_common(
215 	vm_map_t                src_map,
216 	vm_map_address_ut       src_addr,
217 	vm_map_size_ut          len,
218 	boolean_t               src_destroy,
219 	boolean_t               src_volatile,
220 	vm_map_copy_t          *copy_result,                           /* OUT */
221 	boolean_t               use_maxprot);
222 
223 static vm_map_entry_t   vm_map_entry_insert(
224 	vm_map_t                map,
225 	vm_map_entry_t          insp_entry,
226 	vm_map_offset_t         start,
227 	vm_map_offset_t         end,
228 	vm_object_t             object,
229 	vm_object_offset_t      offset,
230 	vm_map_kernel_flags_t   vmk_flags,
231 	boolean_t               needs_copy,
232 	vm_prot_t               cur_protection,
233 	vm_prot_t               max_protection,
234 	vm_inherit_t            inheritance,
235 	boolean_t               clear_map_aligned);
236 
237 static void vm_map_simplify_range(
238 	vm_map_t        map,
239 	vm_map_offset_t start,
240 	vm_map_offset_t end);   /* forward */
241 
242 static boolean_t        vm_map_range_check(
243 	vm_map_t        map,
244 	vm_map_offset_t start,
245 	vm_map_offset_t end,
246 	vm_map_entry_t  *entry);
247 
248 static void vm_map_submap_pmap_clean(
249 	vm_map_t        map,
250 	vm_map_offset_t start,
251 	vm_map_offset_t end,
252 	vm_map_t        sub_map,
253 	vm_map_offset_t offset);
254 
255 static void             vm_map_pmap_enter(
256 	vm_map_t                map,
257 	vm_map_offset_t         addr,
258 	vm_map_offset_t         end_addr,
259 	vm_object_t             object,
260 	vm_object_offset_t      offset,
261 	vm_prot_t               protection);
262 
263 static void             _vm_map_clip_end(
264 	struct vm_map_header    *map_header,
265 	vm_map_entry_t          entry,
266 	vm_map_offset_t         end);
267 
268 static void             _vm_map_clip_start(
269 	struct vm_map_header    *map_header,
270 	vm_map_entry_t          entry,
271 	vm_map_offset_t         start);
272 
273 static kmem_return_t vm_map_delete(
274 	vm_map_t        map,
275 	vm_map_offset_t start,
276 	vm_map_offset_t end,
277 	vmr_flags_t     flags,
278 	kmem_guard_t    guard,
279 	vm_map_zap_t    zap);
280 
281 static void             vm_map_copy_insert(
282 	vm_map_t        map,
283 	vm_map_entry_t  after_where,
284 	vm_map_copy_t   copy);
285 
286 static kern_return_t    vm_map_copy_overwrite_unaligned(
287 	vm_map_t        dst_map,
288 	vm_map_entry_t  entry,
289 	vm_map_copy_t   copy,
290 	vm_map_address_t start,
291 	boolean_t       discard_on_success);
292 
293 static kern_return_t    vm_map_copy_overwrite_aligned(
294 	vm_map_t        dst_map,
295 	vm_map_entry_t  tmp_entry,
296 	vm_map_copy_t   copy,
297 	vm_map_offset_t start,
298 	pmap_t          pmap);
299 
300 static kern_return_t    vm_map_copyin_kernel_buffer(
301 	vm_map_t        src_map,
302 	vm_map_address_t src_addr,
303 	vm_map_size_t   len,
304 	boolean_t       src_destroy,
305 	vm_map_copy_t   *copy_result);  /* OUT */
306 
307 static kern_return_t    vm_map_copyout_kernel_buffer(
308 	vm_map_t        map,
309 	vm_map_address_t *addr, /* IN/OUT */
310 	vm_map_copy_t   copy,
311 	vm_map_size_t   copy_size,
312 	boolean_t       overwrite,
313 	boolean_t       consume_on_success);
314 
315 static void             vm_map_fork_share(
316 	vm_map_t        old_map,
317 	vm_map_entry_t  old_entry,
318 	vm_map_t        new_map);
319 
320 static boolean_t        vm_map_fork_copy(
321 	vm_map_t        old_map,
322 	vm_map_entry_t  *old_entry_p,
323 	vm_map_t        new_map,
324 	int             vm_map_copyin_flags);
325 
326 static kern_return_t    vm_map_wire_nested(
327 	vm_map_t                   map,
328 	vm_map_offset_t            start,
329 	vm_map_offset_t            end,
330 	vm_prot_t                  caller_prot,
331 	vm_tag_t                   tag,
332 	boolean_t                  user_wire,
333 	pmap_t                     map_pmap,
334 	vm_map_offset_t            pmap_addr,
335 	ppnum_t                   *physpage_p);
336 
337 static kern_return_t    vm_map_unwire_nested(
338 	vm_map_t                   map,
339 	vm_map_offset_t            start,
340 	vm_map_offset_t            end,
341 	boolean_t                  user_wire,
342 	pmap_t                     map_pmap,
343 	vm_map_offset_t            pmap_addr);
344 
345 static kern_return_t    vm_map_overwrite_submap_recurse(
346 	vm_map_t                   dst_map,
347 	vm_map_offset_t            dst_addr,
348 	vm_map_size_t              dst_size);
349 
350 static kern_return_t    vm_map_copy_overwrite_nested(
351 	vm_map_t                   dst_map,
352 	vm_map_offset_t            dst_addr,
353 	vm_map_copy_t              copy,
354 	boolean_t                  interruptible,
355 	pmap_t                     pmap,
356 	boolean_t                  discard_on_success);
357 
358 static kern_return_t    vm_map_remap_extract(
359 	vm_map_t                map,
360 	vm_map_offset_t         addr,
361 	vm_map_size_t           size,
362 	boolean_t               copy,
363 	vm_map_copy_t           map_copy,
364 	vm_prot_t               *cur_protection,
365 	vm_prot_t               *max_protection,
366 	vm_inherit_t            inheritance,
367 	vm_map_kernel_flags_t   vmk_flags);
368 
369 static void             vm_map_region_look_for_page(
370 	vm_map_t                   map,
371 	vm_map_offset_t            va,
372 	vm_object_t                object,
373 	vm_object_offset_t         offset,
374 	int                        max_refcnt,
375 	unsigned short             depth,
376 	vm_region_extended_info_t  extended,
377 	mach_msg_type_number_t count);
378 
379 static boolean_t        vm_map_region_has_obj_ref(
380 	vm_map_entry_t             entry,
381 	vm_object_t                object);
382 
383 
384 static kern_return_t    vm_map_willneed(
385 	vm_map_t        map,
386 	vm_map_offset_t start,
387 	vm_map_offset_t end);
388 
389 static kern_return_t    vm_map_reuse_pages(
390 	vm_map_t        map,
391 	vm_map_offset_t start,
392 	vm_map_offset_t end);
393 
394 static kern_return_t    vm_map_reusable_pages(
395 	vm_map_t        map,
396 	vm_map_offset_t start,
397 	vm_map_offset_t end);
398 
399 static kern_return_t    vm_map_can_reuse(
400 	vm_map_t        map,
401 	vm_map_offset_t start,
402 	vm_map_offset_t end);
403 
404 static kern_return_t    vm_map_zero(
405 	vm_map_t        map,
406 	vm_map_offset_t start,
407 	vm_map_offset_t end);
408 
409 static kern_return_t    vm_map_random_address_for_size(
410 	vm_map_t                map,
411 	vm_map_offset_t        *address,
412 	vm_map_size_t           size,
413 	vm_map_kernel_flags_t   vmk_flags);
414 
415 
416 #if CONFIG_MAP_RANGES
417 
418 static vm_map_range_id_t vm_map_user_range_resolve(
419 	vm_map_t                map,
420 	mach_vm_address_t       addr,
421 	mach_vm_address_t       size,
422 	mach_vm_range_t         range);
423 
424 #endif /* CONFIG_MAP_RANGES */
425 #if MACH_ASSERT
426 static kern_return_t    vm_map_pageout(
427 	vm_map_t        map,
428 	vm_map_offset_t start,
429 	vm_map_offset_t end);
430 #endif /* MACH_ASSERT */
431 
432 kern_return_t vm_map_corpse_footprint_collect(
433 	vm_map_t        old_map,
434 	vm_map_entry_t  old_entry,
435 	vm_map_t        new_map);
436 void vm_map_corpse_footprint_collect_done(
437 	vm_map_t        new_map);
438 void vm_map_corpse_footprint_destroy(
439 	vm_map_t        map);
440 kern_return_t vm_map_corpse_footprint_query_page_info(
441 	vm_map_t        map,
442 	vm_map_offset_t va,
443 	int             *disposition_p);
444 void vm_map_footprint_query_page_info(
445 	vm_map_t        map,
446 	vm_map_entry_t  map_entry,
447 	vm_map_offset_t curr_s_offset,
448 	int             *disposition_p);
449 
450 #if CONFIG_MAP_RANGES
451 static void vm_map_range_map_init(void);
452 #endif /* CONFIG_MAP_RANGES */
453 
454 pid_t find_largest_process_vm_map_entries(void);
455 
456 __attribute__((always_inline))
457 int
458 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
459 {
460 	int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
461 
462 	/* in vmk flags the meaning of fixed/anywhere is inverted */
463 	return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
464 }
465 
466 __attribute__((always_inline, overloadable))
467 void
468 vm_map_kernel_flags_set_vmflags(
469 	vm_map_kernel_flags_t  *vmk_flags,
470 	int                     vm_flags,
471 	vm_tag_t                vm_tag)
472 {
473 	vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
474 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
475 	vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
476 	vmk_flags->vm_tag = vm_tag;
477 }
478 
479 __attribute__((always_inline, overloadable))
480 void
481 vm_map_kernel_flags_set_vmflags(
482 	vm_map_kernel_flags_t  *vmk_flags,
483 	int                     vm_flags_and_tag)
484 {
485 	vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
486 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
487 	vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
488 	VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
489 }
490 
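/*
 * Example (minimal sketch): round-tripping userspace VM flags through
 * vm_map_kernel_flags_t.  VM_FLAGS_ANYWHERE is stored in the kernel
 * flags as "not fixed", so the bit is inverted on the way in and back
 * out.  The flag and tag values below are only illustrative.
 *
 *	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
 *
 *	vm_map_kernel_flags_set_vmflags(&vmk_flags,
 *	    VM_FLAGS_ANYWHERE | VM_FLAGS_RANDOM_ADDR, VM_MEMORY_MALLOC);
 *	assert(!vmk_flags.vmf_fixed);
 *	assert(vmk_flags.vmf_random_addr);
 *	assert(vmk_flags.vm_tag == VM_MEMORY_MALLOC);
 *	assert(vm_map_kernel_flags_vmflags(vmk_flags) ==
 *	    (VM_FLAGS_ANYWHERE | VM_FLAGS_RANDOM_ADDR));
 */
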
491 __attribute__((always_inline))
492 void
493 vm_map_kernel_flags_and_vmflags(
494 	vm_map_kernel_flags_t  *vmk_flags,
495 	int                     vm_flags_mask)
496 {
497 	/* this function doesn't handle the inverted FIXED/ANYWHERE */
498 	assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
499 	vmk_flags->__vm_flags &= vm_flags_mask;
500 }
501 
502 __attribute__((always_inline))
503 bool
504 vm_map_kernel_flags_check_vm_and_kflags(
505 	vm_map_kernel_flags_t   vmk_flags,
506 	int                     vm_flags_mask)
507 {
508 	return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
509 }
510 
511 bool
512 vm_map_kernel_flags_check_vmflags(
513 	vm_map_kernel_flags_t   vmk_flags,
514 	int                     vm_flags_mask)
515 {
516 	int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
517 
518 	/* Note: up to 16 still has good calling conventions */
519 	static_assert(sizeof(vm_map_kernel_flags_t) == 8);
520 
521 #if DEBUG || DEVELOPMENT
522 	/*
523 	 * All of this compiles to nothing if all checks pass.
524 	 */
525 #define check(field, value)  ({ \
526 	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
527 	fl.__vm_flags = (value); \
528 	fl.field = 0; \
529 	assert(fl.__vm_flags == 0); \
530 })
531 
532 	/* bits 0-7 */
533 	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
534 	check(vmf_purgeable, VM_FLAGS_PURGABLE);
535 	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
536 	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
537 	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
538 	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
539 	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
540 	check(vmf_permanent, VM_FLAGS_PERMANENT);
541 
542 	/* bits 8-15 */
543 	check(vmf_tpro, VM_FLAGS_TPRO);
544 	check(vmf_overwrite, VM_FLAGS_OVERWRITE);
545 
546 	/* bits 16-23 */
547 	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
548 	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
549 	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);
550 
551 	{
552 		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;
553 
554 		/* check user tags will never clip */
555 		fl.vm_tag = VM_MEMORY_COUNT - 1;
556 		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);
557 
558 		/* check kernel tags will never clip */
559 		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
560 		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
561 	}
562 
563 
564 #undef check
565 #endif /* DEBUG || DEVELOPMENT */
566 
567 	return (vmflags & ~vm_flags_mask) == 0;
568 }
569 
570 /*
571  * Routines to copy a vm_map_entry. We must be careful to correctly
572  * manage the wired page count. vm_map_entry_copy() creates a new
573  * map entry to the same memory - the wired count in the new entry
574  * must be set to zero. vm_map_entry_copy_full() creates a new
575  * entry that is identical to the old entry.  This preserves the
576  * wire count; it's used for map splitting and zone changing in
577  * vm_map_copyout.
578  */
579 
580 static inline void
581 vm_map_entry_copy_csm_assoc(
582 	vm_map_t map __unused,
583 	vm_map_entry_t new __unused,
584 	vm_map_entry_t old __unused)
585 {
586 #if CODE_SIGNING_MONITOR
587 	/* when code signing monitor is enabled, we want to reset on copy */
588 	new->csm_associated = FALSE;
589 #else
590 	/* when code signing monitor is not enabled, assert as a sanity check */
591 	assert(new->csm_associated == FALSE);
592 #endif
593 #if DEVELOPMENT || DEBUG
594 	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
595 		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
596 		    proc_selfpid(),
597 		    (get_bsdtask_info(current_task())
598 		    ? proc_name_address(get_bsdtask_info(current_task()))
599 		    : "?"),
600 		    __FUNCTION__, __LINE__,
601 		    map, new, new->vme_start, new->vme_end);
602 	}
603 #endif /* DEVELOPMENT || DEBUG */
604 #if XNU_TARGET_OS_OSX
605 	/*
606 	 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
607 	 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
608 	 * triggering CSM assertions when the child accesses its mapping.
609 	 */
610 #else /* XNU_TARGET_OS_OSX */
611 	new->vme_xnu_user_debug = FALSE;
612 #endif /* XNU_TARGET_OS_OSX */
613 }
614 
615 /*
616  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
617  * But for security reasons on some platforms, we don't want the
618  * new mapping to be "used for jit", so we reset the flag here.
619  */
620 static inline void
621 vm_map_entry_copy_code_signing(
622 	vm_map_t map,
623 	vm_map_entry_t new,
624 	vm_map_entry_t old __unused)
625 {
626 	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
627 		assert(new->used_for_jit == old->used_for_jit);
628 	} else {
629 		if (old->used_for_jit) {
630 			DTRACE_VM3(cs_wx,
631 			    uint64_t, new->vme_start,
632 			    uint64_t, new->vme_end,
633 			    vm_prot_t, new->protection);
634 			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
635 			    proc_selfpid(),
636 			    (get_bsdtask_info(current_task())
637 			    ? proc_name_address(get_bsdtask_info(current_task()))
638 			    : "?"),
639 			    __FUNCTION__,
640 			    "removing execute access");
641 			new->protection &= ~VM_PROT_EXECUTE;
642 			new->max_protection &= ~VM_PROT_EXECUTE;
643 		}
644 		new->used_for_jit = FALSE;
645 	}
646 }
647 
648 static inline void
649 vm_map_entry_copy_full(
650 	vm_map_entry_t new,
651 	vm_map_entry_t old)
652 {
653 #if MAP_ENTRY_CREATION_DEBUG
654 	btref_put(new->vme_creation_bt);
655 	btref_retain(old->vme_creation_bt);
656 #endif
657 #if MAP_ENTRY_INSERTION_DEBUG
658 	btref_put(new->vme_insertion_bt);
659 	btref_retain(old->vme_insertion_bt);
660 #endif
661 #if VM_BTLOG_TAGS
662 	/* Discard the btref that might be in the new entry */
663 	if (new->vme_kernel_object) {
664 		btref_put(new->vme_tag_btref);
665 	}
666 	/* Retain the btref in the old entry to account for its copy */
667 	if (old->vme_kernel_object) {
668 		btref_retain(old->vme_tag_btref);
669 	}
670 #endif /* VM_BTLOG_TAGS */
671 	*new = *old;
672 }
673 
674 static inline void
675 vm_map_entry_copy(
676 	vm_map_t map,
677 	vm_map_entry_t new,
678 	vm_map_entry_t old)
679 {
680 	vm_map_entry_copy_full(new, old);
681 
682 	new->is_shared = FALSE;
683 	new->needs_wakeup = FALSE;
684 	new->in_transition = FALSE;
685 	new->wired_count = 0;
686 	new->user_wired_count = 0;
687 	new->vme_permanent = FALSE;
688 	vm_map_entry_copy_code_signing(map, new, old);
689 	vm_map_entry_copy_csm_assoc(map, new, old);
690 	if (new->iokit_acct) {
691 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
692 		new->iokit_acct = FALSE;
693 		new->use_pmap = TRUE;
694 	}
695 	new->vme_resilient_codesign = FALSE;
696 	new->vme_resilient_media = FALSE;
697 	new->vme_atomic = FALSE;
698 	new->vme_no_copy_on_read = FALSE;
699 }
700 
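/*
 * Example (minimal sketch): the difference between the two copy
 * routines above.  "new_entry" and "old_entry" are hypothetical
 * vm_map_entry_t values owned by the caller.
 *
 *	vm_map_entry_copy(map, new_entry, old_entry);
 *	assert(new_entry->wired_count == 0);        // wiring never carried over
 *	assert(new_entry->user_wired_count == 0);
 *
 *	vm_map_entry_copy_full(new_entry, old_entry);
 *	assert(new_entry->wired_count == old_entry->wired_count);
 */
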
701 /*
702  * Normal lock_read_to_write() returns FALSE/0 on failure.
703  * These functions evaluate to zero on success and a non-zero value on failure.
704  */
705 __attribute__((always_inline))
706 int
707 vm_map_lock_read_to_write(vm_map_t map)
708 {
709 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
710 		DTRACE_VM(vm_map_lock_upgrade);
711 		return 0;
712 	}
713 	return 1;
714 }
715 
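/*
 * Example (typical pattern, sketched): on a failed upgrade the shared
 * lock has already been dropped, so the caller must re-take the lock
 * (and usually re-validate whatever it looked up under the read lock).
 *
 *	if (vm_map_lock_read_to_write(map)) {
 *		// upgrade failed: read lock already released
 *		vm_map_lock(map);
 *		// ... re-lookup state before proceeding ...
 *	}
 */
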
716 __attribute__((always_inline))
717 boolean_t
718 vm_map_try_lock(vm_map_t map)
719 {
720 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
721 		DTRACE_VM(vm_map_lock_w);
722 		return TRUE;
723 	}
724 	return FALSE;
725 }
726 
727 __attribute__((always_inline))
728 boolean_t
729 vm_map_try_lock_read(vm_map_t map)
730 {
731 	if (lck_rw_try_lock_shared(&(map)->lock)) {
732 		DTRACE_VM(vm_map_lock_r);
733 		return TRUE;
734 	}
735 	return FALSE;
736 }
737 
738 /*!
739  * @function kdp_vm_map_is_acquired_exclusive
740  *
741  * @abstract
742  * Checks if vm map is acquired exclusive.
743  *
744  * @discussion
745  * NOT SAFE: To be used only by kernel debugger.
746  *
747  * @param map map to check
748  *
749  * @returns TRUE if the map is acquired exclusively.
750  */
751 boolean_t
752 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
753 {
754 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
755 }
756 
757 /*
758  * Routines to get the page size the caller should
759  * use while inspecting the target address space.
760  * Use the "_safely" variant if the caller is dealing with a user-provided
761  * array whose size depends on the page size, to avoid any overflow or
762  * underflow of a user-allocated buffer.
763  */
764 int
765 vm_self_region_page_shift_safely(
766 	vm_map_t target_map)
767 {
768 	int effective_page_shift = 0;
769 
770 	if (PAGE_SIZE == (4096)) {
771 		/* x86_64 and 4k watches: always use 4k */
772 		return PAGE_SHIFT;
773 	}
774 	/* did caller provide an explicit page size for this thread to use? */
775 	effective_page_shift = thread_self_region_page_shift();
776 	if (effective_page_shift) {
777 		/* use the explicitly-provided page size */
778 		return effective_page_shift;
779 	}
780 	/* no explicit page size: use the caller's page size... */
781 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
782 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
783 		/* page size match: safe to use */
784 		return effective_page_shift;
785 	}
786 	/* page size mismatch */
787 	return -1;
788 }
789 int
790 vm_self_region_page_shift(
791 	vm_map_t target_map)
792 {
793 	int effective_page_shift;
794 
795 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
796 	if (effective_page_shift == -1) {
797 		/* no safe value but OK to guess for caller */
798 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
799 		    VM_MAP_PAGE_SHIFT(target_map));
800 	}
801 	return effective_page_shift;
802 }
803 
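/*
 * Example (minimal sketch): sizing a caller-visible page array with the
 * "_safely" variant.  "region_size" and "page_count" are hypothetical
 * caller variables.
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;   // page size mismatch
 *	}
 *	page_count = (unsigned int)(region_size >> shift);
 */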
804 
805 /*
806  *	Decide if we want to allow processes to execute from their data or stack areas.
807  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
808  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
809  *	or allow_stack_exec to enable data execution for that type of data area for that particular
810  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
811  *	specific pmap files since the default behavior varies according to architecture.  The
812  *	main reason it varies is because of the need to provide binary compatibility with old
813  *	applications that were written before these restrictions came into being.  In the old
814  *	days, an app could execute anything it could read, but this has slowly been tightened
815  *	up over time.  The default behavior is:
816  *
817  *	32-bit PPC apps		may execute from both stack and data areas
818  *	32-bit Intel apps	may execute from data areas but not stack
819  *	64-bit PPC/Intel apps	may not execute from either data or stack
820  *
821  *	An application on any architecture may override these defaults by explicitly
822  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
823  *	system call.  This code here just determines what happens when an app tries to
824  *      execute from a page that lacks execute permission.
825  *
826  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
827  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
828  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
829  *	execution from data areas for a particular binary even if the arch normally permits it. As
830  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
831  *	to support some complicated use cases, notably browsers with out-of-process plugins that
832  *	are not all NX-safe.
833  */
834 
835 extern int allow_data_exec, allow_stack_exec;
836 
837 int
838 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
839 {
840 	int current_abi;
841 
842 	if (map->pmap == kernel_pmap) {
843 		return FALSE;
844 	}
845 
846 	/*
847 	 * Determine if the app is running in 32 or 64 bit mode.
848 	 */
849 
850 	if (vm_map_is_64bit(map)) {
851 		current_abi = VM_ABI_64;
852 	} else {
853 		current_abi = VM_ABI_32;
854 	}
855 
856 	/*
857 	 * Determine if we should allow the execution based on whether it's a
858 	 * stack or data area and the current architecture.
859 	 */
860 
861 	if (user_tag == VM_MEMORY_STACK) {
862 		return allow_stack_exec & current_abi;
863 	}
864 
865 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
866 }
867 
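/*
 * Example (illustrative): with allow_data_exec = VM_ABI_32 and
 * allow_stack_exec = 0, override_nx() returns 0 for any stack mapping
 * (user_tag == VM_MEMORY_STACK) and is non-zero for a data mapping only
 * when the map belongs to a 32-bit task and map->map_disallow_data_exec
 * is FALSE.  A fault path would consult it roughly like this, where
 * "prot" is the hypothetical effective protection being computed:
 *
 *	if (override_nx(map, user_tag)) {
 *		prot |= VM_PROT_EXECUTE;
 *	}
 */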
868 
869 /*
870  *	Virtual memory maps provide for the mapping, protection,
871  *	and sharing of virtual memory objects.  In addition,
872  *	this module provides for an efficient virtual copy of
873  *	memory from one map to another.
874  *
875  *	Synchronization is required prior to most operations.
876  *
877  *	Maps consist of an ordered doubly-linked list of simple
878  *	entries; a single hint is used to speed up lookups.
879  *
880  *	Sharing maps have been deleted from this version of Mach.
881  *	All shared objects are now mapped directly into the respective
882  *	maps.  This requires a change in the copy on write strategy;
883  *	the asymmetric (delayed) strategy is used for shared temporary
884  *	objects instead of the symmetric (shadow) strategy.  All maps
885  *	are now "top level" maps (either task map, kernel map or submap
886  *	of the kernel map).
887  *
888  *	Since portions of maps are specified by start/end addresses,
889  *	which may not align with existing map entries, all
890  *	routines merely "clip" entries to these start/end values.
891  *	[That is, an entry is split into two, bordering at a
892  *	start or end value.]  Note that these clippings may not
893  *	always be necessary (as the two resulting entries are then
894  *	not changed); however, the clipping is done for convenience.
895  *	No attempt is currently made to "glue back together" two
896  *	abutting entries.
897  *
898  *	The symmetric (shadow) copy strategy implements virtual copy
899  *	by copying VM object references from one map to
900  *	another, and then marking both regions as copy-on-write.
901  *	It is important to note that only one writeable reference
902  *	to a VM object region exists in any map when this strategy
903  *	delayed until a write operation occurs.  The asymmetric (delayed)
904  *	delayed until a write operation occurs.  The symmetric (delayed)
905  *	strategy allows multiple maps to have writeable references to
906  *	the same region of a vm object, and hence cannot delay creating
907  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
908  *	Copying of permanent objects is completely different; see
909  *	vm_object_copy_strategically() in vm_object.c.
910  */
911 
912 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
913 
914 #define VM_MAP_ZONE_NAME        "maps"
915 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
916 
917 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
918 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
919 
920 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
921 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
922 
923 /*
924  * Asserts that a vm_map_copy object is coming from the
925  * vm_map_copy_zone to ensure that it isn't a fake one constructed
926  * anywhere else.
927  */
928 void
929 vm_map_copy_require(struct vm_map_copy *copy)
930 {
931 	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
932 }
933 
934 /*
935  *	vm_map_require:
936  *
937  *	Ensures that the argument is memory allocated from the genuine
938  *	vm map zone. (See zone_id_require_allow_foreign).
939  */
940 void
941 vm_map_require(vm_map_t map)
942 {
943 	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
944 }
945 
946 #define VM_MAP_EARLY_COUNT_MAX         16
947 static __startup_data vm_offset_t      map_data;
948 static __startup_data vm_size_t        map_data_size;
949 static __startup_data vm_offset_t      kentry_data;
950 static __startup_data vm_size_t        kentry_data_size;
951 static __startup_data vm_offset_t      map_holes_data;
952 static __startup_data vm_size_t        map_holes_data_size;
953 static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
954 static __startup_data uint32_t         early_map_count;
955 
956 #if XNU_TARGET_OS_OSX
957 #define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
958 #else /* XNU_TARGET_OS_OSX */
959 #define         NO_COALESCE_LIMIT  0
960 #endif /* XNU_TARGET_OS_OSX */
961 
962 /* Skip acquiring locks if we're in the midst of a kernel core dump */
963 unsigned int not_in_kdp = 1;
964 
965 unsigned int vm_map_set_cache_attr_count = 0;
966 
967 kern_return_t
968 vm_map_set_cache_attr(
969 	vm_map_t        map,
970 	vm_map_offset_t va)
971 {
972 	vm_map_entry_t  map_entry;
973 	vm_object_t     object;
974 	kern_return_t   kr = KERN_SUCCESS;
975 
976 	vm_map_lock_read(map);
977 
978 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
979 	    map_entry->is_sub_map) {
980 		/*
981 		 * that memory is not properly mapped
982 		 */
983 		kr = KERN_INVALID_ARGUMENT;
984 		goto done;
985 	}
986 	object = VME_OBJECT(map_entry);
987 
988 	if (object == VM_OBJECT_NULL) {
989 		/*
990 		 * there should be a VM object here at this point
991 		 */
992 		kr = KERN_INVALID_ARGUMENT;
993 		goto done;
994 	}
995 	vm_object_lock(object);
996 	object->set_cache_attr = TRUE;
997 	vm_object_unlock(object);
998 
999 	vm_map_set_cache_attr_count++;
1000 done:
1001 	vm_map_unlock_read(map);
1002 
1003 	return kr;
1004 }
1005 
1006 
1007 #if CONFIG_CODE_DECRYPTION
1008 /*
1009  * vm_map_apple_protected:
1010  * This remaps the requested part of the object with an object backed by
1011  * the decrypting pager.
1012  * crypt_info contains entry points and session data for the crypt module.
1013  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1014  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1015  */
1016 kern_return_t
1017 vm_map_apple_protected(
1018 	vm_map_t                map,
1019 	vm_map_offset_t         start,
1020 	vm_map_offset_t         end,
1021 	vm_object_offset_t      crypto_backing_offset,
1022 	struct pager_crypt_info *crypt_info,
1023 	uint32_t                cryptid)
1024 {
1025 	boolean_t       map_locked;
1026 	kern_return_t   kr;
1027 	vm_map_entry_t  map_entry;
1028 	struct vm_map_entry tmp_entry;
1029 	memory_object_t unprotected_mem_obj;
1030 	vm_object_t     protected_object;
1031 	vm_map_offset_t map_addr;
1032 	vm_map_offset_t start_aligned, end_aligned;
1033 	vm_object_offset_t      crypto_start, crypto_end;
1034 	boolean_t       cache_pager;
1035 
1036 	map_locked = FALSE;
1037 	unprotected_mem_obj = MEMORY_OBJECT_NULL;
1038 
1039 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
1040 		return KERN_INVALID_ADDRESS;
1041 	}
1042 	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
1043 	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
1044 	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
1045 	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
1046 
1047 #if __arm64__
1048 	/*
1049 	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
1050 	 * so we might have to loop and establish up to 3 mappings:
1051 	 *
1052 	 * + the first 16K-page, which might overlap with the previous
1053 	 *   4K-aligned mapping,
1054 	 * + the center,
1055 	 * + the last 16K-page, which might overlap with the next
1056 	 *   4K-aligned mapping.
1057 	 * Each of these mappings might be backed by a vnode pager (if
1058 	 * properly page-aligned) or a "fourk_pager", itself backed by a
1059 	 * vnode pager (if 4K-aligned but not page-aligned).
1060 	 */
1061 #endif /* __arm64__ */
1062 
1063 	map_addr = start_aligned;
1064 	for (map_addr = start_aligned;
1065 	    map_addr < end;
1066 	    map_addr = tmp_entry.vme_end) {
1067 		vm_map_lock(map);
1068 		map_locked = TRUE;
1069 
1070 		/* lookup the protected VM object */
1071 		if (!vm_map_lookup_entry(map,
1072 		    map_addr,
1073 		    &map_entry) ||
1074 		    map_entry->is_sub_map ||
1075 		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
1076 			/* that memory is not properly mapped */
1077 			kr = KERN_INVALID_ARGUMENT;
1078 			goto done;
1079 		}
1080 
1081 		/* ensure mapped memory is mapped as executable,
1082 		 * except for the model decryption flow */
1083 		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
1084 		    !(map_entry->protection & VM_PROT_EXECUTE)) {
1085 			kr = KERN_INVALID_ARGUMENT;
1086 			goto done;
1087 		}
1088 
1089 		/* get the protected object to be decrypted */
1090 		protected_object = VME_OBJECT(map_entry);
1091 		if (protected_object == VM_OBJECT_NULL) {
1092 			/* there should be a VM object here at this point */
1093 			kr = KERN_INVALID_ARGUMENT;
1094 			goto done;
1095 		}
1096 		/* ensure protected object stays alive while map is unlocked */
1097 		vm_object_reference(protected_object);
1098 
1099 		/* limit the map entry to the area we want to cover */
1100 		vm_map_clip_start(map, map_entry, start_aligned);
1101 		vm_map_clip_end(map, map_entry, end_aligned);
1102 
1103 		tmp_entry = *map_entry;
1104 		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
1105 		vm_map_unlock(map);
1106 		map_locked = FALSE;
1107 
1108 		/*
1109 		 * This map entry might be only partially encrypted
1110 		 * (if not fully "page-aligned").
1111 		 */
1112 		crypto_start = 0;
1113 		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
1114 		if (tmp_entry.vme_start < start) {
1115 			if (tmp_entry.vme_start != start_aligned) {
1116 				kr = KERN_INVALID_ADDRESS;
1117 				vm_object_deallocate(protected_object);
1118 				goto done;
1119 			}
1120 			crypto_start += (start - tmp_entry.vme_start);
1121 		}
1122 		if (tmp_entry.vme_end > end) {
1123 			if (tmp_entry.vme_end != end_aligned) {
1124 				kr = KERN_INVALID_ADDRESS;
1125 				vm_object_deallocate(protected_object);
1126 				goto done;
1127 			}
1128 			crypto_end -= (tmp_entry.vme_end - end);
1129 		}
1130 
1131 		/*
1132 		 * This "extra backing offset" is needed to get the decryption
1133 		 * routine to use the right key.  It adjusts for the possibly
1134 		 * relative offset of an interposed "4K" pager...
1135 		 */
1136 		if (crypto_backing_offset == (vm_object_offset_t) -1) {
1137 			crypto_backing_offset = VME_OFFSET(&tmp_entry);
1138 		}
1139 
1140 		cache_pager = TRUE;
1141 #if XNU_TARGET_OS_OSX
1142 		if (vm_map_is_alien(map)) {
1143 			cache_pager = FALSE;
1144 		}
1145 #endif /* XNU_TARGET_OS_OSX */
1146 
1147 		/*
1148 		 * Lookup (and create if necessary) the protected memory object
1149 		 * matching that VM object.
1150 		 * If successful, this also grabs a reference on the memory object,
1151 		 * to guarantee that it doesn't go away before we get a chance to map
1152 		 * it.
1153 		 */
1154 		unprotected_mem_obj = apple_protect_pager_setup(
1155 			protected_object,
1156 			VME_OFFSET(&tmp_entry),
1157 			crypto_backing_offset,
1158 			crypt_info,
1159 			crypto_start,
1160 			crypto_end,
1161 			cache_pager);
1162 
1163 		/* release extra ref on protected object */
1164 		vm_object_deallocate(protected_object);
1165 
1166 		if (unprotected_mem_obj == NULL) {
1167 			kr = KERN_FAILURE;
1168 			goto done;
1169 		}
1170 
1171 		/* can overwrite an immutable mapping */
1172 		vm_map_kernel_flags_t vmk_flags = {
1173 			.vmf_fixed = true,
1174 			.vmf_overwrite = true,
1175 			.vmkf_overwrite_immutable = true,
1176 		};
1177 		/* make the new mapping as "permanent" as the one it replaces */
1178 		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
1179 
1180 		/* map this memory object in place of the current one */
1181 		map_addr = tmp_entry.vme_start;
1182 		kr = mach_vm_map_kernel(map,
1183 		    vm_sanitize_wrap_addr_ref(&map_addr),
1184 		    (tmp_entry.vme_end -
1185 		    tmp_entry.vme_start),
1186 		    (mach_vm_offset_t) 0,
1187 		    vmk_flags,
1188 		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1189 		    0,
1190 		    TRUE,
1191 		    tmp_entry.protection,
1192 		    tmp_entry.max_protection,
1193 		    tmp_entry.inheritance);
1194 		assertf(kr == KERN_SUCCESS,
1195 		    "kr = 0x%x\n", kr);
1196 		assertf(map_addr == tmp_entry.vme_start,
1197 		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1198 		    (uint64_t)map_addr,
1199 		    (uint64_t) tmp_entry.vme_start,
1200 		    &tmp_entry);
1201 
1202 #if VM_MAP_DEBUG_APPLE_PROTECT
1203 		if (vm_map_debug_apple_protect) {
1204 			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1205 			    " backing:[object:%p,offset:0x%llx,"
1206 			    "crypto_backing_offset:0x%llx,"
1207 			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1208 			    map,
1209 			    (uint64_t) map_addr,
1210 			    (uint64_t) (map_addr + (tmp_entry.vme_end -
1211 			    tmp_entry.vme_start)),
1212 			    unprotected_mem_obj,
1213 			    protected_object,
1214 			    VME_OFFSET(&tmp_entry),
1215 			    crypto_backing_offset,
1216 			    crypto_start,
1217 			    crypto_end);
1218 		}
1219 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1220 
1221 		/*
1222 		 * Release the reference obtained by
1223 		 * apple_protect_pager_setup().
1224 		 * The mapping (if it succeeded) is now holding a reference on
1225 		 * the memory object.
1226 		 */
1227 		memory_object_deallocate(unprotected_mem_obj);
1228 		unprotected_mem_obj = MEMORY_OBJECT_NULL;
1229 
1230 		/* continue with next map entry */
1231 		crypto_backing_offset += (tmp_entry.vme_end -
1232 		    tmp_entry.vme_start);
1233 		crypto_backing_offset -= crypto_start;
1234 	}
1235 	kr = KERN_SUCCESS;
1236 
1237 done:
1238 	if (map_locked) {
1239 		vm_map_unlock(map);
1240 	}
1241 	return kr;
1242 }
1243 #endif  /* CONFIG_CODE_DECRYPTION */
1244 
1245 
1246 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1247 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1248 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1249 
1250 #if XNU_TARGET_OS_OSX
1251 #define MALLOC_NO_COW_DEFAULT 1
1252 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1253 #else /* XNU_TARGET_OS_OSX */
1254 #define MALLOC_NO_COW_DEFAULT 1
1255 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1256 #endif /* XNU_TARGET_OS_OSX */
1257 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1258 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1259 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1260 #if DEBUG
1261 int vm_check_map_sanity = 0;
1262 #endif
1263 
1264 /*
1265  *	vm_map_init:
1266  *
1267  *	Initialize the vm_map module.  Must be called before
1268  *	any other vm_map routines.
1269  *
1270  *	Map and entry structures are allocated from zones -- we must
1271  *	initialize those zones.
1272  *
1273  *	There are three zones of interest:
1274  *
1275  *	vm_map_zone:		used to allocate maps.
1276  *	vm_map_entry_zone:	used to allocate map entries.
1277  *
1278  *	LP32:
1279  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1280  *
1281  *	The kernel allocates map entries from a special zone that is initially
1282  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1283  *	the kernel to allocate more memory to an entry zone when it became
1284  *	empty since the very act of allocating memory implies the creation
1285  *	of a new entry.
1286  */
1287 __startup_func
1288 void
1289 vm_map_init(void)
1290 {
1291 
1292 #if MACH_ASSERT
1293 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1294 	    sizeof(debug4k_filter));
1295 #endif /* MACH_ASSERT */
1296 
1297 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1298 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1299 
1300 	/*
1301 	 * Don't quarantine because we always need elements available
1302 	 * Disallow GC on this zone... to aid the GC.
1303 	 */
1304 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1305 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1306 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1307 		z->z_elems_rsv = (uint16_t)(32 *
1308 		(ml_early_cpu_max_number() + 1));
1309 	});
1310 
1311 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1312 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1313 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1314 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1315 	});
1316 
1317 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1318 	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1319 
1320 	/*
1321 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1322 	 */
1323 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1324 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1325 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1326 	printf("VM bootstrap: %d maps, %d entries and %d holes available\n",
1327 	    zone_count_free(vm_map_zone),
1328 	    zone_count_free(vm_map_entry_zone),
1329 	    zone_count_free(vm_map_holes_zone));
1330 
1331 	/*
1332 	 * Since these are covered by zones, remove them from stolen page accounting.
1333 	 */
1334 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1335 
1336 #if VM_MAP_DEBUG_APPLE_PROTECT
1337 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1338 	    &vm_map_debug_apple_protect,
1339 	    sizeof(vm_map_debug_apple_protect));
1340 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1341 #if VM_MAP_DEBUG_APPLE_FOURK
1342 	PE_parse_boot_argn("vm_map_debug_fourk",
1343 	    &vm_map_debug_fourk,
1344 	    sizeof(vm_map_debug_fourk));
1345 #endif /* VM_MAP_DEBUG_FOURK */
1346 
1347 	if (malloc_no_cow) {
1348 		vm_memory_malloc_no_cow_mask = 0ULL;
1349 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1350 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1351 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1352 #if XNU_TARGET_OS_OSX
1353 		/*
1354 		 * On macOS, keep copy-on-write for MALLOC_LARGE because
1355 		 * realloc() may use vm_copy() to transfer the old contents
1356 		 * to the new location.
1357 		 */
1358 #else /* XNU_TARGET_OS_OSX */
1359 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1360 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1361 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1362 #endif /* XNU_TARGET_OS_OSX */
1363 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1364 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1365 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1366 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1367 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1368 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1369 		    &vm_memory_malloc_no_cow_mask,
1370 		    sizeof(vm_memory_malloc_no_cow_mask));
1371 	}
1372 
1373 #if CONFIG_MAP_RANGES
1374 	vm_map_range_map_init();
1375 #endif /* CONFIG_MAP_RANGES */
1376 
1377 #if DEBUG
1378 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1379 	if (vm_check_map_sanity) {
1380 		kprintf("VM sanity checking enabled\n");
1381 	} else {
1382 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1383 	}
1384 #endif /* DEBUG */
1385 
1386 #if DEVELOPMENT || DEBUG
1387 	PE_parse_boot_argn("panic_on_unsigned_execute",
1388 	    &panic_on_unsigned_execute,
1389 	    sizeof(panic_on_unsigned_execute));
1390 	PE_parse_boot_argn("panic_on_mlock_failure",
1391 	    &panic_on_mlock_failure,
1392 	    sizeof(panic_on_mlock_failure));
1393 #endif /* DEVELOPMENT || DEBUG */
1394 }
1395 
1396 __startup_func
1397 static void
1398 vm_map_steal_memory(void)
1399 {
1400 	/*
1401 	 * We need to reserve enough memory to support bootstrapping VM maps
1402 	 * and the zone subsystem.
1403 	 *
1404 	 * The VM Maps that need to function before zones can support them
1405 	 * are the ones registered with vm_map_will_allocate_early_map(),
1406 	 * which are:
1407 	 * - the kernel map
1408 	 * - the various submaps used by zones (pgz, meta, ...)
1409 	 *
1410 	 * We also need enough entries and holes to support them
1411 	 * until zone_metadata_init() is called, which is when
1412 	 * the zone allocator becomes capable of expanding dynamically.
1413 	 *
1414 	 * We need:
1415 	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1416 	 * - To allow for 3-4 entries per map, but the kernel map
1417 	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1418 	 *   to describe the submaps, so double it (and make it 8x too)
1419 	 * - To allow for holes between entries,
1420 	 *   hence needs the same budget as entries
1421 	 */
1422 	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1423 	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1424 	    VM_MAP_EARLY_COUNT_MAX);
1425 
1426 	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1427 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1428 	    8 * VM_MAP_EARLY_COUNT_MAX);
1429 
1430 	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1431 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1432 	    8 * VM_MAP_EARLY_COUNT_MAX);
1433 
1434 	/*
1435 	 * Steal a contiguous range of memory so that a simple range check
1436 	 * can validate early addresses being freed/crammed to these
1437 	 * zones
1438 	 */
1439 	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
1440 	    map_holes_data_size);
1441 	kentry_data    = map_data + map_data_size;
1442 	map_holes_data = kentry_data + kentry_data_size;
1443 }
1444 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1445 
1446 __startup_func
1447 static void
1448 vm_kernel_boostraped(void)
1449 {
1450 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1451 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1452 	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1453 
1454 	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1455 	    zone_count_free(vm_map_zone),
1456 	    zone_count_free(vm_map_entry_zone),
1457 	    zone_count_free(vm_map_holes_zone));
1458 }
1459 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1460 
1461 void
1462 vm_map_disable_hole_optimization(vm_map_t map)
1463 {
1464 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1465 
1466 	if (map->holelistenabled) {
1467 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1468 
1469 		while (hole_entry != NULL) {
1470 			next_hole_entry = hole_entry->vme_next;
1471 
1472 			hole_entry->vme_next = NULL;
1473 			hole_entry->vme_prev = NULL;
1474 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1475 
1476 			if (next_hole_entry == head_entry) {
1477 				hole_entry = NULL;
1478 			} else {
1479 				hole_entry = next_hole_entry;
1480 			}
1481 		}
1482 
1483 		map->holes_list = NULL;
1484 		map->holelistenabled = FALSE;
1485 
1486 		map->first_free = vm_map_first_entry(map);
1487 		SAVE_HINT_HOLE_WRITE(map, NULL);
1488 	}
1489 }
1490 
1491 boolean_t
1492 vm_kernel_map_is_kernel(vm_map_t map)
1493 {
1494 	return map->pmap == kernel_pmap;
1495 }
1496 
1497 /*
1498  *	vm_map_create:
1499  *
1500  *	Creates and returns a new empty VM map with
1501  *	the given physical map structure, and having
1502  *	the given lower and upper address bounds.
1503  */
1504 
1505 extern vm_map_t vm_map_create_external(
1506 	pmap_t                  pmap,
1507 	vm_map_offset_t         min_off,
1508 	vm_map_offset_t         max_off,
1509 	boolean_t               pageable);
1510 
1511 vm_map_t
1512 vm_map_create_external(
1513 	pmap_t                  pmap,
1514 	vm_map_offset_t         min,
1515 	vm_map_offset_t         max,
1516 	boolean_t               pageable)
1517 {
1518 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1519 
1520 	if (pageable) {
1521 		options |= VM_MAP_CREATE_PAGEABLE;
1522 	}
1523 	return vm_map_create_options(pmap, min, max, options);
1524 }
1525 
1526 __startup_func
1527 void
1528 vm_map_will_allocate_early_map(vm_map_t *owner)
1529 {
1530 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1531 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1532 	}
1533 
1534 	early_map_owners[early_map_count++] = owner;
1535 }
1536 
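/*
 * Example (minimal sketch): an early-boot submap owner registers the
 * location of its vm_map_t pointer before the map is created, so that
 * vm_map_relocate_early_maps() can fix the pointer up if the early zone
 * memory is relocated.  "pgz_submap", "min" and "max" are hypothetical.
 *
 *	static vm_map_t pgz_submap;
 *
 *	vm_map_will_allocate_early_map(&pgz_submap);
 *	pgz_submap = vm_map_create_options(kernel_pmap, min, max,
 *	    VM_MAP_CREATE_DEFAULT);
 */
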
1537 __startup_func
1538 void
1539 vm_map_relocate_early_maps(vm_offset_t delta)
1540 {
1541 	for (uint32_t i = 0; i < early_map_count; i++) {
1542 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1543 
1544 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1545 	}
1546 
1547 	early_map_count = ~0u;
1548 }
1549 
1550 /*
1551  *	Routine:	vm_map_relocate_early_elem
1552  *
1553  *	Purpose:
1554  *		Early zone elements are allocated in a temporary part
1555  *		of the address space.
1556  *
1557  *		Once the zones live in their final place, the early
1558  *		VM maps, map entries and map holes need to be relocated.
1559  *
1560  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1561  *		pointers to vm_map_links. Other pointers to other types
1562  *		are fine.
1563  *
1564  *		Fortunately, pointers to those types are self-contained
1565  *		in those zones, _except_ for pointers to VM maps,
1566  *		which are tracked during early boot and fixed with
1567  *		vm_map_relocate_early_maps().
1568  */
1569 __startup_func
1570 void
1571 vm_map_relocate_early_elem(
1572 	uint32_t                zone_id,
1573 	vm_offset_t             new_addr,
1574 	vm_offset_t             delta)
1575 {
1576 #define relocate(type_t, field)  ({ \
1577 	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
1578 	if (*__field) {                                                        \
1579 	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
1580 	}                                                                      \
1581 })
1582 
1583 	switch (zone_id) {
1584 	case ZONE_ID_VM_MAP:
1585 	case ZONE_ID_VM_MAP_ENTRY:
1586 	case ZONE_ID_VM_MAP_HOLES:
1587 		break;
1588 
1589 	default:
1590 		panic("Unexpected zone ID %d", zone_id);
1591 	}
1592 
1593 	if (zone_id == ZONE_ID_VM_MAP) {
1594 		relocate(vm_map_t, hdr.links.prev);
1595 		relocate(vm_map_t, hdr.links.next);
1596 		((vm_map_t)new_addr)->pmap = kernel_pmap;
1597 #ifdef VM_MAP_STORE_USE_RB
1598 		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1599 #endif /* VM_MAP_STORE_USE_RB */
1600 		relocate(vm_map_t, hint);
1601 		relocate(vm_map_t, hole_hint);
1602 		relocate(vm_map_t, first_free);
1603 		return;
1604 	}
1605 
1606 	relocate(struct vm_map_links *, prev);
1607 	relocate(struct vm_map_links *, next);
1608 
1609 	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1610 #ifdef VM_MAP_STORE_USE_RB
1611 		relocate(vm_map_entry_t, store.entry.rbe_left);
1612 		relocate(vm_map_entry_t, store.entry.rbe_right);
1613 		relocate(vm_map_entry_t, store.entry.rbe_parent);
1614 #endif /* VM_MAP_STORE_USE_RB */
1615 		if (((vm_map_entry_t)new_addr)->is_sub_map) {
1616 			/* no object to relocate because we haven't made any */
1617 			((vm_map_entry_t)new_addr)->vme_submap +=
1618 			    delta >> VME_SUBMAP_SHIFT;
1619 		}
1620 #if MAP_ENTRY_CREATION_DEBUG
1621 		relocate(vm_map_entry_t, vme_creation_maphdr);
1622 #endif /* MAP_ENTRY_CREATION_DEBUG */
1623 	}
1624 
1625 #undef relocate
1626 }
1627 
1628 vm_map_t
1629 vm_map_create_options(
1630 	pmap_t                  pmap,
1631 	vm_map_offset_t         min,
1632 	vm_map_offset_t         max,
1633 	vm_map_create_options_t options)
1634 {
1635 	vm_map_t result;
1636 
1637 #if DEBUG || DEVELOPMENT
1638 	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1639 		if (early_map_count != ~0u && early_map_count !=
1640 		    zone_count_allocated(vm_map_zone) + 1) {
1641 			panic("allocating %dth early map, owner not known",
1642 			    zone_count_allocated(vm_map_zone) + 1);
1643 		}
1644 		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1645 			panic("allocating %dth early map for non kernel pmap",
1646 			    early_map_count);
1647 		}
1648 	}
1649 #endif /* DEBUG || DEVELOPMENT */
1650 
1651 	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1652 
1653 	vm_map_store_init(&result->hdr);
1654 	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1655 	vm_map_set_page_shift(result, PAGE_SHIFT);
1656 
1657 	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
1658 	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
1659 	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
1660 	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1661 	result->pmap = pmap;
1662 	result->min_offset = min;
1663 	result->max_offset = max;
1664 	result->first_free = vm_map_to_entry(result);
1665 	result->hint = vm_map_to_entry(result);
1666 
1667 	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1668 		assert(pmap == kernel_pmap);
1669 		result->never_faults = true;
1670 	}
1671 
1672 	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1673 	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1674 		result->has_corpse_footprint = true;
1675 	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1676 		struct vm_map_links *hole_entry;
1677 
1678 		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1679 		hole_entry->start = min;
1680 		/*
1681 		 * Holes can be used to track ranges all the way up to
1682 		 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
1683 		 */
1684 		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1685 		result->holes_list = result->hole_hint = hole_entry;
1686 		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1687 		result->holelistenabled = true;
1688 	}
1689 
1690 	vm_map_lock_init(result);
1691 
1692 	return result;
1693 }
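/*
 * Illustrative sketch (not part of the original source): a caller that
 * needs a pageable map covering [min_off, max_off) on a given pmap can use
 * the options-based interface directly:
 *
 *	vm_map_t map;
 *
 *	map = vm_map_create_options(pmap, min_off, max_off,
 *	    VM_MAP_CREATE_PAGEABLE);
 *
 * vm_map_create_external() above is the exported wrapper that folds the
 * boolean "pageable" argument into this option flag.
 */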
1694 
1695 /*
1696  * Adjusts a submap that was made by kmem_suballoc()
1697  * before it knew where it would be mapped,
1698  * so that it has the right min/max offsets.
1699  *
1700  * We do not need to hold any locks:
1701  * only the caller knows about this map,
1702  * and it is not published on any entry yet.
1703  */
1704 static void
1705 vm_map_adjust_offsets(
1706 	vm_map_t                map,
1707 	vm_map_offset_t         min_off,
1708 	vm_map_offset_t         max_off)
1709 {
1710 	assert(map->min_offset == 0);
1711 	assert(map->max_offset == max_off - min_off);
1712 	assert(map->hdr.nentries == 0);
1713 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1714 
1715 	map->min_offset = min_off;
1716 	map->max_offset = max_off;
1717 
1718 	if (map->holelistenabled) {
1719 		struct vm_map_links *hole = map->holes_list;
1720 
1721 		hole->start = min_off;
1722 #if defined(__arm64__)
1723 		hole->end = max_off;
1724 #else
1725 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1726 #endif
1727 	}
1728 }
1729 
1730 
1731 vm_map_size_t
1732 vm_map_adjusted_size(vm_map_t map)
1733 {
1734 	const struct vm_reserved_region *regions = NULL;
1735 	size_t num_regions = 0;
1736 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1737 
1738 	if (map == NULL || (map->size == 0)) {
1739 		return 0;
1740 	}
1741 
1742 	map_size = map->size;
1743 
1744 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1745 		/*
1746 		 * Either there are no special reserved regions, this is not an
1747 		 * exotic map, or the task is terminating and these special
1748 		 * regions might have already been deallocated.
1749 		 */
1750 		return map_size;
1751 	}
1752 
1753 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1754 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1755 
1756 	while (num_regions) {
1757 		reserved_size += regions[--num_regions].vmrr_size;
1758 	}
1759 
1760 	/*
1761 	 * There are a few places where the map is being switched out due to
1762 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1763 	 * In those cases, we could have the map's regions being deallocated on
1764 	 * a core while some accounting process is trying to get the map's size.
1765 	 * So this assert can't be enabled until all those places are uniform in
1766 	 * their use of the 'map->terminated' bit.
1767 	 *
1768 	 * assert(map_size >= reserved_size);
1769 	 */
1770 
1771 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1772 }
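/*
 * Illustrative example (hypothetical numbers): for an exotic map whose
 * map->size is 1GB and for which ml_get_vm_reserved_regions() reports a
 * single 16MB reserved region, this returns 1GB - 16MB.  If the reserved
 * size ever exceeded the map size, the raw map->size would be returned
 * instead.
 */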
1773 
1774 /*
1775  *	vm_map_entry_create:	[ internal use only ]
1776  *
1777  *	Allocates a VM map entry for insertion in the
1778  *	given map (or map copy).  No fields are filled.
1779  *
1780  *	The VM entry will be zero initialized, except for:
1781  *	- behavior set to VM_BEHAVIOR_DEFAULT
1782  *	- inheritance set to VM_INHERIT_DEFAULT
1783  */
1784 #define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)
1785 
1786 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1787 
1788 static vm_map_entry_t
1789 _vm_map_entry_create(
1790 	struct vm_map_header    *map_header __unused)
1791 {
1792 	vm_map_entry_t entry = NULL;
1793 
1794 	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1795 
1796 	/*
1797 	 * Help the compiler with what we know to be true,
1798 	 * so that the bitfield initializations further down have good codegen.
1799 	 *
1800 	 * See rdar://87041299
1801 	 */
1802 	__builtin_assume(entry->vme_object_value == 0);
1803 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1804 	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1805 
1806 	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1807 	    "VME_ALIAS_MASK covers tags");
1808 
1809 	static_assert(VM_BEHAVIOR_DEFAULT == 0,
1810 	    "can skip zeroing of the behavior field");
1811 	entry->inheritance = VM_INHERIT_DEFAULT;
1812 
1813 #if MAP_ENTRY_CREATION_DEBUG
1814 	entry->vme_creation_maphdr = map_header;
1815 	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1816 	    BTREF_GET_NOWAIT);
1817 #endif
1818 	return entry;
1819 }
1820 
1821 /*
1822  *	vm_map_entry_dispose:	[ internal use only ]
1823  *
1824  *	Inverse of vm_map_entry_create.
1825  *
1826  *	The map write lock is held, so there is no need to
1827  *	do anything special to ensure correctness
1828  *	of the stores.
1829  */
1830 static void
1831 vm_map_entry_dispose(
1832 	vm_map_entry_t          entry)
1833 {
1834 #if VM_BTLOG_TAGS
1835 	if (entry->vme_kernel_object) {
1836 		btref_put(entry->vme_tag_btref);
1837 	}
1838 #endif /* VM_BTLOG_TAGS */
1839 #if MAP_ENTRY_CREATION_DEBUG
1840 	btref_put(entry->vme_creation_bt);
1841 #endif
1842 #if MAP_ENTRY_INSERTION_DEBUG
1843 	btref_put(entry->vme_insertion_bt);
1844 #endif
1845 	zfree(vm_map_entry_zone, entry);
1846 }
1847 
1848 #define vm_map_copy_entry_dispose(copy_entry) \
1849 	vm_map_entry_dispose(copy_entry)
1850 
1851 static vm_map_entry_t
1852 vm_map_zap_first_entry(
1853 	vm_map_zap_t            list)
1854 {
1855 	return list->vmz_head;
1856 }
1857 
1858 static vm_map_entry_t
1859 vm_map_zap_last_entry(
1860 	vm_map_zap_t            list)
1861 {
1862 	assert(vm_map_zap_first_entry(list));
1863 	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1864 }
1865 
1866 static void
1867 vm_map_zap_append(
1868 	vm_map_zap_t            list,
1869 	vm_map_entry_t          entry)
1870 {
1871 	entry->vme_next = VM_MAP_ENTRY_NULL;
1872 	*list->vmz_tail = entry;
1873 	list->vmz_tail = &entry->vme_next;
1874 }
1875 
1876 static vm_map_entry_t
1877 vm_map_zap_pop(
1878 	vm_map_zap_t            list)
1879 {
1880 	vm_map_entry_t head = list->vmz_head;
1881 
1882 	if (head != VM_MAP_ENTRY_NULL &&
1883 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1884 		list->vmz_tail = &list->vmz_head;
1885 	}
1886 
1887 	return head;
1888 }
1889 
1890 static void
1891 vm_map_zap_dispose(
1892 	vm_map_zap_t            list)
1893 {
1894 	vm_map_entry_t          entry;
1895 
1896 	while ((entry = vm_map_zap_pop(list))) {
1897 		if (entry->is_sub_map) {
1898 			vm_map_deallocate(VME_SUBMAP(entry));
1899 		} else {
1900 			vm_object_deallocate(VME_OBJECT(entry));
1901 		}
1902 
1903 		vm_map_entry_dispose(entry);
1904 	}
1905 }
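/*
 * Illustrative sketch: a zap list defers entry teardown until after the map
 * lock has been dropped.  vm_map_destroy() below follows this pattern:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 */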
1906 
1907 #if MACH_ASSERT
1908 static boolean_t first_free_check = FALSE;
1909 boolean_t
1910 first_free_is_valid(
1911 	vm_map_t        map)
1912 {
1913 	if (!first_free_check) {
1914 		return TRUE;
1915 	}
1916 
1917 	return first_free_is_valid_store( map );
1918 }
1919 #endif /* MACH_ASSERT */
1920 
1921 
1922 #define vm_map_copy_entry_link(copy, after_where, entry)                \
1923 	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1924 
1925 #define vm_map_copy_entry_unlink(copy, entry)                           \
1926 	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1927 
1928 /*
1929  *	vm_map_destroy:
1930  *
1931  *	Actually destroy a map.
1932  */
1933 void
1934 vm_map_destroy(
1935 	vm_map_t        map)
1936 {
1937 	/* final cleanup: this is not allowed to fail */
1938 	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1939 
1940 	VM_MAP_ZAP_DECLARE(zap);
1941 
1942 	vm_map_lock(map);
1943 
1944 	map->terminated = true;
1945 	/* clean up regular map entries */
1946 	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1947 	    KMEM_GUARD_NONE, &zap);
1948 	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1949 	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1950 	    KMEM_GUARD_NONE, &zap);
1951 
1952 	vm_map_disable_hole_optimization(map);
1953 	vm_map_corpse_footprint_destroy(map);
1954 
1955 	vm_map_unlock(map);
1956 
1957 	vm_map_zap_dispose(&zap);
1958 
1959 	assert(map->hdr.nentries == 0);
1960 
1961 	if (map->pmap) {
1962 		pmap_destroy(map->pmap);
1963 	}
1964 
1965 	lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1966 
1967 #if CONFIG_MAP_RANGES
1968 	kfree_data(map->extra_ranges,
1969 	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
1970 #endif
1971 
1972 	zfree_id(ZONE_ID_VM_MAP, map);
1973 }
1974 
1975 /*
1976  * Returns pid of the task with the largest number of VM map entries.
1977  * Used in the zone-map-exhaustion jetsam path.
1978  */
1979 pid_t
1980 find_largest_process_vm_map_entries(void)
1981 {
1982 	pid_t victim_pid = -1;
1983 	int max_vm_map_entries = 0;
1984 	task_t task = TASK_NULL;
1985 	queue_head_t *task_list = &tasks;
1986 
1987 	lck_mtx_lock(&tasks_threads_lock);
1988 	queue_iterate(task_list, task, task_t, tasks) {
1989 		if (task == kernel_task || !task->active) {
1990 			continue;
1991 		}
1992 
1993 		vm_map_t task_map = task->map;
1994 		if (task_map != VM_MAP_NULL) {
1995 			int task_vm_map_entries = task_map->hdr.nentries;
1996 			if (task_vm_map_entries > max_vm_map_entries) {
1997 				max_vm_map_entries = task_vm_map_entries;
1998 				victim_pid = pid_from_task(task);
1999 			}
2000 		}
2001 	}
2002 	lck_mtx_unlock(&tasks_threads_lock);
2003 
2004 	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2005 	return victim_pid;
2006 }
2007 
2008 
2009 /*
2010  *	vm_map_lookup_entry:	[ internal use only ]
2011  *
2012  *	Calls into the vm map store layer to find the map
2013  *	entry containing (or immediately preceding) the
2014  *	specified address in the given map; the entry is returned
2015  *	in the "entry" parameter.  The boolean
2016  *	result indicates whether the address is
2017  *	actually contained in the map.
2018  */
2019 boolean_t
2020 vm_map_lookup_entry(
2021 	vm_map_t        map,
2022 	vm_map_offset_t address,
2023 	vm_map_entry_t  *entry)         /* OUT */
2024 {
2025 	bool result = false;
2026 
2027 #if CONFIG_KERNEL_TAGGING
2028 	if (VM_KERNEL_ADDRESS(address)) {
2029 		address = vm_memtag_canonicalize_address(address);
2030 	}
2031 #endif /* CONFIG_KERNEL_TAGGING */
2032 
2033 #if CONFIG_PROB_GZALLOC
2034 	if (map->pmap == kernel_pmap) {
2035 		assertf(!pgz_owned(address),
2036 		    "it is the responsibility of callers to unguard PGZ addresses");
2037 	}
2038 #endif /* CONFIG_PROB_GZALLOC */
2039 	result = vm_map_store_lookup_entry( map, address, entry );
2040 
2041 	return result;
2042 }
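/*
 * Illustrative sketch: callers hold the map lock and use the boolean result
 * to tell a hit from the preceding entry:
 *
 *	vm_map_entry_t entry;
 *
 *	vm_map_lock(map);
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		// "addr" lies within [entry->vme_start, entry->vme_end)
 *	} else {
 *		// "entry" immediately precedes the hole containing "addr"
 *	}
 *	vm_map_unlock(map);
 */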
2043 
2044 boolean_t
2045 vm_map_lookup_entry_or_next(
2046 	vm_map_t        map,
2047 	vm_map_offset_t address,
2048 	vm_map_entry_t  *entry)         /* OUT */
2049 {
2050 	if (vm_map_lookup_entry(map, address, entry)) {
2051 		return true;
2052 	}
2053 
2054 	*entry = (*entry)->vme_next;
2055 	return false;
2056 }
2057 
2058 #if CONFIG_PROB_GZALLOC
2059 boolean_t
2060 vm_map_lookup_entry_allow_pgz(
2061 	vm_map_t        map,
2062 	vm_map_offset_t address,
2063 	vm_map_entry_t  *entry)         /* OUT */
2064 {
2065 #if CONFIG_KERNEL_TAGGING
2066 	if (VM_KERNEL_ADDRESS(address)) {
2067 		address = vm_memtag_canonicalize_address(address);
2068 	}
2069 #endif /* CONFIG_KERNEL_TAGGING */
2070 
2071 	return vm_map_store_lookup_entry( map, address, entry );
2072 }
2073 #endif /* CONFIG_PROB_GZALLOC */
2074 
2075 /*
2076  *	Routine:	vm_map_range_invalid_panic
2077  *	Purpose:
2078  *			Panic on detection of an invalid range id.
2079  */
2080 __abortlike
2081 static void
2082 vm_map_range_invalid_panic(
2083 	vm_map_t                map,
2084 	vm_map_range_id_t       range_id)
2085 {
2086 	panic("invalid range ID (%u) for map %p", range_id, map);
2087 }
2088 
2089 /*
2090  *	Routine:	vm_map_get_range
2091  *	Purpose:
2092  *			Adjust bounds based on security policy.
2093  */
2094 static struct mach_vm_range
2095 vm_map_get_range(
2096 	vm_map_t                map,
2097 	vm_map_address_t       *address,
2098 	vm_map_kernel_flags_t  *vmk_flags,
2099 	vm_map_size_t           size,
2100 	bool                   *is_ptr)
2101 {
2102 	struct mach_vm_range effective_range = {};
2103 	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2104 
2105 	if (map == kernel_map) {
2106 		effective_range = kmem_ranges[range_id];
2107 
2108 		if (startup_phase >= STARTUP_SUB_KMEM) {
2109 			/*
2110 			 * The hint provided by the caller is zeroed because the range is
2111 			 * restricted to a subset of the entire kernel_map VA; a hint outside
2112 			 * that range would cause vm_map_store_find_space to fail.
2113 			 */
2114 			*address = 0ull;
2115 			/*
2116 			 * Ensure that the range_id passed in by the caller is within meaningful
2117 			 * bounds. A range_id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2118 			 * to fail, as the corresponding range is invalid. A range_id larger than
2119 			 * KMEM_RANGE_ID_MAX would lead to an out-of-bounds access.
2120 			 */
2121 			if ((range_id == KMEM_RANGE_ID_NONE) ||
2122 			    (range_id > KMEM_RANGE_ID_MAX)) {
2123 				vm_map_range_invalid_panic(map, range_id);
2124 			}
2125 
2126 			/*
2127 			 * Pointer ranges use kmem_locate_space to do allocations.
2128 			 *
2129 			 * Non pointer fronts look like [ Small | Large | Permanent ]
2130 			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2131 			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2132 			 * use the entire range.
2133 			 */
2134 			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2135 				*is_ptr = true;
2136 			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2137 				effective_range = kmem_large_ranges[range_id];
2138 			}
2139 		}
2140 #if CONFIG_MAP_RANGES
2141 	} else if (map->uses_user_ranges) {
2142 		switch (range_id) {
2143 		case UMEM_RANGE_ID_DEFAULT:
2144 			effective_range = map->default_range;
2145 			break;
2146 		case UMEM_RANGE_ID_HEAP:
2147 			effective_range = map->data_range;
2148 			break;
2149 		case UMEM_RANGE_ID_LARGE_FILE:
2150 			if (map->large_file_range.min_address != map->large_file_range.max_address) {
2151 				/* large file range is configured and should be used */
2152 				effective_range = map->large_file_range;
2153 			} else {
2154 				/*
2155 				 * the user asking for this user range might not have the
2156 				 * permissions to use the large file range (i.e., it doesn't
2157 				 * hold the correct entitlement), so we give it the data range
2158 				 * instead
2159 				 */
2160 				effective_range = map->data_range;
2161 			}
2162 			break;
2163 		case UMEM_RANGE_ID_FIXED:
2164 			/*
2165 			 * anywhere allocations with an address in "FIXED"
2166 			 * make no sense, so leave the range empty
2167 			 */
2168 			break;
2169 
2170 		default:
2171 			vm_map_range_invalid_panic(map, range_id);
2172 		}
2173 #endif /* CONFIG_MAP_RANGES */
2174 	} else {
2175 		/*
2176 		 * If the minimum is 0, bump it up by PAGE_SIZE.  We want to limit
2177 		 * allocations of PAGEZERO to explicit requests: its normal use is
2178 		 * to catch dereferences of NULL, and many applications also treat
2179 		 * pointers with a value of 0 as special, so suddenly having
2180 		 * address 0 contain usable memory would tend to confuse those
2181 		 * applications.
2182 		 */
2183 		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2184 		effective_range.max_address = map->max_offset;
2185 	}
2186 
2187 	return effective_range;
2188 }
2189 
2190 kern_return_t
2191 vm_map_locate_space_anywhere(
2192 	vm_map_t                map,
2193 	vm_map_size_t           size,
2194 	vm_map_offset_t         mask,
2195 	vm_map_kernel_flags_t   vmk_flags,
2196 	vm_map_offset_t        *start_inout,
2197 	vm_map_entry_t         *entry_out)
2198 {
2199 	struct mach_vm_range effective_range = {};
2200 	vm_map_size_t   guard_offset;
2201 	vm_map_offset_t hint, limit;
2202 	vm_map_entry_t  entry;
2203 	bool            is_kmem_ptr_range = false;
2204 
2205 	/*
2206 	 * Only supported by vm_map_enter() with a fixed address.
2207 	 */
2208 	assert(!vmk_flags.vmf_fixed);
2209 	assert(!vmk_flags.vmkf_beyond_max);
2210 
2211 	if (__improbable(map->wait_for_space)) {
2212 		/*
2213 		 * support for "wait_for_space" is minimal;
2214 		 * its only consumer is the ipc_kernel_copy_map.
2215 		 */
2216 		assert(!map->holelistenabled &&
2217 		    !vmk_flags.vmkf_last_free &&
2218 		    !vmk_flags.vmkf_keep_map_locked &&
2219 		    !vmk_flags.vmkf_map_jit &&
2220 		    !vmk_flags.vmf_random_addr &&
2221 		    *start_inout <= map->min_offset);
2222 	} else if (vmk_flags.vmkf_last_free) {
2223 		assert(!vmk_flags.vmkf_map_jit &&
2224 		    !vmk_flags.vmf_random_addr);
2225 	}
2226 
2227 	if (vmk_flags.vmkf_guard_before) {
2228 		guard_offset = VM_MAP_PAGE_SIZE(map);
2229 		assert(size > guard_offset);
2230 		size -= guard_offset;
2231 	} else {
2232 		assert(size != 0);
2233 		guard_offset = 0;
2234 	}
2235 
2236 	/*
2237 	 * Validate range_id from flags and get associated range
2238 	 */
2239 	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2240 	    &is_kmem_ptr_range);
2241 
2242 	if (is_kmem_ptr_range) {
2243 		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2244 		           vmk_flags.vmkf_last_free, start_inout, entry_out);
2245 	}
2246 
2247 #if XNU_TARGET_OS_OSX
2248 	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2249 		assert(map != kernel_map);
2250 		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2251 	}
2252 #endif /* XNU_TARGET_OS_OSX */
2253 
2254 again:
2255 	if (vmk_flags.vmkf_last_free) {
2256 		hint = *start_inout;
2257 
2258 		if (hint == 0 || hint > effective_range.max_address) {
2259 			hint = effective_range.max_address;
2260 		}
2261 		if (hint <= effective_range.min_address) {
2262 			return KERN_NO_SPACE;
2263 		}
2264 		limit = effective_range.min_address;
2265 	} else {
2266 		hint = *start_inout;
2267 
2268 		if (vmk_flags.vmkf_map_jit) {
2269 			if (map->jit_entry_exists &&
2270 			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2271 				return KERN_INVALID_ARGUMENT;
2272 			}
2273 			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2274 				vmk_flags.vmf_random_addr = true;
2275 			}
2276 		}
2277 
2278 		if (vmk_flags.vmf_random_addr) {
2279 			kern_return_t kr;
2280 
2281 			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2282 			if (kr != KERN_SUCCESS) {
2283 				return kr;
2284 			}
2285 		}
2286 #if __x86_64__
2287 		else if ((hint == 0 || hint == vm_map_min(map)) &&
2288 		    !map->disable_vmentry_reuse &&
2289 		    map->vmmap_high_start != 0) {
2290 			hint = map->vmmap_high_start;
2291 		}
2292 #endif /* __x86_64__ */
2293 
2294 		if (hint < effective_range.min_address) {
2295 			hint = effective_range.min_address;
2296 		}
2297 		if (effective_range.max_address <= hint) {
2298 			return KERN_NO_SPACE;
2299 		}
2300 
2301 		limit = effective_range.max_address;
2302 	}
2303 	entry = vm_map_store_find_space(map,
2304 	    hint, limit, vmk_flags.vmkf_last_free,
2305 	    guard_offset, size, mask,
2306 	    start_inout);
2307 
2308 	if (__improbable(entry == NULL)) {
2309 		if (map->wait_for_space &&
2310 		    guard_offset + size <=
2311 		    effective_range.max_address - effective_range.min_address) {
2312 			assert_wait((event_t)map, THREAD_ABORTSAFE);
2313 			vm_map_unlock(map);
2314 			thread_block(THREAD_CONTINUE_NULL);
2315 			vm_map_lock(map);
2316 			goto again;
2317 		}
2318 		return KERN_NO_SPACE;
2319 	}
2320 
2321 	if (entry_out) {
2322 		*entry_out = entry;
2323 	}
2324 	return KERN_SUCCESS;
2325 }
2326 
2327 /*!
2328  * @function vm_map_locate_space_fixed()
2329  *
2330  * @brief
2331  * Locate (no reservation) a range in the specified VM map at a fixed address.
2332  *
2333  * @param map           the map to scan for memory, must be locked.
2334  * @param start         the fixed address trying to be reserved
2335  * @param size          the size of the allocation to make.
2336  * @param mask          an alignment mask the allocation must respect,
2337  * @param vmk_flags     the vm map kernel flags to influence this call.
2338  *                      vmk_flags.vmf_anywhere must not be set.
2339  * @param entry_out     the entry right before the hole.
2340  * @param zap_list      a zap list of entries to clean up after the call.
2341  *
2342  * @returns
2343  * - KERN_SUCCESS in case of success and no conflicting entry is found,
2344  *   in which case entry_out is set to the entry before the hole.
2345  *
2346  * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2347  *   in which case entry_out is set to the conflicting entry;
2348  *   callers MUST handle this error explicitly.
2349  *
2350  * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2351  *   would result in a mapping outside of the map.
2352  *
2353  * - KERN_NO_SPACE for various cases of unrecoverable failures.
2354  */
2355 static kern_return_t
2356 vm_map_locate_space_fixed(
2357 	vm_map_t                map,
2358 	vm_map_offset_t         start,
2359 	vm_map_size_t           size,
2360 	vm_map_offset_t         mask,
2361 	vm_map_kernel_flags_t   vmk_flags,
2362 	vm_map_entry_t         *entry_out,
2363 	vm_map_zap_t            zap_list)
2364 {
2365 	vm_map_offset_t effective_min_offset, effective_max_offset;
2366 	vm_map_entry_t  entry;
2367 	vm_map_offset_t end;
2368 
2369 	assert(vmk_flags.vmf_fixed);
2370 
2371 	effective_min_offset = map->min_offset;
2372 	effective_max_offset = map->max_offset;
2373 
2374 	if (vmk_flags.vmkf_beyond_max) {
2375 		/*
2376 		 * Allow an insertion beyond the map's max offset.
2377 		 */
2378 		effective_max_offset = 0x00000000FFFFF000ULL;
2379 		if (vm_map_is_64bit(map)) {
2380 			effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2381 		}
2382 #if XNU_TARGET_OS_OSX
2383 	} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2384 		effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2385 #endif /* XNU_TARGET_OS_OSX */
2386 	}
2387 
2388 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2389 	    !vmk_flags.vmf_overwrite &&
2390 	    map->pmap == kernel_pmap &&
2391 	    vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2392 		/*
2393 		 * Force realloc() to switch to a new allocation,
2394 		 * to prevent 4k-fragmented virtual ranges.
2395 		 */
2396 //		DEBUG4K_ERROR("no realloc in place");
2397 		return KERN_NO_SPACE;
2398 	}
2399 
2400 	/*
2401 	 *	Verify that:
2402 	 *		the address doesn't itself violate
2403 	 *		the mask requirement.
2404 	 */
2405 
2406 	if ((start & mask) != 0) {
2407 		return KERN_NO_SPACE;
2408 	}
2409 
2410 #if CONFIG_MAP_RANGES
2411 	if (map->uses_user_ranges) {
2412 		struct mach_vm_range r;
2413 
2414 		vm_map_user_range_resolve(map, start, 1, &r);
2415 		if (r.max_address == 0) {
2416 			return KERN_INVALID_ADDRESS;
2417 		}
2418 		effective_min_offset = r.min_address;
2419 		effective_max_offset = r.max_address;
2420 	}
2421 #endif /* CONFIG_MAP_RANGES */
2422 
2423 	if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2424 	    (map == kernel_map)) {
2425 		mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2426 		effective_min_offset = r->min_address;
2427 		effective_max_offset = r->max_address;
2428 	}
2429 
2430 	/*
2431 	 *	...	the address is within bounds
2432 	 */
2433 
2434 	end = start + size;
2435 
2436 	if ((start < effective_min_offset) ||
2437 	    (end > effective_max_offset) ||
2438 	    (start >= end)) {
2439 		return KERN_INVALID_ADDRESS;
2440 	}
2441 
2442 	if (vmk_flags.vmf_overwrite) {
2443 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2444 		kern_return_t remove_kr;
2445 
2446 		/*
2447 		 * Fixed mapping and "overwrite" flag: attempt to
2448 		 * remove all existing mappings in the specified
2449 		 * address range, saving them in our "zap_list".
2450 		 *
2451 		 * This avoids releasing the VM map lock in
2452 		 * vm_map_entry_delete() and allows atomicity
2453 		 * when we want to replace some mappings with a new one.
2454 		 * It also allows us to restore the old VM mappings if the
2455 		 * new mapping fails.
2456 		 */
2457 		remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2458 
2459 		if (vmk_flags.vmkf_overwrite_immutable) {
2460 			/* we can overwrite immutable mappings */
2461 			remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2462 		}
2463 		if (vmk_flags.vmkf_remap_prot_copy) {
2464 			remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2465 		}
2466 		remove_kr = vm_map_delete(map, start, end, remove_flags,
2467 		    KMEM_GUARD_NONE, zap_list).kmr_return;
2468 		if (remove_kr) {
2469 			/* XXX FBDP restore zap_list? */
2470 			return remove_kr;
2471 		}
2472 	}
2473 
2474 	/*
2475 	 *	...	the starting address isn't allocated
2476 	 */
2477 
2478 	if (vm_map_lookup_entry(map, start, &entry)) {
2479 		*entry_out = entry;
2480 		return KERN_MEMORY_PRESENT;
2481 	}
2482 
2483 	/*
2484 	 *	...	the next region doesn't overlap the
2485 	 *		end point.
2486 	 */
2487 
2488 	if ((entry->vme_next != vm_map_to_entry(map)) &&
2489 	    (entry->vme_next->vme_start < end)) {
2490 		return KERN_NO_SPACE;
2491 	}
2492 
2493 	*entry_out = entry;
2494 	return KERN_SUCCESS;
2495 }
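/*
 * Illustrative sketch: vm_map_enter() below is the canonical caller and
 * handles the KERN_MEMORY_PRESENT case explicitly, along these lines:
 *
 *	result = vm_map_locate_space_fixed(map, start, size, mask,
 *	    vmk_flags, &entry, &zap_old_list);
 *	if (result == KERN_MEMORY_PRESENT) {
 *		// only acceptable when vmkf_already was requested
 *	}
 */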
2496 
2497 /*
2498  *	Routine:	vm_map_find_space
2499  *	Purpose:
2500  *		Allocate a range in the specified virtual address map,
2501  *		returning the entry allocated for that range.
2502  *		Used by kmem_alloc, etc.
2503  *
2504  *		The map must NOT be locked. It will be returned locked
2505  *		on KERN_SUCCESS, unlocked on failure.
2506  *
2507  *		If an entry is allocated, the object/offset fields
2508  *		are initialized to zero.
2509  */
2510 kern_return_t
2511 vm_map_find_space(
2512 	vm_map_t                map,
2513 	vm_map_offset_t         hint_address,
2514 	vm_map_size_t           size,
2515 	vm_map_offset_t         mask,
2516 	vm_map_kernel_flags_t   vmk_flags,
2517 	vm_map_entry_t          *o_entry)       /* OUT */
2518 {
2519 	vm_map_entry_t          new_entry, entry;
2520 	kern_return_t           kr;
2521 
2522 	if (size == 0) {
2523 		return KERN_INVALID_ARGUMENT;
2524 	}
2525 
2526 	new_entry = vm_map_entry_create(map);
2527 	new_entry->use_pmap = true;
2528 	new_entry->protection = VM_PROT_DEFAULT;
2529 	new_entry->max_protection = VM_PROT_ALL;
2530 
2531 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2532 		new_entry->map_aligned = true;
2533 	}
2534 	if (vmk_flags.vmf_permanent) {
2535 		new_entry->vme_permanent = true;
2536 	}
2537 
2538 	vm_map_lock(map);
2539 
2540 	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2541 	    &hint_address, &entry);
2542 	if (kr != KERN_SUCCESS) {
2543 		vm_map_unlock(map);
2544 		vm_map_entry_dispose(new_entry);
2545 		return kr;
2546 	}
2547 	new_entry->vme_start = hint_address;
2548 	new_entry->vme_end = hint_address + size;
2549 
2550 	/*
2551 	 *	At this point,
2552 	 *
2553 	 *	- new_entry's "vme_start" and "vme_end" should define
2554 	 *	  the endpoints of the available new range,
2555 	 *
2556 	 *	- and "entry" should refer to the region before
2557 	 *	  the new range,
2558 	 *
2559 	 *	- and the map should still be locked.
2560 	 */
2561 
2562 	assert(page_aligned(new_entry->vme_start));
2563 	assert(page_aligned(new_entry->vme_end));
2564 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2565 	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2566 
2567 	/*
2568 	 *	Insert the new entry into the list
2569 	 */
2570 
2571 	vm_map_store_entry_link(map, entry, new_entry,
2572 	    VM_MAP_KERNEL_FLAGS_NONE);
2573 	map->size += size;
2574 
2575 	/*
2576 	 *	Update the lookup hint
2577 	 */
2578 	SAVE_HINT_MAP_WRITE(map, new_entry);
2579 
2580 	*o_entry = new_entry;
2581 	return KERN_SUCCESS;
2582 }
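/*
 * Illustrative sketch (hypothetical caller): kmem-style allocators use this
 * roughly as follows; note that the map comes back locked on success:
 *
 *	vm_map_entry_t entry;
 *
 *	if (vm_map_find_space(kernel_map, 0, size, 0, vmk_flags,
 *	    &entry) == KERN_SUCCESS) {
 *		// fill in VME_OBJECT()/VME_OFFSET() for "entry", then:
 *		vm_map_unlock(kernel_map);
 *	}
 */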
2583 
2584 int vm_map_pmap_enter_print = FALSE;
2585 int vm_map_pmap_enter_enable = FALSE;
2586 
2587 /*
2588  *	Routine:	vm_map_pmap_enter [internal only]
2589  *
2590  *	Description:
2591  *		Force pages from the specified object to be entered into
2592  *		the pmap at the specified address if they are present.
2593  *		As soon as a page not found in the object the scan ends.
2594  *		As soon as a page is not found in the object, the scan ends.
2595  *	Returns:
2596  *		Nothing.
2597  *
2598  *	In/out conditions:
2599  *		The source map should not be locked on entry.
2600  */
2601 __unused static void
2602 vm_map_pmap_enter(
2603 	vm_map_t                map,
2604 	vm_map_offset_t         addr,
2605 	vm_map_offset_t         end_addr,
2606 	vm_object_t             object,
2607 	vm_object_offset_t      offset,
2608 	vm_prot_t               protection)
2609 {
2610 	int                     type_of_fault;
2611 	kern_return_t           kr;
2612 	uint8_t                 object_lock_type = 0;
2613 	struct vm_object_fault_info fault_info = {};
2614 
2615 	if (map->pmap == 0) {
2616 		return;
2617 	}
2618 
2619 	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2620 
2621 	while (addr < end_addr) {
2622 		vm_page_t       m;
2623 
2624 
2625 		/*
2626 		 * TODO:
2627 		 * From vm_map_enter(), we come into this function without the map
2628 		 * lock held or the object lock held.
2629 		 * We haven't taken a reference on the object either.
2630 		 * We should do a proper lookup on the map to make sure
2631 		 * that things are sane before we go locking objects that
2632 		 * could have been deallocated from under us.
2633 		 */
2634 
2635 		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2636 		vm_object_lock(object);
2637 
2638 		m = vm_page_lookup(object, offset);
2639 
2640 		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2641 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2642 			vm_object_unlock(object);
2643 			return;
2644 		}
2645 
2646 		if (vm_map_pmap_enter_print) {
2647 			printf("vm_map_pmap_enter:");
2648 			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2649 			    map, (unsigned long long)addr, object, (unsigned long long)offset);
2650 		}
2651 		type_of_fault = DBG_CACHE_HIT_FAULT;
2652 		kr = vm_fault_enter(m, map->pmap,
2653 		    addr,
2654 		    PAGE_SIZE, 0,
2655 		    protection, protection,
2656 		    VM_PAGE_WIRED(m),
2657 		    FALSE,                 /* change_wiring */
2658 		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
2659 		    &fault_info,
2660 		    NULL,                  /* need_retry */
2661 		    &type_of_fault,
2662 		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2663 
2664 		vm_object_unlock(object);
2665 
2666 		offset += PAGE_SIZE_64;
2667 		addr += PAGE_SIZE;
2668 	}
2669 }
2670 
2671 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2672 static kern_return_t
2673 vm_map_random_address_for_size(
2674 	vm_map_t                map,
2675 	vm_map_offset_t        *address,
2676 	vm_map_size_t           size,
2677 	vm_map_kernel_flags_t   vmk_flags)
2678 {
2679 	kern_return_t   kr = KERN_SUCCESS;
2680 	int             tries = 0;
2681 	vm_map_offset_t random_addr = 0;
2682 	vm_map_offset_t hole_end;
2683 
2684 	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
2685 	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
2686 	vm_map_size_t   vm_hole_size = 0;
2687 	vm_map_size_t   addr_space_size;
2688 	bool            is_kmem_ptr;
2689 	struct mach_vm_range effective_range;
2690 
2691 	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2692 	    &is_kmem_ptr);
2693 
2694 	addr_space_size = effective_range.max_address - effective_range.min_address;
2695 	if (size >= addr_space_size) {
2696 		return KERN_NO_SPACE;
2697 	}
2698 	addr_space_size -= size;
2699 
2700 	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2701 
2702 	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2703 		if (startup_phase < STARTUP_SUB_ZALLOC) {
2704 			random_addr = (vm_map_offset_t)early_random();
2705 		} else {
2706 			random_addr = (vm_map_offset_t)random();
2707 		}
2708 		random_addr <<= VM_MAP_PAGE_SHIFT(map);
2709 		random_addr = vm_map_trunc_page(
2710 			effective_range.min_address + (random_addr % addr_space_size),
2711 			VM_MAP_PAGE_MASK(map));
2712 
2713 #if CONFIG_PROB_GZALLOC
2714 		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2715 			continue;
2716 		}
2717 #endif /* CONFIG_PROB_GZALLOC */
2718 
2719 		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2720 			if (prev_entry == vm_map_to_entry(map)) {
2721 				next_entry = vm_map_first_entry(map);
2722 			} else {
2723 				next_entry = prev_entry->vme_next;
2724 			}
2725 			if (next_entry == vm_map_to_entry(map)) {
2726 				hole_end = vm_map_max(map);
2727 			} else {
2728 				hole_end = next_entry->vme_start;
2729 			}
2730 			vm_hole_size = hole_end - random_addr;
2731 			if (vm_hole_size >= size) {
2732 				*address = random_addr;
2733 				break;
2734 			}
2735 		}
2736 		tries++;
2737 	}
2738 
2739 	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2740 		kr = KERN_NO_SPACE;
2741 	}
2742 	return kr;
2743 }
2744 
2745 static boolean_t
2746 vm_memory_malloc_no_cow(
2747 	int alias)
2748 {
2749 	uint64_t alias_mask;
2750 
2751 	if (!malloc_no_cow) {
2752 		return FALSE;
2753 	}
2754 	if (alias > 63) {
2755 		return FALSE;
2756 	}
2757 	alias_mask = 1ULL << alias;
2758 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2759 		return TRUE;
2760 	}
2761 	return FALSE;
2762 }
2763 
2764 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2765 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2766 /*
2767  *	Routine:	vm_map_enter
2768  *
2769  *	Description:
2770  *		Allocate a range in the specified virtual address map.
2771  *		The resulting range will refer to memory defined by
2772  *		the given memory object and offset into that object.
2773  *
2774  *		Arguments are as defined in the vm_map call.
2775  */
2776 static unsigned int vm_map_enter_restore_successes = 0;
2777 static unsigned int vm_map_enter_restore_failures = 0;
2778 kern_return_t
2779 vm_map_enter(
2780 	vm_map_t                map,
2781 	vm_map_offset_t         *address,       /* IN/OUT */
2782 	vm_map_size_t           size,
2783 	vm_map_offset_t         mask,
2784 	vm_map_kernel_flags_t   vmk_flags,
2785 	vm_object_t             object,
2786 	vm_object_offset_t      offset,
2787 	boolean_t               needs_copy,
2788 	vm_prot_t               cur_protection,
2789 	vm_prot_t               max_protection,
2790 	vm_inherit_t            inheritance)
2791 {
2792 	vm_map_entry_t          entry, new_entry;
2793 	vm_map_offset_t         start, tmp_start, tmp_offset;
2794 	vm_map_offset_t         end, tmp_end;
2795 	vm_map_offset_t         tmp2_start, tmp2_end;
2796 	vm_map_offset_t         step;
2797 	kern_return_t           result = KERN_SUCCESS;
2798 	bool                    map_locked = FALSE;
2799 	bool                    pmap_empty = TRUE;
2800 	bool                    new_mapping_established = FALSE;
2801 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2802 	const bool              anywhere = !vmk_flags.vmf_fixed;
2803 	const bool              purgable = vmk_flags.vmf_purgeable;
2804 	const bool              no_cache = vmk_flags.vmf_no_cache;
2805 	const bool              is_submap = vmk_flags.vmkf_submap;
2806 	const bool              permanent = vmk_flags.vmf_permanent;
2807 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2808 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2809 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2810 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2811 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2812 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2813 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2814 	const vm_tag_t          alias = vmk_flags.vm_tag;
2815 	vm_tag_t                user_alias;
2816 	kern_return_t           kr;
2817 	bool                    clear_map_aligned = FALSE;
2818 	vm_map_size_t           chunk_size = 0;
2819 	vm_object_t             caller_object;
2820 	VM_MAP_ZAP_DECLARE(zap_old_list);
2821 	VM_MAP_ZAP_DECLARE(zap_new_list);
2822 
2823 	caller_object = object;
2824 
2825 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2826 
2827 	if (vmk_flags.vmf_4gb_chunk) {
2828 #if defined(__LP64__)
2829 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2830 #else /* __LP64__ */
2831 		chunk_size = ANON_CHUNK_SIZE;
2832 #endif /* __LP64__ */
2833 	} else {
2834 		chunk_size = ANON_CHUNK_SIZE;
2835 	}
2836 
2837 
2838 
2839 	if (superpage_size) {
2840 		if (object != VM_OBJECT_NULL) {
2841 			/* caller can't provide their own VM object */
2842 			return KERN_INVALID_ARGUMENT;
2843 		}
2844 		switch (superpage_size) {
2845 			/*
2846 			 * Note that the current implementation only supports
2847 			 * a single size for superpages, SUPERPAGE_SIZE, per
2848 			 * architecture. Once more sizes are to be supported,
2849 			 * SUPERPAGE_SIZE has to be replaced with a lookup of the
2850 			 * size depending on superpage_size.
2851 			 */
2852 #ifdef __x86_64__
2853 		case SUPERPAGE_SIZE_ANY:
2854 			/* handle it like 2 MB and round up to page size */
2855 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2856 			OS_FALLTHROUGH;
2857 		case SUPERPAGE_SIZE_2MB:
2858 			break;
2859 #endif
2860 		default:
2861 			return KERN_INVALID_ARGUMENT;
2862 		}
2863 		mask = SUPERPAGE_SIZE - 1;
2864 		if (size & (SUPERPAGE_SIZE - 1)) {
2865 			return KERN_INVALID_ARGUMENT;
2866 		}
2867 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2868 	}
2869 
2870 
2871 	if ((cur_protection & VM_PROT_WRITE) &&
2872 	    (cur_protection & VM_PROT_EXECUTE) &&
2873 #if XNU_TARGET_OS_OSX
2874 	    map->pmap != kernel_pmap &&
2875 	    (cs_process_global_enforcement() ||
2876 	    (vmk_flags.vmkf_cs_enforcement_override
2877 	    ? vmk_flags.vmkf_cs_enforcement
2878 	    : (vm_map_cs_enforcement(map)
2879 #if __arm64__
2880 	    || !VM_MAP_IS_EXOTIC(map)
2881 #endif /* __arm64__ */
2882 	    ))) &&
2883 #endif /* XNU_TARGET_OS_OSX */
2884 #if CODE_SIGNING_MONITOR
2885 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2886 #endif
2887 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2888 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2889 	    !entry_for_jit) {
2890 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2891 
2892 		DTRACE_VM3(cs_wx,
2893 		    uint64_t, 0,
2894 		    uint64_t, 0,
2895 		    vm_prot_t, cur_protection);
2896 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2897 		    proc_selfpid(),
2898 		    (get_bsdtask_info(current_task())
2899 		    ? proc_name_address(get_bsdtask_info(current_task()))
2900 		    : "?"),
2901 		    __FUNCTION__,
2902 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2903 		cur_protection &= ~VM_PROT_EXECUTE;
2904 		if (vm_protect_wx_fail) {
2905 			return KERN_PROTECTION_FAILURE;
2906 		}
2907 	}
2908 
2909 	if (entry_for_jit
2910 	    && cur_protection != VM_PROT_ALL) {
2911 		/*
2912 		 * Native macOS processes and all non-macOS processes are
2913 		 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2914 		 * the RWX requirement was not enforced, and thus, we must live
2915 		 * with our sins. We are now dealing with a JIT mapping without
2916 		 * RWX.
2917 		 *
2918 		 * We deal with these by letting the MAP_JIT stick in order
2919 		 * to avoid CS violations when these pages are mapped executable
2920 		 * down the line. In order to appease the page table monitor (you
2921 		 * know what I'm talking about), these pages will end up being
2922 		 * marked as XNU_USER_DEBUG, which will be allowed because we
2923 		 * don't enforce the code signing monitor on macOS systems. If
2924 		 * the user-space application ever changes permissions to RWX,
2925 		 * which they are allowed to since the mapping was originally
2926 		 * created with MAP_JIT, then they'll switch over to using the
2927 		 * XNU_USER_JIT type, and won't be allowed to downgrade any
2928 		 * more after that.
2929 		 *
2930 		 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2931 		 * strictly disallowed.
2932 		 */
2933 
2934 #if XNU_TARGET_OS_OSX
2935 		/*
2936 		 * Continue to allow non-RWX JIT
2937 		 */
2938 #else
2939 		/* non-macOS: reject JIT regions without RWX */
2940 		DTRACE_VM3(cs_wx,
2941 		    uint64_t, 0,
2942 		    uint64_t, 0,
2943 		    vm_prot_t, cur_protection);
2944 		printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2945 		    proc_selfpid(),
2946 		    (get_bsdtask_info(current_task())
2947 		    ? proc_name_address(get_bsdtask_info(current_task()))
2948 		    : "?"),
2949 		    __FUNCTION__,
2950 		    cur_protection);
2951 		return KERN_PROTECTION_FAILURE;
2952 #endif
2953 	}
2954 
2955 	/*
2956 	 * If the task has requested executable lockdown,
2957 	 * deny any new executable mapping.
2958 	 */
2959 	if (map->map_disallow_new_exec == TRUE) {
2960 		if (cur_protection & VM_PROT_EXECUTE) {
2961 			return KERN_PROTECTION_FAILURE;
2962 		}
2963 	}
2964 
2965 	if (resilient_codesign) {
2966 		assert(!is_submap);
2967 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2968 		if ((cur_protection | max_protection) & reject_prot) {
2969 			return KERN_PROTECTION_FAILURE;
2970 		}
2971 	}
2972 
2973 	if (resilient_media) {
2974 		assert(!is_submap);
2975 //		assert(!needs_copy);
2976 		if (object != VM_OBJECT_NULL &&
2977 		    !object->internal) {
2978 			/*
2979 			 * This mapping is directly backed by an external
2980 			 * memory manager (e.g. a vnode pager for a file):
2981 			 * we would not have any safe place to inject
2982 			 * a zero-filled page if an actual page is not
2983 			 * available, without possibly impacting the actual
2984 			 * contents of the mapped object (e.g. the file),
2985 			 * so we can't provide any media resiliency here.
2986 			 */
2987 			return KERN_INVALID_ARGUMENT;
2988 		}
2989 	}
2990 
2991 	if (entry_for_tpro) {
2992 		/*
2993 		 * TPRO overrides the effective permissions of the region
2994 		 * and explicitly maps as RW. Ensure we have been passed
2995 		 * the expected permissions. We accept a `cur_protection` of
2996 		 * RO as that will be handled on fault.
2997 		 */
2998 		if (!(max_protection & VM_PROT_READ) ||
2999 		    !(max_protection & VM_PROT_WRITE) ||
3000 		    !(cur_protection & VM_PROT_READ)) {
3001 			return KERN_PROTECTION_FAILURE;
3002 		}
3003 
3004 		/*
3005 		 * We can now downgrade the cur_protection to RO. This is a mild lie
3006 		 * to the VM layer. But TPRO will be responsible for toggling the
3007 		 * protections between RO/RW.
3008 		 */
3009 		cur_protection = VM_PROT_READ;
3010 	}
3011 
3012 	if (is_submap) {
3013 		vm_map_t submap;
3014 		if (purgable) {
3015 			/* submaps can not be purgeable */
3016 			return KERN_INVALID_ARGUMENT;
3017 		}
3018 		if (object == VM_OBJECT_NULL) {
3019 			/* submaps can not be created lazily */
3020 			return KERN_INVALID_ARGUMENT;
3021 		}
3022 		submap = (vm_map_t) object;
3023 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3024 			/* page size mismatch */
3025 			return KERN_INVALID_ARGUMENT;
3026 		}
3027 	}
3028 	if (vmk_flags.vmkf_already) {
3029 		/*
3030 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3031 		 * is already present.  For it to be meaningful, the requested
3032 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3033 		 * we shouldn't try to remove what was mapped there first
3034 		 * (!VM_FLAGS_OVERWRITE).
3035 		 */
3036 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3037 			return KERN_INVALID_ARGUMENT;
3038 		}
3039 	}
3040 
3041 	if (size == 0 ||
3042 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3043 		*address = 0;
3044 		return KERN_INVALID_ARGUMENT;
3045 	}
3046 
3047 	if (map->pmap == kernel_pmap) {
3048 		user_alias = VM_KERN_MEMORY_NONE;
3049 	} else {
3050 		user_alias = alias;
3051 	}
3052 
3053 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3054 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3055 	}
3056 
3057 #define RETURN(value)   { result = value; goto BailOut; }
3058 
3059 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3060 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3061 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3062 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3063 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3064 	}
3065 
3066 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3067 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3068 		/*
3069 		 * In most cases, the caller rounds the size up to the
3070 		 * map's page size.
3071 		 * If we get a size that is explicitly not map-aligned here,
3072 		 * we'll have to respect the caller's wish and mark the
3073 		 * mapping as "not map-aligned" to avoid tripping the
3074 		 * map alignment checks later.
3075 		 */
3076 		clear_map_aligned = TRUE;
3077 	}
3078 	if (!anywhere &&
3079 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3080 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3081 		/*
3082 		 * We've been asked to map at a fixed address and that
3083 		 * address is not aligned to the map's specific alignment.
3084 		 * The caller should know what it's doing (i.e. most likely
3085 		 * mapping some fragmented copy map, transferring memory from
3086 		 * a VM map with a different alignment), so clear map_aligned
3087 		 * for this new VM map entry and proceed.
3088 		 */
3089 		clear_map_aligned = TRUE;
3090 	}
3091 
3092 	/*
3093 	 * Only zero-fill objects are allowed to be purgable.
3094 	 * LP64todo - limit purgable objects to 32-bits for now
3095 	 */
3096 	if (purgable &&
3097 	    (offset != 0 ||
3098 	    (object != VM_OBJECT_NULL &&
3099 	    (object->vo_size != size ||
3100 	    object->purgable == VM_PURGABLE_DENY))
3101 #if __LP64__
3102 	    || size > ANON_MAX_SIZE
3103 #endif
3104 	    )) {
3105 		return KERN_INVALID_ARGUMENT;
3106 	}
3107 
3108 	vm_map_lock(map);
3109 	map_locked = TRUE;
3110 
3111 	if (anywhere) {
3112 		result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3113 		    address, &entry);
3114 		start = *address;
3115 	} else {
3116 		start = *address;
3117 		result = vm_map_locate_space_fixed(map, start, size, mask,
3118 		    vmk_flags, &entry, &zap_old_list);
3119 	}
3120 
3121 	end = start + size;
3122 
3123 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3124 
3125 	/*
3126 	 * Check if what's already there is what we want.
3127 	 */
3128 	if (result == KERN_MEMORY_PRESENT) {
3129 		assert(!anywhere);
3130 		if (!(vmk_flags.vmkf_already)) {
3131 			RETURN(KERN_NO_SPACE);
3132 		}
3133 		tmp_start = start;
3134 		tmp_offset = offset;
3135 		if (entry->vme_start < start) {
3136 			tmp_start -= start - entry->vme_start;
3137 			tmp_offset -= start - entry->vme_start;
3138 		}
3139 		for (; entry->vme_start < end;
3140 		    entry = entry->vme_next) {
3141 			/*
3142 			 * Check if the mapping's attributes
3143 			 * match the existing map entry.
3144 			 */
3145 			if (entry == vm_map_to_entry(map) ||
3146 			    entry->vme_start != tmp_start ||
3147 			    entry->is_sub_map != is_submap ||
3148 			    VME_OFFSET(entry) != tmp_offset ||
3149 			    entry->needs_copy != needs_copy ||
3150 			    entry->protection != cur_protection ||
3151 			    entry->max_protection != max_protection ||
3152 			    entry->inheritance != inheritance ||
3153 			    entry->iokit_acct != iokit_acct ||
3154 			    VME_ALIAS(entry) != alias) {
3155 				/* not the same mapping ! */
3156 				RETURN(KERN_NO_SPACE);
3157 			}
3158 			/*
3159 			 * Check if the same object is being mapped.
3160 			 */
3161 			if (is_submap) {
3162 				if (VME_SUBMAP(entry) !=
3163 				    (vm_map_t) object) {
3164 					/* not the same submap */
3165 					RETURN(KERN_NO_SPACE);
3166 				}
3167 			} else {
3168 				if (VME_OBJECT(entry) != object) {
3169 					/* not the same VM object... */
3170 					vm_object_t obj2;
3171 
3172 					obj2 = VME_OBJECT(entry);
3173 					if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3174 					    (object == VM_OBJECT_NULL || object->internal)) {
3175 						/*
3176 						 * ... but both are
3177 						 * anonymous memory,
3178 						 * so equivalent.
3179 						 */
3180 					} else {
3181 						RETURN(KERN_NO_SPACE);
3182 					}
3183 				}
3184 			}
3185 
3186 			tmp_offset += entry->vme_end - entry->vme_start;
3187 			tmp_start += entry->vme_end - entry->vme_start;
3188 			if (entry->vme_end >= end) {
3189 				/* reached the end of our mapping */
3190 				break;
3191 			}
3192 		}
3193 		/* it all matches:  let's use what's already there ! */
3194 		RETURN(KERN_MEMORY_PRESENT);
3195 	}
3196 
3197 	if (result != KERN_SUCCESS) {
3198 		goto BailOut;
3199 	}
3200 
3201 
3202 	/*
3203 	 *	At this point,
3204 	 *		"start" and "end" should define the endpoints of the
3205 	 *			available new range, and
3206 	 *		"entry" should refer to the region before the new
3207 	 *			range, and
3208 	 *
3209 	 *		the map should be locked.
3210 	 */
3211 
3212 	/*
3213 	 *	See whether we can avoid creating a new entry (and object) by
3214 	 *	extending one of our neighbors.  [So far, we only attempt to
3215 	 *	extend from below.]  Note that we can never extend/join
3216 	 *	purgable objects because they need to remain distinct
3217 	 *	entities in order to implement their "volatile object"
3218 	 *	semantics.
3219 	 */
3220 
3221 	if (purgable ||
3222 	    entry_for_jit ||
3223 	    entry_for_tpro ||
3224 	    vm_memory_malloc_no_cow(user_alias)) {
3225 		if (superpage_size) {
3226 			/*
3227 			 * For "super page" allocations, we will allocate
3228 			 * special physically-contiguous VM objects later on,
3229 			 * so we should not have flags instructing us to create
3230 			 * a differently special VM object here.
3231 			 */
3232 			RETURN(KERN_INVALID_ARGUMENT);
3233 		}
3234 
3235 		if (object == VM_OBJECT_NULL) {
3236 			assert(!superpage_size);
3237 			object = vm_object_allocate(size);
3238 			vm_object_lock(object);
3239 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3240 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3241 			if (malloc_no_cow_except_fork &&
3242 			    !purgable &&
3243 			    !entry_for_jit &&
3244 			    !entry_for_tpro &&
3245 			    vm_memory_malloc_no_cow(user_alias)) {
3246 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3247 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3248 			}
3249 			if (entry_for_jit) {
3250 				object->vo_inherit_copy_none = true;
3251 			}
3252 			if (purgable) {
3253 				task_t owner;
3254 				VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3255 				if (map->pmap == kernel_pmap) {
3256 					/*
3257 					 * Purgeable mappings made in a kernel
3258 					 * map are "owned" by the kernel itself
3259 					 * rather than the current user task
3260 					 * because they're likely to be used by
3261 					 * more than this user task (see
3262 					 * execargs_purgeable_allocate(), for
3263 					 * example).
3264 					 */
3265 					owner = kernel_task;
3266 				} else {
3267 					owner = current_task();
3268 				}
3269 				assert(object->vo_owner == NULL);
3270 				assert(object->resident_page_count == 0);
3271 				assert(object->wired_page_count == 0);
3272 				vm_purgeable_nonvolatile_enqueue(object, owner);
3273 			}
3274 			vm_object_unlock(object);
3275 			offset = (vm_object_offset_t)0;
3276 		}
3277 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3278 		/* no coalescing if address space uses sub-pages */
3279 	} else if ((is_submap == FALSE) &&
3280 	    (object == VM_OBJECT_NULL) &&
3281 	    (entry != vm_map_to_entry(map)) &&
3282 	    (entry->vme_end == start) &&
3283 	    (!entry->is_shared) &&
3284 	    (!entry->is_sub_map) &&
3285 	    (!entry->in_transition) &&
3286 	    (!entry->needs_wakeup) &&
3287 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3288 	    (entry->protection == cur_protection) &&
3289 	    (entry->max_protection == max_protection) &&
3290 	    (entry->inheritance == inheritance) &&
3291 	    ((user_alias == VM_MEMORY_REALLOC) ||
3292 	    (VME_ALIAS(entry) == alias)) &&
3293 	    (entry->no_cache == no_cache) &&
3294 	    (entry->vme_permanent == permanent) &&
3295 	    /* no coalescing for immutable executable mappings */
3296 	    !((entry->protection & VM_PROT_EXECUTE) &&
3297 	    entry->vme_permanent) &&
3298 	    (!entry->superpage_size && !superpage_size) &&
3299 	    /*
3300 	     * No coalescing if not map-aligned, to avoid propagating
3301 	     * that condition any further than needed:
3302 	     */
3303 	    (!entry->map_aligned || !clear_map_aligned) &&
3304 	    (!entry->zero_wired_pages) &&
3305 	    (!entry->used_for_jit && !entry_for_jit) &&
3306 #if __arm64e__
3307 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3308 #endif
3309 	    (!entry->csm_associated) &&
3310 	    (entry->iokit_acct == iokit_acct) &&
3311 	    (!entry->vme_resilient_codesign) &&
3312 	    (!entry->vme_resilient_media) &&
3313 	    (!entry->vme_atomic) &&
3314 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3315 
3316 	    ((entry->vme_end - entry->vme_start) + size <=
3317 	    (user_alias == VM_MEMORY_REALLOC ?
3318 	    ANON_CHUNK_SIZE :
3319 	    NO_COALESCE_LIMIT)) &&
3320 
3321 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3322 		if (vm_object_coalesce(VME_OBJECT(entry),
3323 		    VM_OBJECT_NULL,
3324 		    VME_OFFSET(entry),
3325 		    (vm_object_offset_t) 0,
3326 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3327 		    (vm_map_size_t)(end - entry->vme_end))) {
3328 			/*
3329 			 *	Coalesced the two objects - can extend
3330 			 *	the previous map entry to include the
3331 			 *	new range.
3332 			 */
3333 			map->size += (end - entry->vme_end);
3334 			assert(entry->vme_start < end);
3335 			assert(VM_MAP_PAGE_ALIGNED(end,
3336 			    VM_MAP_PAGE_MASK(map)));
3337 			if (__improbable(vm_debug_events)) {
3338 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3339 			}
3340 			entry->vme_end = end;
3341 			if (map->holelistenabled) {
3342 				vm_map_store_update_first_free(map, entry, TRUE);
3343 			} else {
3344 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3345 			}
3346 			new_mapping_established = TRUE;
3347 			RETURN(KERN_SUCCESS);
3348 		}
3349 	}
3350 
3351 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3352 	new_entry = NULL;
3353 
3354 	if (vmk_flags.vmkf_submap_adjust) {
3355 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3356 		offset = start;
3357 	}
3358 
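	/*
	 * Create the new map entries.  Each outer iteration covers one
	 * superpage (for superpage mappings) or the whole range; the inner
	 * do/while further splits large anonymous allocations into
	 * "chunk_size" pieces, each with its own map entry (and, eventually,
	 * its own VM object).
	 */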
3359 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3360 		tmp2_end = tmp2_start + step;
3361 		/*
3362 		 *	Create a new entry
3363 		 *
3364 		 * XXX FBDP
3365 		 * The reserved "page zero" in each process's address space can
3366 		 * be arbitrarily large.  Splitting it into separate objects and
3367 		 * therefore different VM map entries serves no purpose and just
3368 		 * slows down operations on the VM map, so let's not split the
3369 		 * allocation into chunks if the max protection is NONE.  That
3370 		 * memory should never be accessible, so it will never get to the
3371 		 * default pager.
3372 		 */
3373 		tmp_start = tmp2_start;
3374 		if (!is_submap &&
3375 		    object == VM_OBJECT_NULL &&
3376 		    size > chunk_size &&
3377 		    max_protection != VM_PROT_NONE &&
3378 		    superpage_size == 0) {
3379 			tmp_end = tmp_start + chunk_size;
3380 		} else {
3381 			tmp_end = tmp2_end;
3382 		}
3383 		do {
3384 			if (!is_submap &&
3385 			    object != VM_OBJECT_NULL &&
3386 			    object->internal &&
3387 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3388 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3389 				DTRACE_VM5(vm_map_enter_overmap,
3390 				    vm_map_t, map,
3391 				    vm_map_address_t, tmp_start,
3392 				    vm_map_address_t, tmp_end,
3393 				    vm_object_offset_t, offset,
3394 				    vm_object_size_t, object->vo_size);
3395 			}
3396 			new_entry = vm_map_entry_insert(map,
3397 			    entry, tmp_start, tmp_end,
3398 			    object, offset, vmk_flags,
3399 			    needs_copy,
3400 			    cur_protection, max_protection,
3401 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3402 			    VM_INHERIT_NONE : inheritance),
3403 			    clear_map_aligned);
3404 
3405 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3406 
3407 			if (resilient_codesign) {
3408 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3409 				if (!((cur_protection | max_protection) & reject_prot)) {
3410 					new_entry->vme_resilient_codesign = TRUE;
3411 				}
3412 			}
3413 
3414 			if (resilient_media &&
3415 			    (object == VM_OBJECT_NULL ||
3416 			    object->internal)) {
3417 				new_entry->vme_resilient_media = TRUE;
3418 			}
3419 
3420 			assert(!new_entry->iokit_acct);
3421 			if (!is_submap &&
3422 			    object != VM_OBJECT_NULL &&
3423 			    object->internal &&
3424 			    (object->purgable != VM_PURGABLE_DENY ||
3425 			    object->vo_ledger_tag)) {
3426 				assert(new_entry->use_pmap);
3427 				assert(!new_entry->iokit_acct);
3428 				/*
3429 				 * Turn off pmap accounting since
3430 				 * purgeable (or tagged) objects have their
3431 				 * own ledgers.
3432 				 */
3433 				new_entry->use_pmap = FALSE;
3434 			} else if (!is_submap &&
3435 			    iokit_acct &&
3436 			    object != VM_OBJECT_NULL &&
3437 			    object->internal) {
3438 				/* alternate accounting */
3439 				assert(!new_entry->iokit_acct);
3440 				assert(new_entry->use_pmap);
3441 				new_entry->iokit_acct = TRUE;
3442 				new_entry->use_pmap = FALSE;
3443 				DTRACE_VM4(
3444 					vm_map_iokit_mapped_region,
3445 					vm_map_t, map,
3446 					vm_map_offset_t, new_entry->vme_start,
3447 					vm_map_offset_t, new_entry->vme_end,
3448 					int, VME_ALIAS(new_entry));
3449 				vm_map_iokit_mapped_region(
3450 					map,
3451 					(new_entry->vme_end -
3452 					new_entry->vme_start));
3453 			} else if (!is_submap) {
3454 				assert(!new_entry->iokit_acct);
3455 				assert(new_entry->use_pmap);
3456 			}
3457 
3458 			if (is_submap) {
3459 				vm_map_t        submap;
3460 				boolean_t       submap_is_64bit;
3461 				boolean_t       use_pmap;
3462 
3463 				assert(new_entry->is_sub_map);
3464 				assert(!new_entry->use_pmap);
3465 				assert(!new_entry->iokit_acct);
3466 				submap = (vm_map_t) object;
3467 				submap_is_64bit = vm_map_is_64bit(submap);
3468 				use_pmap = vmk_flags.vmkf_nested_pmap;
3469 #ifndef NO_NESTED_PMAP
3470 				if (use_pmap && submap->pmap == NULL) {
3471 					ledger_t ledger = map->pmap->ledger;
3472 					/* we need a sub pmap to nest... */
3473 					submap->pmap = pmap_create_options(ledger, 0,
3474 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3475 					if (submap->pmap == NULL) {
3476 						/* let's proceed without nesting... */
3477 					}
3478 #if defined(__arm64__)
3479 					else {
3480 						pmap_set_nested(submap->pmap);
3481 					}
3482 #endif
3483 				}
3484 				if (use_pmap && submap->pmap != NULL) {
3485 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3486 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3487 						kr = KERN_FAILURE;
3488 					} else {
3489 						kr = pmap_nest(map->pmap,
3490 						    submap->pmap,
3491 						    tmp_start,
3492 						    tmp_end - tmp_start);
3493 					}
3494 					if (kr != KERN_SUCCESS) {
3495 						printf("vm_map_enter: "
3496 						    "pmap_nest(0x%llx,0x%llx) "
3497 						    "error 0x%x\n",
3498 						    (long long)tmp_start,
3499 						    (long long)tmp_end,
3500 						    kr);
3501 					} else {
3502 						/* we're now nested ! */
3503 						new_entry->use_pmap = TRUE;
3504 						pmap_empty = FALSE;
3505 					}
3506 				}
3507 #endif /* NO_NESTED_PMAP */
3508 			}
3509 			entry = new_entry;
3510 
3511 			if (superpage_size) {
3512 				vm_page_t pages, m;
3513 				vm_object_t sp_object;
3514 				vm_object_offset_t sp_offset;
3515 
3516 				assert(object == VM_OBJECT_NULL);
3517 				VME_OFFSET_SET(entry, 0);
3518 
3519 				/* allocate one superpage */
3520 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3521 				if (kr != KERN_SUCCESS) {
3522 					/* deallocate whole range... */
3523 					new_mapping_established = TRUE;
3524 					/* ... but only up to "tmp_end" */
3525 					size -= end - tmp_end;
3526 					RETURN(kr);
3527 				}
3528 
3529 				/* create one vm_object per superpage */
3530 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3531 				vm_object_lock(sp_object);
3532 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3533 				VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
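				/*
				 * For a physically-contiguous object,
				 * vo_shadow_offset records the physical base
				 * address of the superpage.
				 */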
3534 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3535 				VME_OBJECT_SET(entry, sp_object, false, 0);
3536 				assert(entry->use_pmap);
3537 
3538 				/* enter the base pages into the object */
3539 				for (sp_offset = 0;
3540 				    sp_offset < SUPERPAGE_SIZE;
3541 				    sp_offset += PAGE_SIZE) {
3542 					m = pages;
3543 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3544 					pages = NEXT_PAGE(m);
3545 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3546 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3547 				}
3548 				vm_object_unlock(sp_object);
3549 			}
3550 		} while (tmp_end != tmp2_end &&
3551 		    (tmp_start = tmp_end) &&
3552 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3553 		    tmp_end + chunk_size : tmp2_end));
3554 	}
3555 
3556 	new_mapping_established = TRUE;
3557 
3558 BailOut:
3559 	assert(map_locked == TRUE);
3560 
3561 	/*
3562 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3563 	 * If we have identified and possibly established the new mapping(s),
3564 	 * make sure we did not go beyond the address space limit.
3565 	 */
3566 	if (result == KERN_SUCCESS) {
3567 		if (map->size_limit != RLIM_INFINITY &&
3568 		    map->size > map->size_limit) {
3569 			/*
3570 			 * Establishing the requested mappings would exceed
3571 			 * the process's RLIMIT_AS limit: fail with
3572 			 * KERN_NO_SPACE.
3573 			 */
3574 			result = KERN_NO_SPACE;
3575 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3576 			    proc_selfpid(),
3577 			    (get_bsdtask_info(current_task())
3578 			    ? proc_name_address(get_bsdtask_info(current_task()))
3579 			    : "?"),
3580 			    __FUNCTION__,
3581 			    (uint64_t) map->size,
3582 			    (uint64_t) map->size_limit);
3583 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3584 			    vm_map_size_t, map->size,
3585 			    uint64_t, map->size_limit);
3586 			vm_map_enter_RLIMIT_AS_count++;
3587 		} else if (map->data_limit != RLIM_INFINITY &&
3588 		    map->size > map->data_limit) {
3589 			/*
3590 			 * Establishing the requested mappings would exceed
3591 			 * the process's RLIMIT_DATA limit: fail with
3592 			 * KERN_NO_SPACE.
3593 			 */
3594 			result = KERN_NO_SPACE;
3595 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3596 			    proc_selfpid(),
3597 			    (get_bsdtask_info(current_task())
3598 			    ? proc_name_address(get_bsdtask_info(current_task()))
3599 			    : "?"),
3600 			    __FUNCTION__,
3601 			    (uint64_t) map->size,
3602 			    (uint64_t) map->data_limit);
3603 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3604 			    vm_map_size_t, map->size,
3605 			    uint64_t, map->data_limit);
3606 			vm_map_enter_RLIMIT_DATA_count++;
3607 		}
3608 	}
3609 
3610 	if (result == KERN_SUCCESS) {
3611 		vm_prot_t pager_prot;
3612 		memory_object_t pager;
3613 
3614 #if DEBUG
3615 		if (pmap_empty &&
3616 		    !(vmk_flags.vmkf_no_pmap_check)) {
3617 			assert(pmap_is_empty(map->pmap,
3618 			    *address,
3619 			    *address + size));
3620 		}
3621 #endif /* DEBUG */
3622 
3623 		/*
3624 		 * For "named" VM objects, let the pager know that the
3625 		 * memory object is being mapped.  Some pagers need to keep
3626 		 * track of this, to know when they can reclaim the memory
3627 		 * object, for example.
3628 		 * VM calls memory_object_map() for each mapping (specifying
3629 		 * the protection of each mapping) and calls
3630 		 * memory_object_last_unmap() when all the mappings are gone.
3631 		 */
3632 		pager_prot = max_protection;
3633 		if (needs_copy) {
3634 			/*
3635 			 * Copy-On-Write mapping: won't modify
3636 			 * the memory object.
3637 			 */
3638 			pager_prot &= ~VM_PROT_WRITE;
3639 		}
3640 		if (!is_submap &&
3641 		    object != VM_OBJECT_NULL &&
3642 		    object->named &&
3643 		    object->pager != MEMORY_OBJECT_NULL) {
3644 			vm_object_lock(object);
3645 			pager = object->pager;
3646 			if (object->named &&
3647 			    pager != MEMORY_OBJECT_NULL) {
3648 				assert(object->pager_ready);
3649 				vm_object_mapping_wait(object, THREAD_UNINT);
3650 				/* object might have lost its pager while waiting */
3651 				pager = object->pager;
3652 				if (object->named && pager != MEMORY_OBJECT_NULL) {
3653 					vm_object_mapping_begin(object);
3654 					vm_object_unlock(object);
3655 
3656 					kr = memory_object_map(pager, pager_prot);
3657 					assert(kr == KERN_SUCCESS);
3658 
3659 					vm_object_lock(object);
3660 					vm_object_mapping_end(object);
3661 				}
3662 			}
3663 			vm_object_unlock(object);
3664 		}
3665 	}
3666 
3667 	assert(map_locked == TRUE);
3668 
3669 	if (new_mapping_established) {
3670 		/*
3671 		 * If we release the map lock for any reason below,
3672 		 * another thread could deallocate our new mapping,
3673 		 * releasing the caller's reference on "caller_object",
3674 		 * which was transferred to the mapping.
3675 		 * If this was the only reference, the object could be
3676 		 * destroyed.
3677 		 *
3678 		 * We need to take an extra reference on "caller_object"
3679 		 * to keep it alive if we need to return the caller's
3680 		 * reference to the caller in case of failure.
3681 		 */
3682 		if (is_submap) {
3683 			vm_map_reference((vm_map_t)caller_object);
3684 		} else {
3685 			vm_object_reference(caller_object);
3686 		}
3687 	}
3688 
3689 	if (!keep_map_locked) {
3690 		vm_map_unlock(map);
3691 		map_locked = FALSE;
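		/*
		 * Once the map is unlocked, these entries could be modified
		 * or freed by another thread: stop referring to them.
		 */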
3692 		entry = VM_MAP_ENTRY_NULL;
3693 		new_entry = VM_MAP_ENTRY_NULL;
3694 	}
3695 
3696 	/*
3697 	 * We can't hold the map lock if we enter this block.
3698 	 */
3699 
3700 	if (result == KERN_SUCCESS) {
3701 		/*	Wire down the new entry if the user
3702 		 *	requested all new map entries be wired.
3703 		 */
3704 		if ((map->wiring_required) || (superpage_size)) {
3705 			assert(!keep_map_locked);
3706 			pmap_empty = FALSE; /* pmap won't be empty */
3707 			kr = vm_map_wire_nested(map, start, end,
3708 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3709 			    TRUE, PMAP_NULL, 0, NULL);
3710 			result = kr;
3711 		}
3712 
3713 	}
3714 
3715 	if (result != KERN_SUCCESS) {
3716 		if (new_mapping_established) {
3717 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3718 
3719 			/*
3720 			 * We have to get rid of the new mappings since we
3721 			 * won't make them available to the user.
3722 			 * Try to do that atomically, to minimize the risk
3723 			 * that someone else creates new mappings in that range.
3724 			 */
3725 			if (!map_locked) {
3726 				vm_map_lock(map);
3727 				map_locked = TRUE;
3728 			}
3729 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3730 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
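			/*
			 * The entries we just created may have been marked
			 * "permanent"; allow this cleanup to remove them
			 * anyway.
			 */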
3731 			if (permanent) {
3732 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3733 			}
3734 			(void) vm_map_delete(map,
3735 			    *address, *address + size,
3736 			    remove_flags,
3737 			    KMEM_GUARD_NONE, &zap_new_list);
3738 		}
3739 
3740 		if (vm_map_zap_first_entry(&zap_old_list)) {
3741 			vm_map_entry_t entry1, entry2;
3742 
3743 			/*
3744 			 * The new mapping failed.  Attempt to restore
3745 			 * the old mappings, saved in "zap_old_list".
3746 			 */
3747 			if (!map_locked) {
3748 				vm_map_lock(map);
3749 				map_locked = TRUE;
3750 			}
3751 
3752 			/* first check if the coast is still clear */
3753 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3754 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3755 
3756 			if (vm_map_lookup_entry(map, start, &entry1) ||
3757 			    vm_map_lookup_entry(map, end, &entry2) ||
3758 			    entry1 != entry2) {
3759 				/*
3760 				 * Part of that range has already been
3761 				 * re-mapped:  we can't restore the old
3762 				 * mappings...
3763 				 */
3764 				vm_map_enter_restore_failures++;
3765 			} else {
3766 				/*
3767 				 * Transfer the saved map entries from
3768 				 * "zap_old_list" to the original "map",
3769 				 * inserting them all after "entry1".
3770 				 */
3771 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3772 					vm_map_size_t entry_size;
3773 
3774 					entry_size = (entry2->vme_end -
3775 					    entry2->vme_start);
3776 					vm_map_store_entry_link(map, entry1, entry2,
3777 					    VM_MAP_KERNEL_FLAGS_NONE);
3778 					map->size += entry_size;
3779 					entry1 = entry2;
3780 				}
3781 				if (map->wiring_required) {
3782 					/*
3783 					 * XXX TODO: we should rewire the
3784 					 * old pages here...
3785 					 */
3786 				}
3787 				vm_map_enter_restore_successes++;
3788 			}
3789 		}
3790 	}
3791 
3792 	/*
3793 	 * The caller is responsible for releasing the lock if it requested to
3794 	 * keep the map locked.
3795 	 */
3796 	if (map_locked && !keep_map_locked) {
3797 		vm_map_unlock(map);
3798 	}
3799 
3800 	vm_map_zap_dispose(&zap_old_list);
3801 	vm_map_zap_dispose(&zap_new_list);
3802 
3803 	if (new_mapping_established) {
3804 		/*
3805 		 * The caller had a reference on "caller_object" and we
3806 		 * transferred that reference to the mapping.
3807 		 * We also took an extra reference on "caller_object" to keep
3808 		 * it alive while the map was unlocked.
3809 		 */
3810 		if (result == KERN_SUCCESS) {
3811 			/*
3812 			 * On success, the caller's reference on the object gets
3813 			 * transferred to the mapping.
3814 			 * Release our extra reference.
3815 			 */
3816 			if (is_submap) {
3817 				vm_map_deallocate((vm_map_t)caller_object);
3818 			} else {
3819 				vm_object_deallocate(caller_object);
3820 			}
3821 		} else {
3822 			/*
3823 			 * On error, the caller expects to still have a
3824 			 * reference on the object it gave us.
3825 			 * Let's use our extra reference for that.
3826 			 */
3827 		}
3828 	}
3829 
3830 	return result;
3831 
3832 #undef  RETURN
3833 }
3834 
3835 /*
3836  * Counters for the prefault optimization.
3837  */
3838 int64_t vm_prefault_nb_pages = 0;
3839 int64_t vm_prefault_nb_bailout = 0;
3840 
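/*
 * Shift an [offset, end) window by "quantity", failing if either addition
 * overflows or if the rounded-up end wraps around to 0.
 */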
3841 static kern_return_t
3842 vm_map_enter_adjust_offset(
3843 	vm_object_offset_t *obj_offs,
3844 	vm_object_offset_t *obj_end,
3845 	vm_object_offset_t  quantity)
3846 {
3847 	if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3848 	    os_add_overflow(*obj_end, quantity, obj_end) ||
3849 	    vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3850 		return KERN_INVALID_ARGUMENT;
3851 	}
3852 
3853 	return KERN_SUCCESS;
3854 }
3855 
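/*
 * Validate and convert the caller-provided (unsafe) address, size, mask,
 * offset, protections and inheritance into trusted values, in both map
 * space and VM object space.
 */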
3856 static __attribute__((always_inline, warn_unused_result))
3857 kern_return_t
3858 vm_map_enter_mem_object_sanitize(
3859 	vm_map_t                target_map,
3860 	vm_map_offset_ut        address_u,
3861 	vm_map_size_ut          initial_size_u,
3862 	vm_map_offset_ut        mask_u,
3863 	vm_object_offset_ut     offset_u,
3864 	vm_prot_ut              cur_protection_u,
3865 	vm_prot_ut              max_protection_u,
3866 	vm_inherit_ut           inheritance_u,
3867 	vm_map_kernel_flags_t   vmk_flags,
3868 	ipc_port_t              port,
3869 	vm_map_address_t       *map_addr,
3870 	vm_map_size_t          *map_size,
3871 	vm_map_offset_t        *mask,
3872 	vm_object_offset_t     *obj_offs,
3873 	vm_object_offset_t     *obj_end,
3874 	vm_object_size_t       *obj_size,
3875 	vm_prot_t              *cur_protection,
3876 	vm_prot_t              *max_protection,
3877 	vm_inherit_t           *inheritance)
3878 {
3879 	kern_return_t           result;
3880 
3881 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3882 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3883 	    VM_PROT_IS_MASK, cur_protection,
3884 	    max_protection);
3885 	if (__improbable(result != KERN_SUCCESS)) {
3886 		return result;
3887 	}
3888 
3889 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3890 	    inheritance);
3891 	if (__improbable(result != KERN_SUCCESS)) {
3892 		return result;
3893 	}
3894 
3895 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3896 	if (__improbable(result != KERN_SUCCESS)) {
3897 		return result;
3898 	}
3899 
3900 	if (vmk_flags.vmf_fixed) {
3901 		vm_map_address_t        map_end;
3902 
3903 		result = vm_sanitize_addr_size(address_u, initial_size_u,
3904 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3905 		    target_map,
3906 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3907 		    map_addr, &map_end, map_size);
3908 		if (__improbable(result != KERN_SUCCESS)) {
3909 			return result;
3910 		}
3911 	} else {
3912 		*map_addr = vm_sanitize_addr(target_map, address_u);
3913 		result = vm_sanitize_size(0, initial_size_u,
3914 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3915 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3916 		if (__improbable(result != KERN_SUCCESS)) {
3917 			return result;
3918 		}
3919 	}
3920 
3921 	*obj_size = vm_object_round_page(*map_size);
3922 	if (__improbable(*obj_size == 0)) {
3923 		return KERN_INVALID_ARGUMENT;
3924 	}
3925 
3926 	if (IP_VALID(port)) {
3927 		result = vm_sanitize_addr_size(offset_u, *obj_size,
3928 		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3929 		    PAGE_MASK,
3930 		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
3931 		    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
3932 		    obj_offs, obj_end, obj_size);
3933 		if (__improbable(result != KERN_SUCCESS)) {
3934 			return result;
3935 		}
3936 	} else {
3937 		*obj_offs = 0;
3938 		*obj_end  = *obj_size;
3939 	}
3940 
3941 	return KERN_SUCCESS;
3942 }
3943 
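/*
 * Map the VM object, submap or vm_map_copy named by "port" (or anonymous
 * memory if "port" is invalid) into "target_map", optionally copying it
 * and optionally prefaulting the pages described by "page_list".
 */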
3944 kern_return_t
3945 vm_map_enter_mem_object(
3946 	vm_map_t                target_map,
3947 	vm_map_offset_ut       *address_u,
3948 	vm_map_size_ut          initial_size_u,
3949 	vm_map_offset_ut        mask_u,
3950 	vm_map_kernel_flags_t   vmk_flags,
3951 	ipc_port_t              port,
3952 	vm_object_offset_ut     offset_u,
3953 	boolean_t               copy,
3954 	vm_prot_ut              cur_protection_u,
3955 	vm_prot_ut              max_protection_u,
3956 	vm_inherit_ut           inheritance_u,
3957 	upl_page_list_ptr_t     page_list,
3958 	unsigned int            page_list_count)
3959 {
3960 	vm_map_offset_t         mask;
3961 	vm_prot_t               cur_protection;
3962 	vm_prot_t               max_protection;
3963 	vm_inherit_t            inheritance;
3964 	vm_map_address_t        map_addr, map_mask;
3965 	vm_map_size_t           map_size;
3966 	vm_object_t             object = VM_OBJECT_NULL;
3967 	vm_object_offset_t      obj_offs, obj_end;
3968 	vm_object_size_t        obj_size;
3969 	kern_return_t           result;
3970 	boolean_t               mask_cur_protection, mask_max_protection;
3971 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
3972 	vm_map_offset_t         offset_in_mapping = 0;
3973 
3974 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3975 		/* XXX TODO4K prefaulting depends on page size... */
3976 		try_prefault = FALSE;
3977 	}
3978 
3979 	/*
3980 	 * Check arguments for validity
3981 	 */
3982 	if ((target_map == VM_MAP_NULL) ||
3983 	    (try_prefault && (copy || !page_list))) {
3984 		return KERN_INVALID_ARGUMENT;
3985 	}
3986 
3987 	map_mask = vm_map_page_mask(target_map);
3988 
3989 	/*
3990 	 * Sanitize any input parameters that are addr/size/prot/inherit
3991 	 */
3992 	result = vm_map_enter_mem_object_sanitize(
3993 		target_map,
3994 		*address_u,
3995 		initial_size_u,
3996 		mask_u,
3997 		offset_u,
3998 		cur_protection_u,
3999 		max_protection_u,
4000 		inheritance_u,
4001 		vmk_flags,
4002 		port,
4003 		&map_addr,
4004 		&map_size,
4005 		&mask,
4006 		&obj_offs,
4007 		&obj_end,
4008 		&obj_size,
4009 		&cur_protection,
4010 		&max_protection,
4011 		&inheritance);
4012 	if (__improbable(result != KERN_SUCCESS)) {
4013 		return vm_sanitize_get_kr(result);
4014 	}
4015 
4016 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4017 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4018 
4019 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4020 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4021 	cur_protection &= ~VM_PROT_IS_MASK;
4022 	max_protection &= ~VM_PROT_IS_MASK;
4023 
4024 #if __arm64__
4025 	if (cur_protection & VM_PROT_EXECUTE) {
4026 		cur_protection |= VM_PROT_READ;
4027 	}
4028 #endif /* __arm64__ */
4029 
4030 	/*
4031 	 * Find the vm object (if any) corresponding to this port.
4032 	 */
4033 	if (!IP_VALID(port)) {
4034 		object = VM_OBJECT_NULL;
4035 		copy = FALSE;
4036 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4037 		vm_named_entry_t        named_entry;
4038 		vm_object_size_t        initial_size;
4039 
4040 		named_entry = mach_memory_entry_from_port(port);
4041 
4042 		if (vmk_flags.vmf_return_data_addr ||
4043 		    vmk_flags.vmf_return_4k_data_addr) {
4044 			result = vm_map_enter_adjust_offset(&obj_offs,
4045 			    &obj_end, named_entry->data_offset);
4046 			if (__improbable(result)) {
4047 				return result;
4048 			}
4049 		}
4050 
4051 		/* a few checks to make sure user is obeying rules */
4052 		if (mask_max_protection) {
4053 			max_protection &= named_entry->protection;
4054 		}
4055 		if (mask_cur_protection) {
4056 			cur_protection &= named_entry->protection;
4057 		}
4058 		if ((named_entry->protection & max_protection) !=
4059 		    max_protection) {
4060 			return KERN_INVALID_RIGHT;
4061 		}
4062 		if ((named_entry->protection & cur_protection) !=
4063 		    cur_protection) {
4064 			return KERN_INVALID_RIGHT;
4065 		}
4066 
4067 		/*
4068 		 * unwrapping is safe: "initial_size" is no larger than
4069 		 * "obj_size", which has already been validated for overflow
4070 		 */
4071 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4072 		if (named_entry->size < obj_offs + initial_size) {
4073 			return KERN_INVALID_ARGUMENT;
4074 		}
4075 
4076 		/* for a vm_map_copy, we can only map it whole */
4077 		if (named_entry->is_copy &&
4078 		    (obj_size != named_entry->size) &&
4079 		    (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4080 			/* XXX FBDP use the rounded size... */
4081 			obj_end += named_entry->size - obj_size;
4082 			obj_size = named_entry->size;
4083 		}
4084 
4085 		if (named_entry->offset) {
4086 			/*
4087 			 * the caller's "offset" parameter is defined to be the
4088 			 * offset from the beginning of the named entry's offset in the object
4089 			 *
4090 			 * Because we checked above that
4091 			 *   obj_offs + obj_size < named_entry_size
4092 			 * these overflow checks should be redundant...
4093 			 */
4094 			result = vm_map_enter_adjust_offset(&obj_offs,
4095 			    &obj_end, named_entry->offset);
4096 			if (__improbable(result)) {
4097 				return result;
4098 			}
4099 		}
4100 
4101 		if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4102 			/*
4103 			 * Let's not map more than requested;
4104 			 * vm_map_enter() will handle this "not map-aligned"
4105 			 * case.
4106 			 */
4107 			map_size = obj_size;
4108 		}
4109 
4110 		named_entry_lock(named_entry);
4111 
4112 		// rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4113 		assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4114 
4115 		if (named_entry->is_sub_map) {
4116 			vm_map_t                submap;
4117 
4118 			assert(!named_entry->is_copy);
4119 			assert(!named_entry->is_object);
4120 
4121 			if (vmk_flags.vmf_return_data_addr ||
4122 			    vmk_flags.vmf_return_4k_data_addr) {
4123 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4124 			}
4125 
4126 			submap = named_entry->backing.map;
4127 			vm_map_reference(submap);
4128 			named_entry_unlock(named_entry);
4129 
4130 			vmk_flags.vmkf_submap = TRUE;
4131 			result = vm_map_enter(target_map,
4132 			    &map_addr,
4133 			    map_size,
4134 			    mask,
4135 			    vmk_flags,
4136 			    (vm_object_t)(uintptr_t) submap,
4137 			    obj_offs,
4138 			    copy,
4139 			    cur_protection,
4140 			    max_protection,
4141 			    inheritance);
4142 			if (result != KERN_SUCCESS) {
4143 				vm_map_deallocate(submap);
4144 				return result;
4145 			}
4146 			/*
4147 			 * No need to lock "submap" just to check its
4148 			 * "mapped" flag: that flag is never reset
4149 			 * once it's been set and if we race, we'll
4150 			 * just end up setting it twice, which is OK.
4151 			 */
4152 			if (submap->mapped_in_other_pmaps == FALSE &&
4153 			    vm_map_pmap(submap) != PMAP_NULL &&
4154 			    vm_map_pmap(submap) !=
4155 			    vm_map_pmap(target_map)) {
4156 				/*
4157 				 * This submap is being mapped in a map
4158 				 * that uses a different pmap.
4159 				 * Set its "mapped_in_other_pmaps" flag
4160 				 * to indicate that we now need to
4161 				 * remove mappings from all pmaps rather
4162 				 * than just the submap's pmap.
4163 				 */
4164 				vm_map_lock(submap);
4165 				submap->mapped_in_other_pmaps = TRUE;
4166 				vm_map_unlock(submap);
4167 			}
4168 			goto out;
4169 		}
4170 
4171 		if (named_entry->is_copy) {
4172 			kern_return_t   kr;
4173 			vm_map_copy_t   copy_map;
4174 			vm_map_entry_t  copy_entry;
4175 			vm_map_offset_t copy_addr;
4176 			vm_map_copy_t   target_copy_map;
4177 			vm_map_offset_t overmap_start, overmap_end;
4178 			vm_map_offset_t trimmed_start;
4179 			vm_map_size_t   target_size;
4180 
4181 			assert(!named_entry->is_object);
4182 			assert(!named_entry->is_sub_map);
4183 
4184 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4185 			    (VM_FLAGS_FIXED |
4186 			    VM_FLAGS_ANYWHERE |
4187 			    VM_FLAGS_OVERWRITE |
4188 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4189 			    VM_FLAGS_RETURN_DATA_ADDR))) {
4190 				named_entry_unlock(named_entry);
4191 				return KERN_INVALID_ARGUMENT;
4192 			}
4193 
4194 			copy_map = named_entry->backing.copy;
4195 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4196 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4197 				/* unsupported type; should not happen */
4198 				printf("vm_map_enter_mem_object: "
4199 				    "memory_entry->backing.copy "
4200 				    "unsupported type 0x%x\n",
4201 				    copy_map->type);
4202 				named_entry_unlock(named_entry);
4203 				return KERN_INVALID_ARGUMENT;
4204 			}
4205 
4206 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4207 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4208 			}
4209 
4210 			if (vmk_flags.vmf_return_data_addr ||
4211 			    vmk_flags.vmf_return_4k_data_addr) {
4212 				offset_in_mapping = obj_offs & map_mask;
4213 				if (vmk_flags.vmf_return_4k_data_addr) {
4214 					offset_in_mapping &= ~((signed)(0xFFF));
4215 				}
4216 			}
4217 
4218 			target_copy_map = VM_MAP_COPY_NULL;
4219 			target_size = copy_map->size;
4220 			overmap_start = 0;
4221 			overmap_end = 0;
4222 			trimmed_start = 0;
4223 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4224 				DEBUG4K_ADJUST("adjusting...\n");
4225 				kr = vm_map_copy_adjust_to_target(
4226 					copy_map,
4227 					obj_offs,
4228 					initial_size,
4229 					target_map,
4230 					copy,
4231 					&target_copy_map,
4232 					&overmap_start,
4233 					&overmap_end,
4234 					&trimmed_start);
4235 				if (kr != KERN_SUCCESS) {
4236 					named_entry_unlock(named_entry);
4237 					return kr;
4238 				}
4239 				target_size = target_copy_map->size;
4240 			} else {
4241 				/*
4242 				 * Assert that the vm_map_copy is coming from the right
4243 				 * zone and hasn't been forged
4244 				 */
4245 				vm_map_copy_require(copy_map);
4246 				target_copy_map = copy_map;
4247 			}
4248 
4249 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4250 
4251 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4252 			    (VM_FLAGS_FIXED |
4253 			    VM_FLAGS_ANYWHERE |
4254 			    VM_FLAGS_OVERWRITE |
4255 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4256 			    VM_FLAGS_RETURN_DATA_ADDR));
4257 
4258 			/* reserve a contiguous range */
4259 			kr = vm_map_enter(target_map,
4260 			    &map_addr,
4261 			    vm_map_round_page(target_size, map_mask),
4262 			    mask,
4263 			    rsv_flags,
4264 			    VM_OBJECT_NULL,
4265 			    0,
4266 			    FALSE,               /* copy */
4267 			    cur_protection,
4268 			    max_protection,
4269 			    inheritance);
4270 			if (kr != KERN_SUCCESS) {
4271 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4272 				if (target_copy_map != copy_map) {
4273 					vm_map_copy_discard(target_copy_map);
4274 					target_copy_map = VM_MAP_COPY_NULL;
4275 				}
4276 				named_entry_unlock(named_entry);
4277 				return kr;
4278 			}
4279 
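			/*
			 * Map each entry of the (possibly adjusted) vm_map_copy
			 * on top of the range we just reserved, one copy entry
			 * at a time.
			 */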
4280 			copy_addr = map_addr;
4281 
4282 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4283 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4284 			    copy_entry = copy_entry->vme_next) {
4285 				vm_map_t                copy_submap = VM_MAP_NULL;
4286 				vm_object_t             copy_object = VM_OBJECT_NULL;
4287 				vm_map_size_t           copy_size;
4288 				vm_object_offset_t      copy_offset;
4289 				boolean_t               do_copy = false;
4290 
4291 				if (copy_entry->is_sub_map) {
4292 					copy_submap = VME_SUBMAP(copy_entry);
4293 					copy_object = (vm_object_t)copy_submap;
4294 				} else {
4295 					copy_object = VME_OBJECT(copy_entry);
4296 				}
4297 				copy_offset = VME_OFFSET(copy_entry);
4298 				copy_size = (copy_entry->vme_end -
4299 				    copy_entry->vme_start);
4300 
4301 				/* sanity check */
4302 				if ((copy_addr + copy_size) >
4303 				    (map_addr +
4304 				    overmap_start + overmap_end +
4305 				    named_entry->size /* XXX full size */)) {
4306 					/* over-mapping too much !? */
4307 					kr = KERN_INVALID_ARGUMENT;
4308 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4309 					/* abort */
4310 					break;
4311 				}
4312 
4313 				/* take a reference on the object */
4314 				if (copy_entry->is_sub_map) {
4315 					vm_map_reference(copy_submap);
4316 				} else {
4317 					if (!copy &&
4318 					    copy_object != VM_OBJECT_NULL &&
4319 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4320 						bool is_writable;
4321 
4322 						/*
4323 						 * We need to resolve our side of this
4324 						 * "symmetric" copy-on-write now; we
4325 						 * need a new object to map and share,
4326 						 * instead of the current one which
4327 						 * might still be shared with the
4328 						 * original mapping.
4329 						 *
4330 						 * Note: A "vm_map_copy_t" does not
4331 						 * have a lock but we're protected by
4332 						 * the named entry's lock here.
4333 						 */
4334 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4335 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4336 						assert(copy_object != VME_OBJECT(copy_entry));
4337 						is_writable = false;
4338 						if (copy_entry->protection & VM_PROT_WRITE) {
4339 							is_writable = true;
4340 #if __arm64e__
4341 						} else if (copy_entry->used_for_tpro) {
4342 							is_writable = true;
4343 #endif /* __arm64e__ */
4344 						}
4345 						if (!copy_entry->needs_copy && is_writable) {
4346 							vm_prot_t prot;
4347 
4348 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4349 							vm_object_pmap_protect(copy_object,
4350 							    copy_offset,
4351 							    copy_size,
4352 							    PMAP_NULL,
4353 							    PAGE_SIZE,
4354 							    0,
4355 							    prot);
4356 						}
4357 						copy_entry->needs_copy = FALSE;
4358 						copy_entry->is_shared = TRUE;
4359 						copy_object = VME_OBJECT(copy_entry);
4360 						copy_offset = VME_OFFSET(copy_entry);
4361 						vm_object_lock(copy_object);
4362 						/* we're about to make a shared mapping of this object */
4363 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4364 						VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4365 						vm_object_unlock(copy_object);
4366 					}
4367 
4368 					if (copy_object != VM_OBJECT_NULL &&
4369 					    copy_object->named &&
4370 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4371 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4372 						memory_object_t pager;
4373 						vm_prot_t       pager_prot;
4374 
4375 						/*
4376 						 * For "named" VM objects, let the pager know that the
4377 						 * memory object is being mapped.  Some pagers need to keep
4378 						 * track of this, to know when they can reclaim the memory
4379 						 * object, for example.
4380 						 * VM calls memory_object_map() for each mapping (specifying
4381 						 * the protection of each mapping) and calls
4382 						 * memory_object_last_unmap() when all the mappings are gone.
4383 						 */
4384 						pager_prot = max_protection;
4385 						if (copy) {
4386 							/*
4387 							 * Copy-On-Write mapping: won't modify the
4388 							 * memory object.
4389 							 */
4390 							pager_prot &= ~VM_PROT_WRITE;
4391 						}
4392 						vm_object_lock(copy_object);
4393 						pager = copy_object->pager;
4394 						if (copy_object->named &&
4395 						    pager != MEMORY_OBJECT_NULL &&
4396 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4397 							assert(copy_object->pager_ready);
4398 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4399 							/*
4400 							 * Object might have lost its pager
4401 							 * while waiting.
4402 							 */
4403 							pager = copy_object->pager;
4404 							if (copy_object->named &&
4405 							    pager != MEMORY_OBJECT_NULL) {
4406 								vm_object_mapping_begin(copy_object);
4407 								vm_object_unlock(copy_object);
4408 
4409 								kr = memory_object_map(pager, pager_prot);
4410 								assert(kr == KERN_SUCCESS);
4411 
4412 								vm_object_lock(copy_object);
4413 								vm_object_mapping_end(copy_object);
4414 							}
4415 						}
4416 						vm_object_unlock(copy_object);
4417 					}
4418 
4419 					/*
4420 					 *	Perform the copy if requested
4421 					 */
4422 
4423 					if (copy && copy_object != VM_OBJECT_NULL) {
4424 						vm_object_t             new_object;
4425 						vm_object_offset_t      new_offset;
4426 
4427 						result = vm_object_copy_strategically(copy_object, copy_offset,
4428 						    copy_size,
4429 						    false,                                   /* forking */
4430 						    &new_object, &new_offset,
4431 						    &do_copy);
4432 
4433 
4434 						if (result == KERN_MEMORY_RESTART_COPY) {
4435 							boolean_t success;
4436 							boolean_t src_needs_copy;
4437 
4438 							/*
4439 							 * XXX
4440 							 * We currently ignore src_needs_copy.
4441 							 * This really is the issue of how to make
4442 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4443 							 * non-kernel users to use. Solution forthcoming.
4444 							 * In the meantime, since we don't allow non-kernel
4445 							 * memory managers to specify symmetric copy,
4446 							 * we won't run into problems here.
4447 							 */
4448 							new_object = copy_object;
4449 							new_offset = copy_offset;
4450 							success = vm_object_copy_quickly(new_object,
4451 							    new_offset,
4452 							    copy_size,
4453 							    &src_needs_copy,
4454 							    &do_copy);
4455 							assert(success);
4456 							result = KERN_SUCCESS;
4457 						}
4458 						if (result != KERN_SUCCESS) {
4459 							kr = result;
4460 							break;
4461 						}
4462 
4463 						copy_object = new_object;
4464 						copy_offset = new_offset;
4465 						/*
4466 						 * No extra object reference for the mapping:
4467 						 * the mapping should be the only thing keeping
4468 						 * this new object alive.
4469 						 */
4470 					} else {
4471 						/*
4472 						 * We already have the right object
4473 						 * to map.
4474 						 */
4475 						copy_object = VME_OBJECT(copy_entry);
4476 						/* take an extra ref for the mapping below */
4477 						vm_object_reference(copy_object);
4478 					}
4479 				}
4480 
4481 				/*
4482 				 * If the caller does not want a specific
4483 				 * tag for this new mapping:  use
4484 				 * the tag of the original mapping.
4485 				 */
4486 				vm_map_kernel_flags_t vmk_remap_flags = {
4487 					.vmkf_submap = copy_entry->is_sub_map,
4488 				};
4489 
4490 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4491 				    vm_map_kernel_flags_vmflags(vmk_flags),
4492 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4493 
4494 				/* over-map the object into destination */
4495 				vmk_remap_flags.vmf_fixed = true;
4496 				vmk_remap_flags.vmf_overwrite = true;
4497 
4498 				if (!copy && !copy_entry->is_sub_map) {
4499 					/*
4500 					 * copy-on-write should have been
4501 					 * resolved at this point, or we would
4502 					 * end up sharing instead of copying.
4503 					 */
4504 					assert(!copy_entry->needs_copy);
4505 				}
4506 #if XNU_TARGET_OS_OSX
4507 				if (copy_entry->used_for_jit) {
4508 					vmk_remap_flags.vmkf_map_jit = TRUE;
4509 				}
4510 #endif /* XNU_TARGET_OS_OSX */
4511 
4512 				kr = vm_map_enter(target_map,
4513 				    &copy_addr,
4514 				    copy_size,
4515 				    (vm_map_offset_t) 0,
4516 				    vmk_remap_flags,
4517 				    copy_object,
4518 				    copy_offset,
4519 				    ((copy_object == NULL)
4520 				    ? FALSE
4521 				    : (copy || copy_entry->needs_copy)),
4522 				    cur_protection,
4523 				    max_protection,
4524 				    inheritance);
4525 				if (kr != KERN_SUCCESS) {
4526 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4527 					if (copy_entry->is_sub_map) {
4528 						vm_map_deallocate(copy_submap);
4529 					} else {
4530 						vm_object_deallocate(copy_object);
4531 					}
4532 					/* abort */
4533 					break;
4534 				}
4535 
4536 				/* next mapping */
4537 				copy_addr += copy_size;
4538 			}
4539 
4540 			named_entry_unlock(named_entry);
4541 			if (target_copy_map != copy_map) {
4542 				vm_map_copy_discard(target_copy_map);
4543 				target_copy_map = VM_MAP_COPY_NULL;
4544 			}
4545 
4546 			if (kr == KERN_SUCCESS) {
4547 				if (overmap_start) {
4548 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4549 				}
4550 				offset_in_mapping += overmap_start;
4551 			} else if (!vmk_flags.vmf_overwrite) {
4552 				/* deallocate the contiguous range */
4553 				vm_map_remove(target_map, map_addr,
4554 				    map_addr + map_size);
4555 			}
4556 			result = kr;
4557 			goto out;
4558 		}
4559 
4560 		if (named_entry->is_object) {
4561 			unsigned int    access;
4562 			unsigned int    wimg_mode;
4563 
4564 			assert(!named_entry->is_copy);
4565 			assert(!named_entry->is_sub_map);
4566 
4567 			/* we are mapping a VM object */
4568 
4569 			access = named_entry->access;
4570 
4571 			if (vmk_flags.vmf_return_data_addr ||
4572 			    vmk_flags.vmf_return_4k_data_addr) {
4573 				offset_in_mapping = obj_offs & map_mask;
4574 				if (vmk_flags.vmf_return_4k_data_addr) {
4575 					offset_in_mapping &= ~((signed)(0xFFF));
4576 				}
4577 				obj_offs -= offset_in_mapping;
4578 				map_size  = vm_map_round_page(initial_size +
4579 				    offset_in_mapping, map_mask);
4580 			}
4581 
4582 			object = vm_named_entry_to_vm_object(named_entry);
4583 			assert(object != VM_OBJECT_NULL);
4584 			vm_object_lock(object);
4585 			named_entry_unlock(named_entry);
4586 
4587 			vm_object_reference_locked(object);
4588 
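			/*
			 * Propagate the named entry's cache attributes (WIMG)
			 * to the object if they differ.
			 */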
4589 			wimg_mode = object->wimg_bits;
4590 			vm_prot_to_wimg(access, &wimg_mode);
4591 			if (object->wimg_bits != wimg_mode) {
4592 				vm_object_change_wimg_mode(object, wimg_mode);
4593 			}
4594 
4595 			vm_object_unlock(object);
4596 		} else {
4597 			panic("invalid VM named entry %p", named_entry);
4598 		}
4599 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4600 		/*
4601 		 * JMM - This is temporary until we unify named entries
4602 		 * and raw memory objects.
4603 		 *
4604 		 * Detected fake ip_kotype for a memory object.  In
4605 		 * this case, the port isn't really a port at all, but
4606 		 * instead is just a raw memory object.
4607 		 */
4608 		if (vmk_flags.vmf_return_data_addr ||
4609 		    vmk_flags.vmf_return_4k_data_addr) {
4610 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4611 		}
4612 
4613 		object = memory_object_to_vm_object((memory_object_t)port);
4614 		if (object == VM_OBJECT_NULL) {
4615 			return KERN_INVALID_OBJECT;
4616 		}
4617 		vm_object_reference(object);
4618 
4619 		/* wait for object (if any) to be ready */
4620 		if (object != VM_OBJECT_NULL) {
4621 			if (is_kernel_object(object)) {
4622 				printf("Warning: Attempt to map kernel object"
4623 				    " by a non-private kernel entity\n");
4624 				return KERN_INVALID_OBJECT;
4625 			}
4626 			if (!object->pager_ready) {
4627 				vm_object_lock(object);
4628 
4629 				while (!object->pager_ready) {
4630 					vm_object_sleep(object,
4631 					    VM_OBJECT_EVENT_PAGER_READY,
4632 					    THREAD_UNINT,
4633 					    LCK_SLEEP_EXCLUSIVE);
4634 				}
4635 				vm_object_unlock(object);
4636 			}
4637 		}
4638 	} else {
4639 		return KERN_INVALID_OBJECT;
4640 	}
4641 
4642 	if (object != VM_OBJECT_NULL &&
4643 	    object->named &&
4644 	    object->pager != MEMORY_OBJECT_NULL &&
4645 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4646 		memory_object_t pager;
4647 		vm_prot_t       pager_prot;
4648 		kern_return_t   kr;
4649 
4650 		/*
4651 		 * For "named" VM objects, let the pager know that the
4652 		 * memory object is being mapped.  Some pagers need to keep
4653 		 * track of this, to know when they can reclaim the memory
4654 		 * object, for example.
4655 		 * VM calls memory_object_map() for each mapping (specifying
4656 		 * the protection of each mapping) and calls
4657 		 * memory_object_last_unmap() when all the mappings are gone.
4658 		 */
4659 		pager_prot = max_protection;
4660 		if (copy) {
4661 			/*
4662 			 * Copy-On-Write mapping: won't modify the
4663 			 * memory object.
4664 			 */
4665 			pager_prot &= ~VM_PROT_WRITE;
4666 		}
4667 		vm_object_lock(object);
4668 		pager = object->pager;
4669 		if (object->named &&
4670 		    pager != MEMORY_OBJECT_NULL &&
4671 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4672 			assert(object->pager_ready);
4673 			vm_object_mapping_wait(object, THREAD_UNINT);
4674 			/* object might have lost its pager while waiting */
4675 			pager = object->pager;
4676 			if (object->named && pager != MEMORY_OBJECT_NULL) {
4677 				vm_object_mapping_begin(object);
4678 				vm_object_unlock(object);
4679 
4680 				kr = memory_object_map(pager, pager_prot);
4681 				assert(kr == KERN_SUCCESS);
4682 
4683 				vm_object_lock(object);
4684 				vm_object_mapping_end(object);
4685 			}
4686 		}
4687 		vm_object_unlock(object);
4688 	}
4689 
4690 	/*
4691 	 *	Perform the copy if requested
4692 	 */
4693 
4694 	if (copy) {
4695 		vm_object_t             new_object;
4696 		vm_object_offset_t      new_offset;
4697 
4698 		result = vm_object_copy_strategically(object,
4699 		    obj_offs,
4700 		    map_size,
4701 		    false,                                   /* forking */
4702 		    &new_object, &new_offset,
4703 		    &copy);
4704 
4705 
4706 		if (result == KERN_MEMORY_RESTART_COPY) {
4707 			boolean_t success;
4708 			boolean_t src_needs_copy;
4709 
4710 			/*
4711 			 * XXX
4712 			 * We currently ignore src_needs_copy.
4713 			 * This really is the issue of how to make
4714 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4715 			 * non-kernel users to use. Solution forthcoming.
4716 			 * In the meantime, since we don't allow non-kernel
4717 			 * memory managers to specify symmetric copy,
4718 			 * we won't run into problems here.
4719 			 */
4720 			new_object = object;
4721 			new_offset = obj_offs;
4722 			success = vm_object_copy_quickly(new_object,
4723 			    new_offset,
4724 			    map_size,
4725 			    &src_needs_copy,
4726 			    &copy);
4727 			assert(success);
4728 			result = KERN_SUCCESS;
4729 		}
4730 		/*
4731 		 *	Throw away the reference to the
4732 		 *	original object, as it won't be mapped.
4733 		 */
4734 
4735 		vm_object_deallocate(object);
4736 
4737 		if (result != KERN_SUCCESS) {
4738 			return result;
4739 		}
4740 
4741 		object   = new_object;
4742 		obj_offs = new_offset;
4743 	}
4744 
4745 	/*
4746 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4747 	 * need to be atomic.
4748 	 */
4749 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4750 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4751 
4752 	result = vm_map_enter(target_map,
4753 	    &map_addr, map_size,
4754 	    (vm_map_offset_t)mask,
4755 	    vmk_flags,
4756 	    object, obj_offs,
4757 	    copy,
4758 	    cur_protection, max_protection,
4759 	    inheritance);
4760 	if (result != KERN_SUCCESS) {
4761 		vm_object_deallocate(object);
4762 	}
4763 
4764 	/*
4765 	 * Try to prefault, and do not forget to release the vm map lock.
4766 	 */
4767 	if (result == KERN_SUCCESS && try_prefault) {
4768 		mach_vm_address_t va = map_addr;
4769 		kern_return_t kr = KERN_SUCCESS;
4770 		unsigned int i = 0;
4771 		int pmap_options;
4772 
4773 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4774 		if (object->internal) {
4775 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4776 		}
4777 
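		/*
		 * Enter each valid page from the caller's page list directly
		 * into the pmap; stop at the first failure since later
		 * entries would most likely fail too.
		 */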
4778 		for (i = 0; i < page_list_count; ++i) {
4779 			if (!UPL_VALID_PAGE(page_list, i)) {
4780 				if (kernel_prefault) {
4781 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4782 					result = KERN_MEMORY_ERROR;
4783 					break;
4784 				}
4785 			} else {
4786 				/*
4787 				 * If this call fails, we should stop trying to
4788 				 * optimize: the other calls are likely going to
4789 				 * fail too.
4790 				 *
4791 				 * We do not report an error for such a failure,
4792 				 * though.  Prefaulting is an optimization, not
4793 				 * something critical.
4794 				 */
4795 				kr = pmap_enter_options(target_map->pmap,
4796 				    va, UPL_PHYS_PAGE(page_list, i),
4797 				    cur_protection, VM_PROT_NONE,
4798 				    0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4799 				if (kr != KERN_SUCCESS) {
4800 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4801 					if (kernel_prefault) {
4802 						result = kr;
4803 					}
4804 					break;
4805 				}
4806 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4807 			}
4808 
4809 			/* Next virtual address */
4810 			va += PAGE_SIZE;
4811 		}
4812 		if (vmk_flags.vmkf_keep_map_locked) {
4813 			vm_map_unlock(target_map);
4814 		}
4815 	}
4816 
4817 out:
4818 	if (result == KERN_SUCCESS) {
4819 #if KASAN
4820 		if (target_map->pmap == kernel_pmap) {
4821 			kasan_notify_address(map_addr, map_size);
4822 		}
4823 #endif
4824 		*address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4825 	}
4826 	return result;
4827 }
4828 
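/*
 * Convenience wrapper around vm_map_enter_mem_object() for callers that
 * want the mapping prefaulted from "page_list": copy is forced to FALSE
 * and inheritance to VM_INHERIT_DEFAULT.
 */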
4829 kern_return_t
4830 vm_map_enter_mem_object_prefault(
4831 	vm_map_t                target_map,
4832 	vm_map_offset_ut       *address,
4833 	vm_map_size_ut          initial_size,
4834 	vm_map_offset_ut        mask,
4835 	vm_map_kernel_flags_t   vmk_flags,
4836 	ipc_port_t              port,
4837 	vm_object_offset_ut     offset,
4838 	vm_prot_ut              cur_protection,
4839 	vm_prot_ut              max_protection,
4840 	upl_page_list_ptr_t     page_list,
4841 	unsigned int            page_list_count)
4842 {
4843 	/* range_id is set by vm_map_enter_mem_object */
4844 	return vm_map_enter_mem_object(target_map,
4845 	           address,
4846 	           initial_size,
4847 	           mask,
4848 	           vmk_flags,
4849 	           port,
4850 	           offset,
4851 	           FALSE,
4852 	           cur_protection,
4853 	           max_protection,
4854 	           VM_INHERIT_DEFAULT,
4855 	           page_list,
4856 	           page_list_count);
4857 }
4858 
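/*
 * Parameter sanitization for vm_map_enter_mem_object_control().
 */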
4859 static __attribute__((always_inline, warn_unused_result))
4860 kern_return_t
4861 vm_map_enter_mem_object_control_sanitize(
4862 	vm_map_t                target_map,
4863 	vm_map_offset_ut        address_u,
4864 	vm_map_size_ut          initial_size_u,
4865 	vm_map_offset_ut        mask_u,
4866 	vm_object_offset_ut     offset_u,
4867 	vm_prot_ut              cur_protection_u,
4868 	vm_prot_ut              max_protection_u,
4869 	vm_inherit_ut           inheritance_u,
4870 	vm_map_kernel_flags_t   vmk_flags,
4871 	vm_map_address_t       *map_addr,
4872 	vm_map_size_t          *map_size,
4873 	vm_map_offset_t        *mask,
4874 	vm_object_offset_t     *obj_offs,
4875 	vm_object_offset_t     *obj_end,
4876 	vm_object_size_t       *obj_size,
4877 	vm_prot_t              *cur_protection,
4878 	vm_prot_t              *max_protection,
4879 	vm_inherit_t           *inheritance)
4880 {
4881 	kern_return_t           kr;
4882 
4883 	kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4884 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4885 	    cur_protection, max_protection);
4886 	if (__improbable(kr != KERN_SUCCESS)) {
4887 		return kr;
4888 	}
4889 
4890 	kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4891 	    inheritance);
4892 	if (__improbable(kr != KERN_SUCCESS)) {
4893 		return kr;
4894 	}
4895 
4896 	kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4897 	if (__improbable(kr != KERN_SUCCESS)) {
4898 		return kr;
4899 	}
4900 	/*
4901 	 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4902 	 * pages).
4903 	 * We keep unaligned values for now. The call we eventually make to
4904 	 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4905 	 * target_map pages or kernel pages. But this isn't enough to guarantee
4906 	 * kernel space alignment.
4907 	 */
4908 	kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4909 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4910 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4911 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4912 	    obj_offs, obj_end, obj_size);
4913 	if (__improbable(kr != KERN_SUCCESS)) {
4914 		return kr;
4915 	}
4916 
4917 	/*
4918 	 * There is no vm_sanitize_addr_size variant that also adjusts for
4919 	 * a separate offset. Rather than create one for this one-off issue,
4920 	 * we sanitize map_addr and map_size individually, relying on
4921 	 * vm_sanitize_size to incorporate the offset. Then, we perform the
4922 	 * overflow check manually below.
4923 	 */
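	/*
	 * Hedged illustration of the comment above (hypothetical numbers;
	 * assumes vm_sanitize_size() performs the usual trunc/round page
	 * arithmetic in target_map page units): with 16K pages, offset_u =
	 * 0x6100 and initial_size_u = 0x4000 describe object bytes
	 * [0x6100, 0xA100), so the mapping must cover 0x4000..0xC000 and
	 * map_size would come back as 0x8000 -- larger than the caller's
	 * size, which is why the overflow check below uses map_size.
	 */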
4924 	*map_addr = vm_sanitize_addr(target_map, address_u);
4925 	kr = vm_sanitize_size(offset_u, initial_size_u,
4926 	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4927 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
4928 	if (__improbable(kr != KERN_SUCCESS)) {
4929 		return kr;
4930 	}
4931 
4932 	/*
4933 	 * Ensure arithmetic doesn't overflow in target_map space.
4934 	 * The computation of map_size above accounts for the possibility that
4935 	 * offset_u might be unaligned in target_map space.
4936 	 */
4937 	if (vmk_flags.vmf_fixed) {
4938 		vm_map_address_t map_end;
4939 
4940 		if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
4941 			return KERN_INVALID_ARGUMENT;
4942 		}
4943 	}
4944 
4945 	return KERN_SUCCESS;
4946 }
4947 
4948 kern_return_t
4949 vm_map_enter_mem_object_control(
4950 	vm_map_t                target_map,
4951 	vm_map_offset_ut       *address_u,
4952 	vm_map_size_ut          initial_size_u,
4953 	vm_map_offset_ut        mask_u,
4954 	vm_map_kernel_flags_t   vmk_flags,
4955 	memory_object_control_t control,
4956 	vm_object_offset_ut     offset_u,
4957 	boolean_t               needs_copy,
4958 	vm_prot_ut              cur_protection_u,
4959 	vm_prot_ut              max_protection_u,
4960 	vm_inherit_ut           inheritance_u)
4961 {
4962 	vm_map_offset_t         mask;
4963 	vm_prot_t               cur_protection;
4964 	vm_prot_t               max_protection;
4965 	vm_inherit_t            inheritance;
4966 	vm_map_address_t        map_addr;
4967 	vm_map_size_t           map_size;
4968 	vm_object_t             object;
4969 	vm_object_offset_t      obj_offs, obj_end;
4970 	vm_object_size_t        obj_size;
4971 	kern_return_t           result;
4972 	memory_object_t         pager;
4973 	vm_prot_t               pager_prot;
4974 	kern_return_t           kr;
4975 
4976 	/*
4977 	 * Check arguments for validity
4978 	 */
4979 	if (target_map == VM_MAP_NULL) {
4980 		return KERN_INVALID_ARGUMENT;
4981 	}
4982 
4983 	/*
4984 	 * We only support vmf_return_data_addr-like behavior.
4985 	 */
4986 	vmk_flags.vmf_return_data_addr = true;
4987 
4988 	/*
4989 	 * Sanitize any input parameters that are addr/size/prot/inherit
4990 	 */
4991 	kr = vm_map_enter_mem_object_control_sanitize(target_map,
4992 	    *address_u,
4993 	    initial_size_u,
4994 	    mask_u,
4995 	    offset_u,
4996 	    cur_protection_u,
4997 	    max_protection_u,
4998 	    inheritance_u,
4999 	    vmk_flags,
5000 	    &map_addr,
5001 	    &map_size,
5002 	    &mask,
5003 	    &obj_offs,
5004 	    &obj_end,
5005 	    &obj_size,
5006 	    &cur_protection,
5007 	    &max_protection,
5008 	    &inheritance);
5009 	if (__improbable(kr != KERN_SUCCESS)) {
5010 		return vm_sanitize_get_kr(kr);
5011 	}
5012 
5013 	object = memory_object_control_to_vm_object(control);
5014 
5015 	if (object == VM_OBJECT_NULL) {
5016 		return KERN_INVALID_OBJECT;
5017 	}
5018 
5019 	if (is_kernel_object(object)) {
5020 		printf("Warning: Attempt to map kernel object"
5021 		    " by a non-private kernel entity\n");
5022 		return KERN_INVALID_OBJECT;
5023 	}
5024 
5025 	vm_object_lock(object);
5026 	os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);
5027 
5028 
5029 	/*
5030 	 * For "named" VM objects, let the pager know that the
5031 	 * memory object is being mapped.  Some pagers need to keep
5032 	 * track of this, to know when they can reclaim the memory
5033 	 * object, for example.
5034 	 * VM calls memory_object_map() for each mapping (specifying
5035 	 * the protection of each mapping) and calls
5036 	 * memory_object_last_unmap() when all the mappings are gone.
5037 	 */
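	/*
	 * A minimal pager-side sketch of that contract (hypothetical pager,
	 * not part of xnu; names are illustrative only): count
	 * memory_object_map() calls and reclaim once
	 * memory_object_last_unmap() reports the final unmapping.
	 *
	 *	kern_return_t
	 *	my_pager_map(memory_object_t mo, vm_prot_t prot)
	 *	{
	 *		os_atomic_inc(&my_pager_mapping_count, relaxed);
	 *		return KERN_SUCCESS;
	 *	}
	 *
	 *	kern_return_t
	 *	my_pager_last_unmap(memory_object_t mo)
	 *	{
	 *		os_atomic_store(&my_pager_mapping_count, 0, relaxed);
	 *		return KERN_SUCCESS;
	 *	}
	 *
	 * The last-unmap callback is the point at which such a pager could
	 * safely reclaim its backing resources.
	 */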
5038 	pager_prot = max_protection;
5039 	if (needs_copy) {
5040 		pager_prot &= ~VM_PROT_WRITE;
5041 	}
5042 	pager = object->pager;
5043 	if (object->named &&
5044 	    pager != MEMORY_OBJECT_NULL &&
5045 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5046 		assert(object->pager_ready);
5047 		vm_object_mapping_wait(object, THREAD_UNINT);
5048 		/* object might have lost its pager while waiting */
5049 		pager = object->pager;
5050 		if (object->named && pager != MEMORY_OBJECT_NULL) {
5051 			vm_object_mapping_begin(object);
5052 			vm_object_unlock(object);
5053 
5054 			kr = memory_object_map(pager, pager_prot);
5055 			assert(kr == KERN_SUCCESS);
5056 
5057 			vm_object_lock(object);
5058 			vm_object_mapping_end(object);
5059 		}
5060 	}
5061 	vm_object_unlock(object);
5062 
5063 	/*
5064 	 *	Perform the copy if requested
5065 	 */
5066 
5067 	if (needs_copy) {
5068 		vm_object_t             new_object;
5069 		vm_object_offset_t      new_offset;
5070 
5071 		result = vm_object_copy_strategically(object, obj_offs, obj_size,
5072 		    false,                                   /* forking */
5073 		    &new_object, &new_offset,
5074 		    &needs_copy);
5075 
5076 
5077 		if (result == KERN_MEMORY_RESTART_COPY) {
5078 			boolean_t success;
5079 			boolean_t src_needs_copy;
5080 
5081 			/*
5082 			 * XXX
5083 			 * We currently ignore src_needs_copy.
5084 			 * This really is the issue of how to make
5085 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5086 			 * non-kernel users to use. Solution forthcoming.
5087 			 * In the meantime, since we don't allow non-kernel
5088 			 * memory managers to specify symmetric copy,
5089 			 * we won't run into problems here.
5090 			 */
5091 			new_object = object;
5092 			new_offset = obj_offs;
5093 			success = vm_object_copy_quickly(new_object,
5094 			    new_offset, obj_size,
5095 			    &src_needs_copy,
5096 			    &needs_copy);
5097 			assert(success);
5098 			result = KERN_SUCCESS;
5099 		}
5100 		/*
5101 		 *	Throw away the reference to the
5102 		 *	original object, as it won't be mapped.
5103 		 */
5104 
5105 		vm_object_deallocate(object);
5106 
5107 		if (result != KERN_SUCCESS) {
5108 			return result;
5109 		}
5110 
5111 		object   = new_object;
5112 		obj_offs = new_offset;
5113 	}
5114 
5115 	result = vm_map_enter(target_map,
5116 	    &map_addr, map_size,
5117 	    (vm_map_offset_t)mask,
5118 	    vmk_flags,
5119 	    object,
5120 	    obj_offs,
5121 	    needs_copy,
5122 	    cur_protection, max_protection,
5123 	    inheritance);
5124 
5125 	if (result == KERN_SUCCESS) {
5126 		*address_u = vm_sanitize_wrap_addr(
5127 			map_addr + (obj_offs & vm_map_page_mask(target_map)));
5128 	} else {
5129 		vm_object_deallocate(object);
5130 	}
5131 
5132 	return result;
5133 }
5134 
5135 
5136 /* Not used without nested pmaps */
5137 #ifndef NO_NESTED_PMAP
5138 /*
5139  * Clip and unnest a portion of a nested submap mapping.
5140  */
5141 
5142 
5143 static void
5144 vm_map_clip_unnest(
5145 	vm_map_t        map,
5146 	vm_map_entry_t  entry,
5147 	vm_map_offset_t start_unnest,
5148 	vm_map_offset_t end_unnest)
5149 {
5150 	vm_map_offset_t old_start_unnest = start_unnest;
5151 	vm_map_offset_t old_end_unnest = end_unnest;
5152 
5153 	assert(entry->is_sub_map);
5154 	assert(VME_SUBMAP(entry) != NULL);
5155 	assert(entry->use_pmap);
5156 
5157 	/*
5158 	 * Query the platform for the optimal unnest range.
5159 	 * DRK: There's some duplication of effort here, since
5160 	 * callers may have adjusted the range to some extent. This
5161 	 * routine was introduced to support 1GiB subtree nesting
5162 	 * for x86 platforms, which can also nest on 2MiB boundaries
5163 	 * depending on size/alignment.
5164 	 */
5165 	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5166 		assert(VME_SUBMAP(entry)->is_nested_map);
5167 		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5168 		log_unnest_badness(map,
5169 		    old_start_unnest,
5170 		    old_end_unnest,
5171 		    VME_SUBMAP(entry)->is_nested_map,
5172 		    (entry->vme_start +
5173 		    VME_SUBMAP(entry)->lowest_unnestable_start -
5174 		    VME_OFFSET(entry)));
5175 	}
5176 
5177 	if (entry->vme_start > start_unnest ||
5178 	    entry->vme_end < end_unnest) {
5179 		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5180 		    "bad nested entry: start=0x%llx end=0x%llx\n",
5181 		    (long long)start_unnest, (long long)end_unnest,
5182 		    (long long)entry->vme_start, (long long)entry->vme_end);
5183 	}
5184 
5185 	if (start_unnest > entry->vme_start) {
5186 		_vm_map_clip_start(&map->hdr,
5187 		    entry,
5188 		    start_unnest);
5189 		if (map->holelistenabled) {
5190 			vm_map_store_update_first_free(map, NULL, FALSE);
5191 		} else {
5192 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5193 		}
5194 	}
5195 	if (entry->vme_end > end_unnest) {
5196 		_vm_map_clip_end(&map->hdr,
5197 		    entry,
5198 		    end_unnest);
5199 		if (map->holelistenabled) {
5200 			vm_map_store_update_first_free(map, NULL, FALSE);
5201 		} else {
5202 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5203 		}
5204 	}
5205 
5206 	pmap_unnest(map->pmap,
5207 	    entry->vme_start,
5208 	    entry->vme_end - entry->vme_start);
5209 	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5210 		/* clean up parent map/maps */
5211 		vm_map_submap_pmap_clean(
5212 			map, entry->vme_start,
5213 			entry->vme_end,
5214 			VME_SUBMAP(entry),
5215 			VME_OFFSET(entry));
5216 	}
5217 	entry->use_pmap = FALSE;
5218 	if ((map->pmap != kernel_pmap) &&
5219 	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5220 		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5221 	}
5222 }
5223 #endif  /* NO_NESTED_PMAP */
5224 
5225 __abortlike
5226 static void
5227 __vm_map_clip_atomic_entry_panic(
5228 	vm_map_t        map,
5229 	vm_map_entry_t  entry,
5230 	vm_map_offset_t where)
5231 {
5232 	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5233 	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5234 	    (uint64_t)entry->vme_start,
5235 	    (uint64_t)entry->vme_end,
5236 	    (uint64_t)where);
5237 }
5238 
5239 /*
5240  *	vm_map_clip_start:	[ internal use only ]
5241  *
5242  *	Asserts that the given entry begins at or after
5243  *	the specified address; if necessary,
5244  *	it splits the entry into two.
5245  */
5246 void
5247 vm_map_clip_start(
5248 	vm_map_t        map,
5249 	vm_map_entry_t  entry,
5250 	vm_map_offset_t startaddr)
5251 {
5252 #ifndef NO_NESTED_PMAP
5253 	if (entry->is_sub_map &&
5254 	    entry->use_pmap &&
5255 	    startaddr >= entry->vme_start) {
5256 		vm_map_offset_t start_unnest, end_unnest;
5257 
5258 		/*
5259 		 * Make sure "startaddr" is no longer in a nested range
5260 		 * before we clip.  Unnest only the minimum range the platform
5261 		 * can handle.
5262 		 * vm_map_clip_unnest may perform additional adjustments to
5263 		 * the unnest range.
5264 		 */
5265 		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5266 		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5267 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5268 	}
5269 #endif /* NO_NESTED_PMAP */
5270 	if (startaddr > entry->vme_start) {
5271 		if (!entry->is_sub_map &&
5272 		    VME_OBJECT(entry) &&
5273 		    VME_OBJECT(entry)->phys_contiguous) {
5274 			pmap_remove(map->pmap,
5275 			    (addr64_t)(entry->vme_start),
5276 			    (addr64_t)(entry->vme_end));
5277 		}
5278 		if (entry->vme_atomic) {
5279 			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5280 		}
5281 
5282 		DTRACE_VM5(
5283 			vm_map_clip_start,
5284 			vm_map_t, map,
5285 			vm_map_offset_t, entry->vme_start,
5286 			vm_map_offset_t, entry->vme_end,
5287 			vm_map_offset_t, startaddr,
5288 			int, VME_ALIAS(entry));
5289 
5290 		_vm_map_clip_start(&map->hdr, entry, startaddr);
5291 		if (map->holelistenabled) {
5292 			vm_map_store_update_first_free(map, NULL, FALSE);
5293 		} else {
5294 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5295 		}
5296 	}
5297 }
5298 
5299 
5300 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5301 	MACRO_BEGIN \
5302 	if ((startaddr) > (entry)->vme_start) \
5303 	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5304 	MACRO_END
5305 
5306 /*
5307  *	This routine is called only when it is known that
5308  *	the entry must be split.
5309  */
5310 static void
5311 _vm_map_clip_start(
5312 	struct vm_map_header    *map_header,
5313 	vm_map_entry_t          entry,
5314 	vm_map_offset_t         start)
5315 {
5316 	vm_map_entry_t  new_entry;
5317 
5318 	/*
5319 	 *	Split off the front portion --
5320 	 *	note that we must insert the new
5321 	 *	entry BEFORE this one, so that
5322 	 *	this entry has the specified starting
5323 	 *	address.
5324 	 */
5325 
5326 	if (entry->map_aligned) {
5327 		assert(VM_MAP_PAGE_ALIGNED(start,
5328 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5329 	}
5330 
5331 	new_entry = _vm_map_entry_create(map_header);
5332 	vm_map_entry_copy_full(new_entry, entry);
5333 
5334 	new_entry->vme_end = start;
5335 	assert(new_entry->vme_start < new_entry->vme_end);
5336 	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5337 	if (__improbable(start >= entry->vme_end)) {
5338 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5339 	}
5340 	assert(start < entry->vme_end);
5341 	entry->vme_start = start;
5342 
5343 #if VM_BTLOG_TAGS
5344 	if (new_entry->vme_kernel_object) {
5345 		btref_retain(new_entry->vme_tag_btref);
5346 	}
5347 #endif /* VM_BTLOG_TAGS */
5348 
5349 	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5350 
5351 	if (entry->is_sub_map) {
5352 		vm_map_reference(VME_SUBMAP(new_entry));
5353 	} else {
5354 		vm_object_reference(VME_OBJECT(new_entry));
5355 	}
5356 }
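/*
 * Worked example of the split above (hypothetical addresses): clipping an
 * entry [0x10000, 0x30000) with VME_OFFSET 0 at start = 0x14000 inserts a
 * new entry [0x10000, 0x14000) before the original, and the original
 * becomes [0x14000, 0x30000) with VME_OFFSET advanced to 0x4000, so both
 * halves keep mapping the same portions of the backing object or submap.
 */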
5357 
5358 
5359 /*
5360  *	vm_map_clip_end:	[ internal use only ]
5361  *
5362  *	Asserts that the given entry ends at or before
5363  *	the specified address; if necessary,
5364  *	it splits the entry into two.
5365  */
5366 void
5367 vm_map_clip_end(
5368 	vm_map_t        map,
5369 	vm_map_entry_t  entry,
5370 	vm_map_offset_t endaddr)
5371 {
5372 	if (endaddr > entry->vme_end) {
5373 		/*
5374 		 * Within the scope of this clipping, limit "endaddr" to
5375 		 * the end of this map entry...
5376 		 */
5377 		endaddr = entry->vme_end;
5378 	}
5379 #ifndef NO_NESTED_PMAP
5380 	if (entry->is_sub_map && entry->use_pmap) {
5381 		vm_map_offset_t start_unnest, end_unnest;
5382 
5383 		/*
5384 		 * Make sure the range between the start of this entry and
5385 		 * the new "endaddr" is no longer nested before we clip.
5386 		 * Unnest only the minimum range the platform can handle.
5387 		 * vm_map_clip_unnest may perform additional adjustments to
5388 		 * the unnest range.
5389 		 */
5390 		start_unnest = entry->vme_start;
5391 		end_unnest =
5392 		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5393 		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5394 		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5395 	}
5396 #endif /* NO_NESTED_PMAP */
5397 	if (endaddr < entry->vme_end) {
5398 		if (!entry->is_sub_map &&
5399 		    VME_OBJECT(entry) &&
5400 		    VME_OBJECT(entry)->phys_contiguous) {
5401 			pmap_remove(map->pmap,
5402 			    (addr64_t)(entry->vme_start),
5403 			    (addr64_t)(entry->vme_end));
5404 		}
5405 		if (entry->vme_atomic) {
5406 			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5407 		}
5408 		DTRACE_VM5(
5409 			vm_map_clip_end,
5410 			vm_map_t, map,
5411 			vm_map_offset_t, entry->vme_start,
5412 			vm_map_offset_t, entry->vme_end,
5413 			vm_map_offset_t, endaddr,
5414 			int, VME_ALIAS(entry));
5415 
5416 		_vm_map_clip_end(&map->hdr, entry, endaddr);
5417 		if (map->holelistenabled) {
5418 			vm_map_store_update_first_free(map, NULL, FALSE);
5419 		} else {
5420 			vm_map_store_update_first_free(map, map->first_free, FALSE);
5421 		}
5422 	}
5423 }
5424 
5425 
5426 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5427 	MACRO_BEGIN \
5428 	if ((endaddr) < (entry)->vme_end) \
5429 	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5430 	MACRO_END
5431 
5432 /*
5433  *	This routine is called only when it is known that
5434  *	the entry must be split.
5435  */
5436 static void
5437 _vm_map_clip_end(
5438 	struct vm_map_header    *map_header,
5439 	vm_map_entry_t          entry,
5440 	vm_map_offset_t         end)
5441 {
5442 	vm_map_entry_t  new_entry;
5443 
5444 	/*
5445 	 *	Create a new entry and insert it
5446 	 *	AFTER the specified entry
5447 	 */
5448 
5449 	if (entry->map_aligned) {
5450 		assert(VM_MAP_PAGE_ALIGNED(end,
5451 		    VM_MAP_HDR_PAGE_MASK(map_header)));
5452 	}
5453 
5454 	new_entry = _vm_map_entry_create(map_header);
5455 	vm_map_entry_copy_full(new_entry, entry);
5456 
5457 	if (__improbable(end <= entry->vme_start)) {
5458 		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5459 	}
5460 	assert(entry->vme_start < end);
5461 	new_entry->vme_start = entry->vme_end = end;
5462 	VME_OFFSET_SET(new_entry,
5463 	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5464 	assert(new_entry->vme_start < new_entry->vme_end);
5465 
5466 #if VM_BTLOG_TAGS
5467 	if (new_entry->vme_kernel_object) {
5468 		btref_retain(new_entry->vme_tag_btref);
5469 	}
5470 #endif /* VM_BTLOG_TAGS */
5471 
5472 	_vm_map_store_entry_link(map_header, entry, new_entry);
5473 
5474 	if (entry->is_sub_map) {
5475 		vm_map_reference(VME_SUBMAP(new_entry));
5476 	} else {
5477 		vm_object_reference(VME_OBJECT(new_entry));
5478 	}
5479 }
5480 
5481 
5482 /*
5483  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
5484  *
5485  *	Asserts that the starting and ending region
5486  *	addresses fall within the valid range of the map.
5487  */
5488 #define VM_MAP_RANGE_CHECK(map, start, end)     \
5489 	MACRO_BEGIN                             \
5490 	if (start < vm_map_min(map))            \
5491 	        start = vm_map_min(map);        \
5492 	if (end > vm_map_max(map))              \
5493 	        end = vm_map_max(map);          \
5494 	if (start > end)                        \
5495 	        start = end;                    \
5496 	MACRO_END
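/*
 * Clamping example (hypothetical values): for a map spanning
 * [0x100000, 0x7FFF0000), VM_MAP_RANGE_CHECK(map, 0x0, 0xFFFFFFF0) leaves
 * start = 0x100000 and end = 0x7FFF0000.  If start still exceeds end after
 * clamping, start is pulled down to end, so the range degenerates to empty
 * instead of producing an inverted interval.
 */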
5497 
5498 /*
5499  *	vm_map_range_check:	[ internal use only ]
5500  *
5501  *	Check that the region defined by the specified start and
5502  *	end addresses is wholly contained within a single map
5503  *	entry or set of adjacent map entries of the specified map,
5504  *	i.e. the specified region contains no unmapped space.
5505  *	If any or all of the region is unmapped, FALSE is returned.
5506  *	Otherwise, TRUE is returned and if the output argument 'entry'
5507  *	is not NULL it points to the map entry containing the start
5508  *	of the region.
5509  *
5510  *	The map is locked for reading on entry and is left locked.
5511  */
5512 static boolean_t
5513 vm_map_range_check(
5514 	vm_map_t                map,
5515 	vm_map_offset_t         start,
5516 	vm_map_offset_t         end,
5517 	vm_map_entry_t          *entry)
5518 {
5519 	vm_map_entry_t          cur;
5520 	vm_map_offset_t         prev;
5521 
5522 	/*
5523 	 *      Basic sanity checks first
5524 	 */
5525 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5526 		return FALSE;
5527 	}
5528 
5529 	/*
5530 	 *      Check first if the region starts within a valid
5531 	 *	mapping for the map.
5532 	 */
5533 	if (!vm_map_lookup_entry(map, start, &cur)) {
5534 		return FALSE;
5535 	}
5536 
5537 	/*
5538 	 *	Optimize for the case that the region is contained
5539 	 *	in a single map entry.
5540 	 */
5541 	if (entry != (vm_map_entry_t *) NULL) {
5542 		*entry = cur;
5543 	}
5544 	if (end <= cur->vme_end) {
5545 		return TRUE;
5546 	}
5547 
5548 	/*
5549 	 *      If the region is not wholly contained within a
5550 	 *      single entry, walk the entries looking for holes.
5551 	 */
5552 	prev = cur->vme_end;
5553 	cur = cur->vme_next;
5554 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5555 		if (end <= cur->vme_end) {
5556 			return TRUE;
5557 		}
5558 		prev = cur->vme_end;
5559 		cur = cur->vme_next;
5560 	}
5561 	return FALSE;
5562 }
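/*
 * Hedged usage sketch (hypothetical caller; assumes the map is already
 * read-locked, as this routine requires):
 *
 *	vm_map_entry_t first;
 *
 *	if (!vm_map_range_check(map, start, end, &first)) {
 *		vm_map_unlock_read(map);
 *		return KERN_INVALID_ADDRESS;
 *	}
 *	... here [start, end) is fully backed by "first" and its contiguous
 *	    successors ...
 */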
5563 
5564 static __attribute__((always_inline, warn_unused_result))
5565 kern_return_t
5566 vm_map_protect_sanitize(
5567 	vm_map_t                map,
5568 	vm_map_offset_ut        start_u,
5569 	vm_map_offset_ut        end_u,
5570 	vm_prot_ut              new_prot_u,
5571 	vm_map_offset_t        *start,
5572 	vm_map_offset_t        *end,
5573 	vm_prot_t              *new_prot)
5574 {
5575 	kern_return_t           kr;
5576 	vm_map_size_t           size;
5577 
5578 	kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5579 	    map, VM_PROT_COPY, new_prot);
5580 	if (__improbable(kr != KERN_SUCCESS)) {
5581 		return kr;
5582 	}
5583 
5584 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5585 	    map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
5586 	if (__improbable(kr != KERN_SUCCESS)) {
5587 		return kr;
5588 	}
5589 
5590 	return KERN_SUCCESS;
5591 }
5592 
5593 /*
5594  *	vm_map_protect:
5595  *
5596  *	Sets the protection of the specified address
5597  *	region in the target map.  If "set_max" is
5598  *	specified, the maximum protection is to be set;
5599  *	otherwise, only the current protection is affected.
5600  */
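/*
 *	Minimal calling sketch (hypothetical kernel-internal caller; assumes
 *	the unsafe-type wrappers used elsewhere in this file, such as
 *	vm_sanitize_wrap_addr(), to build the _ut arguments):
 *
 *		kr = vm_map_protect(map,
 *		    vm_sanitize_wrap_addr(start),
 *		    vm_sanitize_wrap_addr(end),
 *		    FALSE,            (change current, not maximum, protection)
 *		    new_prot_u);      (e.g. a wrapped VM_PROT_READ)
 */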
5601 kern_return_t
5602 vm_map_protect(
5603 	vm_map_t                map,
5604 	vm_map_offset_ut        start_u,
5605 	vm_map_offset_ut        end_u,
5606 	boolean_t               set_max,
5607 	vm_prot_ut              new_prot_u)
5608 {
5609 	vm_map_entry_t                  current;
5610 	vm_map_offset_t                 prev;
5611 	vm_map_entry_t                  entry;
5612 	vm_prot_t                       new_prot;
5613 	vm_prot_t                       new_max;
5614 	int                             pmap_options = 0;
5615 	kern_return_t                   kr;
5616 	vm_map_offset_t                 start, original_start;
5617 	vm_map_offset_t                 end;
5618 
5619 	kr = vm_map_protect_sanitize(map,
5620 	    start_u,
5621 	    end_u,
5622 	    new_prot_u,
5623 	    &start,
5624 	    &end,
5625 	    &new_prot);
5626 	if (__improbable(kr != KERN_SUCCESS)) {
5627 		return vm_sanitize_get_kr(kr);
5628 	}
5629 	original_start = start;
5630 
5631 	if (new_prot & VM_PROT_COPY) {
5632 		vm_map_offset_t         new_start;
5633 		vm_prot_t               cur_prot, max_prot;
5634 		vm_map_kernel_flags_t   kflags;
5635 
5636 		/* LP64todo - see below */
5637 		if (start >= map->max_offset) {
5638 			return KERN_INVALID_ADDRESS;
5639 		}
5640 
5641 		if ((new_prot & VM_PROT_ALLEXEC) &&
5642 		    map->pmap != kernel_pmap &&
5643 		    (vm_map_cs_enforcement(map)
5644 #if XNU_TARGET_OS_OSX && __arm64__
5645 		    || !VM_MAP_IS_EXOTIC(map)
5646 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5647 		    ) &&
5648 		    VM_MAP_POLICY_WX_FAIL(map)) {
5649 			DTRACE_VM3(cs_wx,
5650 			    uint64_t, (uint64_t) start,
5651 			    uint64_t, (uint64_t) end,
5652 			    vm_prot_t, new_prot);
5653 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5654 			    proc_selfpid(),
5655 			    (get_bsdtask_info(current_task())
5656 			    ? proc_name_address(get_bsdtask_info(current_task()))
5657 			    : "?"),
5658 			    __FUNCTION__, __LINE__,
5659 #if DEVELOPMENT || DEBUG
5660 			    (uint64_t)start,
5661 			    (uint64_t)end,
5662 #else /* DEVELOPMENT || DEBUG */
5663 			    (uint64_t)0,
5664 			    (uint64_t)0,
5665 #endif /* DEVELOPMENT || DEBUG */
5666 			    new_prot);
5667 			return KERN_PROTECTION_FAILURE;
5668 		}
5669 
5670 		/*
5671 		 * Let vm_map_remap_extract() know that it will need to:
5672 		 * + make a copy of the mapping
5673 		 * + add VM_PROT_WRITE to the max protections
5674 		 * + remove any protections that are no longer allowed from the
5675 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
5676 		 *   example).
5677 		 * Note that "max_prot" is an IN/OUT parameter only for this
5678 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
5679 		 * only.
5680 		 */
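		/*
		 * Concrete illustration (hypothetical request): for
		 * new_prot = VM_PROT_READ | VM_PROT_COPY, "max_prot" goes
		 * into vm_map_remap() below as VM_PROT_READ and is expected
		 * to come back with VM_PROT_WRITE added, so the private copy
		 * that replaces the original mapping can later be made
		 * writable.
		 */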
5681 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5682 		cur_prot = VM_PROT_NONE;
5683 		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5684 		kflags.vmkf_remap_prot_copy = true;
5685 		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5686 		new_start = start;
5687 		kr = vm_map_remap(map,
5688 		    vm_sanitize_wrap_addr_ref(&new_start),
5689 		    end - start,
5690 		    0, /* mask */
5691 		    kflags,
5692 		    map,
5693 		    start,
5694 		    TRUE, /* copy-on-write remapping! */
5695 		    vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5696 		    vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5697 		    VM_INHERIT_DEFAULT);
5698 		if (kr != KERN_SUCCESS) {
5699 			return kr;
5700 		}
5701 		new_prot &= ~VM_PROT_COPY;
5702 	}
5703 
5704 	vm_map_lock(map);
5705 restart_after_unlock:
5706 
5707 	/* LP64todo - remove this check when vm_map_commpage64()
5708 	 * no longer has to stuff in a map_entry for the commpage
5709 	 * above the map's max_offset.
5710 	 */
5711 	if (start >= map->max_offset) {
5712 		vm_map_unlock(map);
5713 		return KERN_INVALID_ADDRESS;
5714 	}
5715 
5716 	while (1) {
5717 		/*
5718 		 *      Lookup the entry.  If it doesn't start in a valid
5719 		 *	entry, return an error.
5720 		 */
5721 		if (!vm_map_lookup_entry(map, start, &entry)) {
5722 			vm_map_unlock(map);
5723 			return KERN_INVALID_ADDRESS;
5724 		}
5725 
5726 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5727 			start = SUPERPAGE_ROUND_DOWN(start);
5728 			continue;
5729 		}
5730 		break;
5731 	}
5732 	if (entry->superpage_size) {
5733 		end = SUPERPAGE_ROUND_UP(end);
5734 	}
5735 
5736 	/*
5737 	 *	Make a first pass to check for protection and address
5738 	 *	violations.
5739 	 */
5740 
5741 	current = entry;
5742 	prev = current->vme_start;
5743 	while ((current != vm_map_to_entry(map)) &&
5744 	    (current->vme_start < end)) {
5745 		/*
5746 		 * If there is a hole, return an error.
5747 		 */
5748 		if (current->vme_start != prev) {
5749 			vm_map_unlock(map);
5750 			return KERN_INVALID_ADDRESS;
5751 		}
5752 
5753 		new_max = current->max_protection;
5754 
5755 #if defined(__x86_64__)
5756 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5757 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5758 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5759 		}
5760 #elif CODE_SIGNING_MONITOR
5761 		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5762 			new_max |= VM_PROT_EXECUTE;
5763 		}
5764 #endif
5765 		if ((new_prot & new_max) != new_prot) {
5766 			vm_map_unlock(map);
5767 			return KERN_PROTECTION_FAILURE;
5768 		}
5769 
5770 		if (current->used_for_jit &&
5771 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5772 			vm_map_unlock(map);
5773 			return KERN_PROTECTION_FAILURE;
5774 		}
5775 
5776 #if __arm64e__
5777 		/* Disallow protecting hw assisted TPRO mappings */
5778 		if (current->used_for_tpro) {
5779 			vm_map_unlock(map);
5780 			return KERN_PROTECTION_FAILURE;
5781 		}
5782 #endif /* __arm64e__ */
5783 
5784 
5785 		if ((new_prot & VM_PROT_WRITE) &&
5786 		    (new_prot & VM_PROT_ALLEXEC) &&
5787 #if XNU_TARGET_OS_OSX
5788 		    map->pmap != kernel_pmap &&
5789 		    (vm_map_cs_enforcement(map)
5790 #if __arm64__
5791 		    || !VM_MAP_IS_EXOTIC(map)
5792 #endif /* __arm64__ */
5793 		    ) &&
5794 #endif /* XNU_TARGET_OS_OSX */
5795 #if CODE_SIGNING_MONITOR
5796 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5797 #endif
5798 		    !(current->used_for_jit)) {
5799 			DTRACE_VM3(cs_wx,
5800 			    uint64_t, (uint64_t) current->vme_start,
5801 			    uint64_t, (uint64_t) current->vme_end,
5802 			    vm_prot_t, new_prot);
5803 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5804 			    proc_selfpid(),
5805 			    (get_bsdtask_info(current_task())
5806 			    ? proc_name_address(get_bsdtask_info(current_task()))
5807 			    : "?"),
5808 			    __FUNCTION__, __LINE__,
5809 #if DEVELOPMENT || DEBUG
5810 			    (uint64_t)current->vme_start,
5811 			    (uint64_t)current->vme_end,
5812 #else /* DEVELOPMENT || DEBUG */
5813 			    (uint64_t)0,
5814 			    (uint64_t)0,
5815 #endif /* DEVELOPMENT || DEBUG */
5816 			    new_prot);
5817 			new_prot &= ~VM_PROT_ALLEXEC;
5818 			if (VM_MAP_POLICY_WX_FAIL(map)) {
5819 				vm_map_unlock(map);
5820 				return KERN_PROTECTION_FAILURE;
5821 			}
5822 		}
5823 
5824 		/*
5825 		 * If the task has requested executable lockdown,
5826 		 * deny both:
5827 		 * - adding executable protections OR
5828 		 * - adding write protections to an existing executable mapping.
5829 		 */
5830 		if (map->map_disallow_new_exec == TRUE) {
5831 			if ((new_prot & VM_PROT_ALLEXEC) ||
5832 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5833 				vm_map_unlock(map);
5834 				return KERN_PROTECTION_FAILURE;
5835 			}
5836 		}
5837 
5838 		prev = current->vme_end;
5839 		current = current->vme_next;
5840 	}
5841 
5842 #if __arm64__
5843 	if (end > prev &&
5844 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5845 		vm_map_entry_t prev_entry;
5846 
5847 		prev_entry = current->vme_prev;
5848 		if (prev_entry != vm_map_to_entry(map) &&
5849 		    !prev_entry->map_aligned &&
5850 		    (vm_map_round_page(prev_entry->vme_end,
5851 		    VM_MAP_PAGE_MASK(map))
5852 		    == end)) {
5853 			/*
5854 			 * The last entry in our range is not "map-aligned"
5855 			 * but it would have reached all the way to "end"
5856 			 * if it had been map-aligned, so this is not really
5857 			 * a hole in the range and we can proceed.
5858 			 */
5859 			prev = end;
5860 		}
5861 	}
5862 #endif /* __arm64__ */
5863 
5864 	if (end > prev) {
5865 		vm_map_unlock(map);
5866 		return KERN_INVALID_ADDRESS;
5867 	}
5868 
5869 	/*
5870 	 *	Go back and fix up protections.
5871 	 *	Clip to start here if the range starts within
5872 	 *	the entry.
5873 	 */
5874 
5875 	current = entry;
5876 	if (current != vm_map_to_entry(map)) {
5877 		/* clip and unnest if necessary */
5878 		vm_map_clip_start(map, current, start);
5879 	}
5880 
5881 	while ((current != vm_map_to_entry(map)) &&
5882 	    (current->vme_start < end)) {
5883 		vm_prot_t       old_prot;
5884 
5885 		if (current->in_transition) {
5886 			wait_result_t wait_result;
5887 			vm_map_offset_t current_start;
5888 
5889 			/*
5890 			 * Another thread is wiring/unwiring this entry.
5891 			 * Let the other thread know we are waiting.
5892 			 */
5893 			current_start = current->vme_start;
5894 			current->needs_wakeup = true;
5895 			/* wait for the other thread to be done */
5896 			wait_result = vm_map_entry_wait(map, TH_UNINT);
5897 			/*
5898 			 * We unlocked the map, so anything could have changed in the
5899 			 * range and we need to re-check from "current_start" to "end".
5900 			 * Our entries might no longer be valid.
5901 			 */
5902 			current = NULL;
5903 			entry = NULL;
5904 			/*
5905 			 * Re-lookup and re-clip "current_start".
5906 			 * If it's no longer mapped, we restart from the next mapping.
5907 			 */
5908 			vm_map_lookup_entry_or_next(map, current_start, &current);
5909 			if (current != vm_map_to_entry(map)) {
5910 				vm_map_clip_start(map, current, current_start);
5911 			}
5912 			/* restart from this point */
5913 			start = current_start;
5914 			goto restart_after_unlock;
5915 		}
5916 
5917 		vm_map_clip_end(map, current, end);
5918 
5919 #if DEVELOPMENT || DEBUG
5920 		if (current->csm_associated && vm_log_xnu_user_debug) {
5921 			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
5922 			    proc_selfpid(),
5923 			    (get_bsdtask_info(current_task())
5924 			    ? proc_name_address(get_bsdtask_info(current_task()))
5925 			    : "?"),
5926 			    __FUNCTION__,
5927 			    (uint64_t)start,
5928 			    (uint64_t)end,
5929 			    new_prot,
5930 			    map, current,
5931 			    current->vme_start,
5932 			    current->vme_end,
5933 			    current->protection,
5934 			    current->max_protection);
5935 		}
5936 #endif /* DEVELOPMENT || DEBUG */
5937 
5938 		if (current->is_sub_map) {
5939 			/* clipping did unnest if needed */
5940 			assert(!current->use_pmap);
5941 		}
5942 
5943 		old_prot = current->protection;
5944 
5945 		if (set_max) {
5946 			current->max_protection = new_prot;
5947 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
5948 			current->protection = (new_prot & old_prot);
5949 		} else {
5950 			current->protection = new_prot;
5951 		}
5952 
5953 #if CODE_SIGNING_MONITOR
5954 		if (!current->vme_xnu_user_debug &&
5955 		    /* a !csm_associated mapping becoming executable */
5956 		    ((!current->csm_associated &&
5957 		    !(old_prot & VM_PROT_EXECUTE) &&
5958 		    (current->protection & VM_PROT_EXECUTE))
5959 		    ||
5960 		    /* a csm_associated mapping becoming writable */
5961 		    (current->csm_associated &&
5962 		    !(old_prot & VM_PROT_WRITE) &&
5963 		    (current->protection & VM_PROT_WRITE)))) {
5964 			/*
5965 			 * This mapping has not already been marked as
5966 			 * "user_debug" and it is either:
5967 			 * 1. not code-signing-monitored and becoming executable
5968 			 * 2. code-signing-monitored and becoming writable,
5969 			 * so inform the CodeSigningMonitor and mark the
5970 			 * mapping as "user_debug" if appropriate.
5971 			 */
5972 			vm_map_kernel_flags_t vmk_flags;
5973 			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
5974 			/* pretend it's a vm_protect(VM_PROT_COPY)... */
5975 			vmk_flags.vmkf_remap_prot_copy = true;
5976 			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
5977 #if DEVELOPMENT || DEBUG
5978 			if (vm_log_xnu_user_debug) {
5979 				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
5980 				    proc_selfpid(),
5981 				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
5982 				    __FUNCTION__, __LINE__,
5983 				    map, current,
5984 				    current->vme_start, current->vme_end,
5985 				    old_prot, current->protection,
5986 				    kr, current->vme_xnu_user_debug);
5987 			}
5988 #endif /* DEVELOPMENT || DEBUG */
5989 		}
5990 #endif /* CODE_SIGNING_MONITOR */
5991 
5992 		/*
5993 		 *	Update physical map if necessary.
5994 		 *	If the request is to turn off write protection,
5995 		 *	we won't do it for real (in pmap). This is because
5996 		 *	it would cause copy-on-write to fail.  We've already
5997 		 *	set the new protection in the map, so if a
5998 		 *	write-protect fault occurred, it will be fixed up
5999 		 *	properly, COW or not.
6000 		 */
6001 		if (current->protection != old_prot) {
6002 			/* Look one level in: we support nested pmaps */
6003 			/* from mapped submaps which are direct entries */
6004 			/* in our map */
6005 
6006 			vm_prot_t prot;
6007 
6008 			prot = current->protection;
6009 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6010 				prot &= ~VM_PROT_WRITE;
6011 			} else {
6012 				assert(!VME_OBJECT(current)->code_signed);
6013 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6014 				if (prot & VM_PROT_WRITE) {
6015 					/*
6016 					 * For write requests on the
6017 					 * compressor, we will ask the
6018 					 * pmap layer to prevent us from
6019 					 * taking a write fault when we
6020 					 * attempt to access the mapping
6021 					 * next.
6022 					 */
6023 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6024 				}
6025 			}
6026 
6027 			if (override_nx(map, VME_ALIAS(current)) && prot) {
6028 				prot |= VM_PROT_EXECUTE;
6029 			}
6030 
6031 #if DEVELOPMENT || DEBUG
6032 			if (!(old_prot & VM_PROT_EXECUTE) &&
6033 			    (prot & VM_PROT_EXECUTE) &&
6034 			    panic_on_unsigned_execute &&
6035 			    (proc_selfcsflags() & CS_KILL)) {
6036 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6037 			}
6038 #endif /* DEVELOPMENT || DEBUG */
6039 
6040 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6041 				if (current->wired_count) {
6042 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6043 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6044 				}
6045 
6046 				/* If the pmap layer cares about this
6047 				 * protection type, force a fault for
6048 				 * each page so that vm_fault will
6049 				 * repopulate the page with the full
6050 				 * set of protections.
6051 				 */
6052 				/*
6053 				 * TODO: We don't seem to need this,
6054 				 * but this is due to an internal
6055 				 * implementation detail of
6056 				 * pmap_protect.  Do we want to rely
6057 				 * on this?
6058 				 */
6059 				prot = VM_PROT_NONE;
6060 			}
6061 
6062 			if (current->is_sub_map && current->use_pmap) {
6063 				pmap_protect(VME_SUBMAP(current)->pmap,
6064 				    current->vme_start,
6065 				    current->vme_end,
6066 				    prot);
6067 			} else {
6068 				pmap_protect_options(map->pmap,
6069 				    current->vme_start,
6070 				    current->vme_end,
6071 				    prot,
6072 				    pmap_options,
6073 				    NULL);
6074 			}
6075 		}
6076 		current = current->vme_next;
6077 	}
6078 
6079 	if (entry == VM_MAP_ENTRY_NULL) {
6080 		/*
6081 		 * Re-lookup the original start of our range.
6082 		 * If it's no longer mapped, start with the next mapping.
6083 		 */
6084 		vm_map_lookup_entry_or_next(map, original_start, &entry);
6085 	}
6086 	current = entry;
6087 	while ((current != vm_map_to_entry(map)) &&
6088 	    (current->vme_start <= end)) {
6089 		vm_map_simplify_entry(map, current);
6090 		current = current->vme_next;
6091 	}
6092 
6093 	vm_map_unlock(map);
6094 	return KERN_SUCCESS;
6095 }
6096 
6097 static __attribute__((always_inline, warn_unused_result))
6098 kern_return_t
6099 vm_map_inherit_sanitize(
6100 	vm_map_t                        map,
6101 	vm_map_offset_ut                start_u,
6102 	vm_map_offset_ut                end_u,
6103 	vm_inherit_ut                   new_inheritance_u,
6104 	vm_map_offset_t                *start,
6105 	vm_map_offset_t                *end,
6106 	vm_inherit_t                   *new_inheritance)
6107 {
6108 	kern_return_t   kr;
6109 	vm_map_size_t   size;
6110 
6111 	kr = vm_sanitize_inherit(new_inheritance_u,
6112 	    VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6113 	if (__improbable(kr != KERN_SUCCESS)) {
6114 		return kr;
6115 	}
6116 
6117 	kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6118 	    map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
6119 	if (__improbable(kr != KERN_SUCCESS)) {
6120 		return kr;
6121 	}
6122 
6123 	return KERN_SUCCESS;
6124 }
6125 
6126 /*
6127  *	vm_map_inherit:
6128  *
6129  *	Sets the inheritance of the specified address
6130  *	range in the target map.  Inheritance
6131  *	affects how the map will be shared with
6132  *	child maps at the time of vm_map_fork.
6133  */
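/*
 *	Hedged usage sketch (hypothetical caller; the _ut arguments are assumed
 *	to come from the same sanitizer wrappers used elsewhere in this file):
 *
 *		kr = vm_map_inherit(map, start_u, end_u, new_inheritance_u);
 *
 *	With new_inheritance_u wrapping VM_INHERIT_NONE, a child created by
 *	vm_map_fork() sees a hole over [start, end); VM_INHERIT_SHARE keeps the
 *	range shared with the child and VM_INHERIT_COPY gives the child its own
 *	copy, except over submap entries, which reject VM_INHERIT_COPY below.
 */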
6134 kern_return_t
6135 vm_map_inherit(
6136 	vm_map_t                        map,
6137 	vm_map_offset_ut                start_u,
6138 	vm_map_offset_ut                end_u,
6139 	vm_inherit_ut                   new_inheritance_u)
6140 {
6141 	vm_map_entry_t  entry;
6142 	vm_map_entry_t  temp_entry;
6143 	kern_return_t   kr;
6144 	vm_map_offset_t start;
6145 	vm_map_offset_t end;
6146 	vm_inherit_t    new_inheritance;
6147 
6148 	kr = vm_map_inherit_sanitize(map,
6149 	    start_u,
6150 	    end_u,
6151 	    new_inheritance_u,
6152 	    &start,
6153 	    &end,
6154 	    &new_inheritance);
6155 	if (__improbable(kr != KERN_SUCCESS)) {
6156 		return vm_sanitize_get_kr(kr);
6157 	}
6158 
6159 	vm_map_lock(map);
6160 
6161 	VM_MAP_RANGE_CHECK(map, start, end);
6162 
6163 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6164 		entry = temp_entry;
6165 	} else {
6166 		temp_entry = temp_entry->vme_next;
6167 		entry = temp_entry;
6168 	}
6169 
6170 	/* first check entire range for submaps which can't support the */
6171 	/* given inheritance. */
6172 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6173 		if (entry->is_sub_map) {
6174 			if (new_inheritance == VM_INHERIT_COPY) {
6175 				vm_map_unlock(map);
6176 				return KERN_INVALID_ARGUMENT;
6177 			}
6178 		}
6179 
6180 		entry = entry->vme_next;
6181 	}
6182 
6183 	entry = temp_entry;
6184 	if (entry != vm_map_to_entry(map)) {
6185 		/* clip and unnest if necessary */
6186 		vm_map_clip_start(map, entry, start);
6187 	}
6188 
6189 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6190 		vm_map_clip_end(map, entry, end);
6191 		if (entry->is_sub_map) {
6192 			/* clip did unnest if needed */
6193 			assert(!entry->use_pmap);
6194 		}
6195 
6196 		entry->inheritance = new_inheritance;
6197 
6198 		entry = entry->vme_next;
6199 	}
6200 
6201 	vm_map_unlock(map);
6202 	return KERN_SUCCESS;
6203 }
6204 
6205 /*
6206  * Update the accounting for the amount of wired memory in this map.  If the user has
6207  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6208  */
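/*
 * Illustrative form of the user-wire admission check performed below (names
 * are taken from the code; the inequalities just restate it):
 *
 *	size + map->user_wire_size
 *	        <= MIN(map->user_wire_limit, vm_per_task_user_wire_limit)
 *	size + ptoa_64(vm_page_wire_count + vm_lopage_free_count)
 *	        <= vm_global_user_wire_limit
 *
 * Both must hold the first time a user wires a given entry; otherwise the
 * request fails with KERN_RESOURCE_SHORTAGE.
 */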
6209 
6210 static kern_return_t
6211 add_wire_counts(
6212 	vm_map_t        map,
6213 	vm_map_entry_t  entry,
6214 	boolean_t       user_wire)
6215 {
6216 	vm_map_size_t   size;
6217 
6218 	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6219 
6220 	if (user_wire) {
6221 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6222 
6223 		/*
6224 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6225 		 * this map entry.
6226 		 */
6227 
6228 		if (entry->user_wired_count == 0) {
6229 			size = entry->vme_end - entry->vme_start;
6230 
6231 			/*
6232 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6233 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6234 			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
6235 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6236 			 * limit, then we fail.
6237 			 */
6238 
6239 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6240 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6241 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6242 #if DEVELOPMENT || DEBUG
6243 					if (panic_on_mlock_failure) {
6244 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6245 					}
6246 #endif /* DEVELOPMENT || DEBUG */
6247 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6248 				} else {
6249 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6250 #if DEVELOPMENT || DEBUG
6251 					if (panic_on_mlock_failure) {
6252 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6253 					}
6254 #endif /* DEVELOPMENT || DEBUG */
6255 				}
6256 				return KERN_RESOURCE_SHORTAGE;
6257 			}
6258 
6259 			/*
6260 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6261 			 * the total that has been wired in the map.
6262 			 */
6263 
6264 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6265 				return KERN_FAILURE;
6266 			}
6267 
6268 			entry->wired_count++;
6269 			map->user_wire_size += size;
6270 		}
6271 
6272 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6273 			return KERN_FAILURE;
6274 		}
6275 
6276 		entry->user_wired_count++;
6277 	} else {
6278 		/*
6279 		 * The kernel's wiring the memory.  Just bump the count and continue.
6280 		 */
6281 
6282 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6283 			panic("vm_map_wire: too many wirings");
6284 		}
6285 
6286 		entry->wired_count++;
6287 	}
6288 
6289 	if (first_wire) {
6290 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6291 	}
6292 
6293 	return KERN_SUCCESS;
6294 }
6295 
6296 /*
6297  * Update the memory wiring accounting now that the given map entry is being unwired.
6298  */
6299 
6300 static void
6301 subtract_wire_counts(
6302 	vm_map_t        map,
6303 	vm_map_entry_t  entry,
6304 	boolean_t       user_wire)
6305 {
6306 	if (user_wire) {
6307 		/*
6308 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6309 		 */
6310 
6311 		if (entry->user_wired_count == 1) {
6312 			/*
6313 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6314 			 * user wired memory for this map.
6315 			 */
6316 
6317 			assert(entry->wired_count >= 1);
6318 			entry->wired_count--;
6319 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6320 		}
6321 
6322 		assert(entry->user_wired_count >= 1);
6323 		entry->user_wired_count--;
6324 	} else {
6325 		/*
6326 		 * The kernel is unwiring the memory.   Just update the count.
6327 		 */
6328 
6329 		assert(entry->wired_count >= 1);
6330 		entry->wired_count--;
6331 	}
6332 
6333 	vme_btref_consider_and_put(entry);
6334 }
6335 
6336 int cs_executable_wire = 0;
6337 
6338 static kern_return_t
6339 vm_map_wire_nested(
6340 	vm_map_t                map,
6341 	vm_map_offset_t         start,
6342 	vm_map_offset_t         end,
6343 	vm_prot_t               caller_prot,
6344 	vm_tag_t                tag,
6345 	boolean_t               user_wire,
6346 	pmap_t                  map_pmap,
6347 	vm_map_offset_t         pmap_addr,
6348 	ppnum_t                *physpage_p)
6349 {
6350 	vm_map_entry_t          entry;
6351 	vm_prot_t               access_type;
6352 	struct vm_map_entry     *first_entry, tmp_entry;
6353 	vm_map_t                real_map;
6354 	vm_map_offset_t         s, e;
6355 	kern_return_t           rc;
6356 	boolean_t               need_wakeup;
6357 	boolean_t               main_map = FALSE;
6358 	wait_interrupt_t        interruptible_state;
6359 	thread_t                cur_thread;
6360 	unsigned int            last_timestamp;
6361 	vm_map_size_t           size;
6362 	boolean_t               wire_and_extract;
6363 	vm_prot_t               extra_prots;
6364 
6365 	extra_prots = VM_PROT_COPY;
6366 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6367 #if XNU_TARGET_OS_OSX
6368 	if (map->pmap == kernel_pmap ||
6369 	    !vm_map_cs_enforcement(map)) {
6370 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6371 	}
6372 #endif /* XNU_TARGET_OS_OSX */
6373 #if CODE_SIGNING_MONITOR
6374 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6375 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6376 	}
6377 #endif /* CODE_SIGNING_MONITOR */
6378 
6379 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6380 
6381 	wire_and_extract = FALSE;
6382 	if (physpage_p != NULL) {
6383 		/*
6384 		 * The caller wants the physical page number of the
6385 		 * wired page.  We return only one physical page number
6386 		 * so this works for only one page at a time.
6387 		 *
6388 		 * The only caller (vm_map_wire_and_extract)
6389 		 * guarantees it.
6390 		 */
6391 		assert(end - start == VM_MAP_PAGE_SIZE(map));
6392 		wire_and_extract = TRUE;
6393 		*physpage_p = 0;
6394 	}
6395 
6396 	VM_MAP_RANGE_CHECK(map, start, end);
6397 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6398 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6399 	if (start == end) {
6400 		/* We wired what the caller asked for, zero pages */
6401 		return KERN_SUCCESS;
6402 	}
6403 
6404 	vm_map_lock(map);
6405 	if (map_pmap == NULL) {
6406 		main_map = TRUE;
6407 	}
6408 	last_timestamp = map->timestamp;
6409 
6410 	need_wakeup = FALSE;
6411 	cur_thread = current_thread();
6412 
6413 	s = start;
6414 	rc = KERN_SUCCESS;
6415 
6416 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6417 		entry = first_entry;
6418 		/*
6419 		 * vm_map_clip_start will be done later.
6420 		 * We don't want to unnest any nested submaps here !
6421 		 */
6422 	} else {
6423 		/* Start address is not in map */
6424 		rc = KERN_INVALID_ADDRESS;
6425 		goto done;
6426 	}
6427 
6428 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6429 		/*
6430 		 * At this point, we have wired from "start" to "s".
6431 		 * We still need to wire from "s" to "end".
6432 		 *
6433 		 * "entry" hasn't been clipped, so it could start before "s"
6434 		 * and/or end after "end".
6435 		 */
6436 
6437 		/* "e" is how far we want to wire in this entry */
6438 		e = entry->vme_end;
6439 		if (e > end) {
6440 			e = end;
6441 		}
6442 
6443 		/*
6444 		 * If another thread is wiring/unwiring this entry then
6445 		 * block after informing other thread to wake us up.
6446 		 */
6447 		if (entry->in_transition) {
6448 			wait_result_t wait_result;
6449 
6450 			/*
6451 			 * We have not clipped the entry.  Make sure that
6452 			 * the start address is in range so that the lookup
6453 			 * below will succeed.
6454 			 * "s" is the current starting point: we've already
6455 			 * wired from "start" to "s" and we still have
6456 			 * to wire from "s" to "end".
6457 			 */
6458 
6459 			entry->needs_wakeup = TRUE;
6460 
6461 			/*
6462 			 * wake up anybody waiting on entries that we have
6463 			 * already wired.
6464 			 */
6465 			if (need_wakeup) {
6466 				vm_map_entry_wakeup(map);
6467 				need_wakeup = FALSE;
6468 			}
6469 			/*
6470 			 * User wiring is interruptible
6471 			 */
6472 			wait_result = vm_map_entry_wait(map,
6473 			    (user_wire) ? THREAD_ABORTSAFE :
6474 			    THREAD_UNINT);
6475 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6476 				/*
6477 				 * undo the wirings we have done so far
6478 				 * We do not clear the needs_wakeup flag,
6479 				 * because we cannot tell if we were the
6480 				 * only one waiting.
6481 				 */
6482 				rc = KERN_FAILURE;
6483 				goto done;
6484 			}
6485 
6486 			/*
6487 			 * Cannot avoid a lookup here. reset timestamp.
6488 			 */
6489 			last_timestamp = map->timestamp;
6490 
6491 			/*
6492 			 * The entry could have been clipped, look it up again.
6493 			 * Worst that can happen is that it may not exist anymore.
6494 			 */
6495 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6496 				/*
6497 				 * User: undo everything up to the previous
6498 				 * entry.  Let vm_map_unwire worry about
6499 				 * checking the validity of the range.
6500 				 */
6501 				rc = KERN_FAILURE;
6502 				goto done;
6503 			}
6504 			entry = first_entry;
6505 			continue;
6506 		}
6507 
6508 		if (entry->is_sub_map) {
6509 			vm_map_offset_t sub_start;
6510 			vm_map_offset_t sub_end;
6511 			vm_map_offset_t local_start;
6512 			vm_map_offset_t local_end;
6513 			pmap_t          pmap;
6514 
6515 			if (wire_and_extract) {
6516 				/*
6517 				 * Wiring would result in copy-on-write
6518 				 * which would not be compatible with
6519 				 * the sharing we have with the original
6520 				 * provider of this memory.
6521 				 */
6522 				rc = KERN_INVALID_ARGUMENT;
6523 				goto done;
6524 			}
6525 
6526 			vm_map_clip_start(map, entry, s);
6527 			vm_map_clip_end(map, entry, end);
6528 
6529 			sub_start = VME_OFFSET(entry);
6530 			sub_end = entry->vme_end;
6531 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6532 
6533 			local_end = entry->vme_end;
6534 			if (map_pmap == NULL) {
6535 				vm_object_t             object;
6536 				vm_object_offset_t      offset;
6537 				vm_prot_t               prot;
6538 				boolean_t               wired;
6539 				vm_map_entry_t          local_entry;
6540 				vm_map_version_t         version;
6541 				vm_map_t                lookup_map;
6542 
6543 				if (entry->use_pmap) {
6544 					pmap = VME_SUBMAP(entry)->pmap;
6545 					/* ppc implementation requires that */
6546 					/* submap pmap address ranges line */
6547 					/* up with parent map */
6548 #ifdef notdef
6549 					pmap_addr = sub_start;
6550 #endif
6551 					pmap_addr = s;
6552 				} else {
6553 					pmap = map->pmap;
6554 					pmap_addr = s;
6555 				}
6556 
6557 				if (entry->wired_count) {
6558 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6559 						goto done;
6560 					}
6561 
6562 					/*
6563 					 * The map was not unlocked:
6564 					 * no need to goto re-lookup.
6565 					 * Just go directly to next entry.
6566 					 */
6567 					entry = entry->vme_next;
6568 					s = entry->vme_start;
6569 					continue;
6570 				}
6571 
6572 				/* call vm_map_lookup_and_lock_object to */
6573 				/* cause any needs copy to be   */
6574 				/* evaluated */
6575 				local_start = entry->vme_start;
6576 				lookup_map = map;
6577 				vm_map_lock_write_to_read(map);
6578 				rc = vm_map_lookup_and_lock_object(
6579 					&lookup_map, local_start,
6580 					(access_type | extra_prots),
6581 					OBJECT_LOCK_EXCLUSIVE,
6582 					&version, &object,
6583 					&offset, &prot, &wired,
6584 					NULL,
6585 					&real_map, NULL);
6586 				if (rc != KERN_SUCCESS) {
6587 					vm_map_unlock_read(lookup_map);
6588 					assert(map_pmap == NULL);
6589 					vm_map_unwire_nested(map, start,
6590 					    s, user_wire, PMAP_NULL, 0);
6591 					return rc;
6592 				}
6593 				vm_object_unlock(object);
6594 				if (real_map != lookup_map) {
6595 					vm_map_unlock(real_map);
6596 				}
6597 				vm_map_unlock_read(lookup_map);
6598 				vm_map_lock(map);
6599 
6600 				/* we unlocked, so must re-lookup */
6601 				if (!vm_map_lookup_entry(map,
6602 				    local_start,
6603 				    &local_entry)) {
6604 					rc = KERN_FAILURE;
6605 					goto done;
6606 				}
6607 
6608 				/*
6609 				 * entry could have been "simplified",
6610 				 * so re-clip
6611 				 */
6612 				entry = local_entry;
6613 				assert(s == local_start);
6614 				vm_map_clip_start(map, entry, s);
6615 				vm_map_clip_end(map, entry, end);
6616 				/* re-compute "e" */
6617 				e = entry->vme_end;
6618 				if (e > end) {
6619 					e = end;
6620 				}
6621 
6622 				/* did we have a change of type? */
6623 				if (!entry->is_sub_map) {
6624 					last_timestamp = map->timestamp;
6625 					continue;
6626 				}
6627 			} else {
6628 				local_start = entry->vme_start;
6629 				pmap = map_pmap;
6630 			}
6631 
6632 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6633 				goto done;
6634 			}
6635 
6636 			entry->in_transition = TRUE;
6637 
6638 			vm_map_unlock(map);
6639 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6640 			    sub_start, sub_end,
6641 			    caller_prot, tag,
6642 			    user_wire, pmap, pmap_addr,
6643 			    NULL);
6644 			vm_map_lock(map);
6645 
6646 			/*
6647 			 * Find the entry again.  It could have been clipped
6648 			 * after we unlocked the map.
6649 			 */
6650 			if (!vm_map_lookup_entry(map, local_start,
6651 			    &first_entry)) {
6652 				panic("vm_map_wire: re-lookup failed");
6653 			}
6654 			entry = first_entry;
6655 
6656 			assert(local_start == s);
6657 			/* re-compute "e" */
6658 			e = entry->vme_end;
6659 			if (e > end) {
6660 				e = end;
6661 			}
6662 
6663 			last_timestamp = map->timestamp;
6664 			while ((entry != vm_map_to_entry(map)) &&
6665 			    (entry->vme_start < e)) {
6666 				assert(entry->in_transition);
6667 				entry->in_transition = FALSE;
6668 				if (entry->needs_wakeup) {
6669 					entry->needs_wakeup = FALSE;
6670 					need_wakeup = TRUE;
6671 				}
6672 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6673 					subtract_wire_counts(map, entry, user_wire);
6674 				}
6675 				entry = entry->vme_next;
6676 			}
6677 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6678 				goto done;
6679 			}
6680 
6681 			/* no need to relookup again */
6682 			s = entry->vme_start;
6683 			continue;
6684 		}
6685 
6686 		/*
6687 		 * If this entry is already wired then increment
6688 		 * the appropriate wire reference count.
6689 		 */
6690 		if (entry->wired_count) {
6691 			if ((entry->protection & access_type) != access_type) {
6692 				/* found a protection problem */
6693 
6694 				/*
6695 				 * XXX FBDP
6696 				 * We should always return an error
6697 				 * in this case but since we didn't
6698 				 * enforce it before, let's do
6699 				 * it only for the new "wire_and_extract"
6700 				 * code path for now...
6701 				 */
6702 				if (wire_and_extract) {
6703 					rc = KERN_PROTECTION_FAILURE;
6704 					goto done;
6705 				}
6706 			}
6707 
6708 			/*
6709 			 * entry is already wired down, get our reference
6710 			 * after clipping to our range.
6711 			 */
6712 			vm_map_clip_start(map, entry, s);
6713 			vm_map_clip_end(map, entry, end);
6714 
6715 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6716 				goto done;
6717 			}
6718 
6719 			if (wire_and_extract) {
6720 				vm_object_t             object;
6721 				vm_object_offset_t      offset;
6722 				vm_page_t               m;
6723 
6724 				/*
6725 				 * We don't have to "wire" the page again
6726 				 * but we still have to "extract" its
6727 				 * physical page number, after some sanity
6728 				 * checks.
6729 				 */
6730 				assert((entry->vme_end - entry->vme_start)
6731 				    == PAGE_SIZE);
6732 				assert(!entry->needs_copy);
6733 				assert(!entry->is_sub_map);
6734 				assert(VME_OBJECT(entry));
6735 				if (((entry->vme_end - entry->vme_start)
6736 				    != PAGE_SIZE) ||
6737 				    entry->needs_copy ||
6738 				    entry->is_sub_map ||
6739 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6740 					rc = KERN_INVALID_ARGUMENT;
6741 					goto done;
6742 				}
6743 
6744 				object = VME_OBJECT(entry);
6745 				offset = VME_OFFSET(entry);
6746 				/* need exclusive lock to update m->dirty */
6747 				if (entry->protection & VM_PROT_WRITE) {
6748 					vm_object_lock(object);
6749 				} else {
6750 					vm_object_lock_shared(object);
6751 				}
6752 				m = vm_page_lookup(object, offset);
6753 				assert(m != VM_PAGE_NULL);
6754 				assert(VM_PAGE_WIRED(m));
6755 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6756 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6757 					if (entry->protection & VM_PROT_WRITE) {
6758 						vm_object_lock_assert_exclusive(
6759 							object);
6760 						m->vmp_dirty = TRUE;
6761 					}
6762 				} else {
6763 					/* not already wired !? */
6764 					*physpage_p = 0;
6765 				}
6766 				vm_object_unlock(object);
6767 			}
6768 
6769 			/* map was not unlocked: no need to relookup */
6770 			entry = entry->vme_next;
6771 			s = entry->vme_start;
6772 			continue;
6773 		}
6774 
6775 		/*
6776 		 * Unwired entry or wire request transmitted via submap
6777 		 */
6778 
6779 		/*
6780 		 * Wiring would copy the pages to the shadow object.
6781 		 * The shadow object would not be code-signed so
6782 		 * attempting to execute code from these copied pages
6783 		 * would trigger a code-signing violation.
6784 		 */
6785 
6786 		if ((entry->protection & VM_PROT_EXECUTE)
6787 #if XNU_TARGET_OS_OSX
6788 		    &&
6789 		    map->pmap != kernel_pmap &&
6790 		    (vm_map_cs_enforcement(map)
6791 #if __arm64__
6792 		    || !VM_MAP_IS_EXOTIC(map)
6793 #endif /* __arm64__ */
6794 		    )
6795 #endif /* XNU_TARGET_OS_OSX */
6796 #if CODE_SIGNING_MONITOR
6797 		    &&
6798 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6799 #endif
6800 		    ) {
6801 #if MACH_ASSERT
6802 			printf("pid %d[%s] wiring executable range from "
6803 			    "0x%llx to 0x%llx: rejected to preserve "
6804 			    "code-signing\n",
6805 			    proc_selfpid(),
6806 			    (get_bsdtask_info(current_task())
6807 			    ? proc_name_address(get_bsdtask_info(current_task()))
6808 			    : "?"),
6809 			    (uint64_t) entry->vme_start,
6810 			    (uint64_t) entry->vme_end);
6811 #endif /* MACH_ASSERT */
6812 			DTRACE_VM2(cs_executable_wire,
6813 			    uint64_t, (uint64_t)entry->vme_start,
6814 			    uint64_t, (uint64_t)entry->vme_end);
6815 			cs_executable_wire++;
6816 			rc = KERN_PROTECTION_FAILURE;
6817 			goto done;
6818 		}
6819 
6820 		/*
6821 		 * Perform actions of vm_map_lookup that need the write
6822 		 * lock on the map: create a shadow object for a
6823 		 * copy-on-write region, or an object for a zero-fill
6824 		 * region.
6825 		 */
6826 		size = entry->vme_end - entry->vme_start;
6827 		/*
6828 		 * If wiring a copy-on-write page, we need to copy it now
6829 		 * even if we're only (currently) requesting read access.
6830 		 * This is aggressive, but once it's wired we can't move it.
6831 		 */
6832 		if (entry->needs_copy) {
6833 			if (wire_and_extract) {
6834 				/*
6835 				 * We're supposed to share with the original
6836 				 * provider so should not be "needs_copy"
6837 				 */
6838 				rc = KERN_INVALID_ARGUMENT;
6839 				goto done;
6840 			}
6841 
6842 			VME_OBJECT_SHADOW(entry, size,
6843 			    vm_map_always_shadow(map));
6844 			entry->needs_copy = FALSE;
6845 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6846 			if (wire_and_extract) {
6847 				/*
6848 				 * We're supposed to share with the original
6849 				 * provider so should already have an object.
6850 				 */
6851 				rc = KERN_INVALID_ARGUMENT;
6852 				goto done;
6853 			}
6854 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6855 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6856 			assert(entry->use_pmap);
6857 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6858 			if (wire_and_extract) {
6859 				/*
6860 				 * We're supposed to share with the original
6861 				 * provider so should not be COPY_SYMMETRIC.
6862 				 */
6863 				rc = KERN_INVALID_ARGUMENT;
6864 				goto done;
6865 			}
6866 			/*
6867 			 * Force an unrequested "copy-on-write" but only for
6868 			 * the range we're wiring.
6869 			 */
6870 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6871 			vm_map_clip_start(map, entry, s);
6872 			vm_map_clip_end(map, entry, end);
6873 			/* recompute "size" */
6874 			size = entry->vme_end - entry->vme_start;
6875 			/* make a shadow object */
6876 			vm_object_t orig_object;
6877 			vm_object_offset_t orig_offset;
6878 			orig_object = VME_OBJECT(entry);
6879 			orig_offset = VME_OFFSET(entry);
6880 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6881 			if (VME_OBJECT(entry) != orig_object) {
6882 				/*
6883 				 * This mapping has not been shared (or it would be
6884 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6885 				 * not been copied-on-write (or it would be marked
6886 				 * as "needs_copy" and would have been handled above
6887 				 * and also already write-protected).
6888 				 * We still need to write-protect here to prevent
6889 				 * other threads from modifying these pages while
6890 				 * we're in the process of copying and wiring
6891 				 * the copied pages.
6892 				 * Since the mapping is neither shared nor COWed,
6893 				 * we only need to write-protect the PTEs for this
6894 				 * mapping.
6895 				 */
6896 				vm_object_pmap_protect(orig_object,
6897 				    orig_offset,
6898 				    size,
6899 				    map->pmap,
6900 				    VM_MAP_PAGE_SIZE(map),
6901 				    entry->vme_start,
6902 				    entry->protection & ~VM_PROT_WRITE);
6903 			}
6904 		}
6905 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6906 			/*
6907 			 * Make the object COPY_DELAY to get a stable object
6908 			 * to wire.
6909 			 * That should avoid creating long shadow chains while
6910 			 * wiring/unwiring the same range repeatedly.
6911 			 * That also prevents part of the object from being
6912 			 * wired while another part is "needs_copy", which
6913 			 * could result in conflicting rules wrt copy-on-write.
6914 			 */
6915 			vm_object_t object;
6916 
6917 			object = VME_OBJECT(entry);
6918 			vm_object_lock(object);
6919 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6920 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
6921 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
6922 				    object, (uint64_t)object->vo_size,
6923 				    entry,
6924 				    (uint64_t)entry->vme_start,
6925 				    (uint64_t)entry->vme_end,
6926 				    (uint64_t)VME_OFFSET(entry),
6927 				    (uint64_t)size);
6928 				assertf(os_ref_get_count_raw(&object->ref_count) == 1,
6929 				    "object %p ref_count %d\n",
6930 				    object, os_ref_get_count_raw(&object->ref_count));
6931 				assertf(!entry->needs_copy,
6932 				    "entry %p\n", entry);
6933 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6934 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
6935 			}
6936 			vm_object_unlock(object);
6937 		}
6938 
6939 		vm_map_clip_start(map, entry, s);
6940 		vm_map_clip_end(map, entry, end);
6941 
6942 		/* re-compute "e" */
6943 		e = entry->vme_end;
6944 		if (e > end) {
6945 			e = end;
6946 		}
6947 
6948 		/*
6949 		 * Check for holes and protection mismatch.
6950 		 * Holes: Next entry should be contiguous unless this
6951 		 *	  is the end of the region.
6952 		 * Protection: Access requested must be allowed, unless
6953 		 *	wiring is by protection class
6954 		 */
6955 		if ((entry->vme_end < end) &&
6956 		    ((entry->vme_next == vm_map_to_entry(map)) ||
6957 		    (entry->vme_next->vme_start > entry->vme_end))) {
6958 			/* found a hole */
6959 			rc = KERN_INVALID_ADDRESS;
6960 			goto done;
6961 		}
6962 		if ((entry->protection & access_type) != access_type) {
6963 			/* found a protection problem */
6964 			rc = KERN_PROTECTION_FAILURE;
6965 			goto done;
6966 		}
6967 
6968 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6969 
6970 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6971 			goto done;
6972 		}
6973 
6974 		entry->in_transition = TRUE;
6975 
6976 		/*
6977 		 * This entry might get split once we unlock the map.
6978 		 * In vm_fault_wire(), we need the current range as
6979 		 * defined by this entry.  In order for this to work
6980 		 * along with a simultaneous clip operation, we make a
6981 		 * temporary copy of this entry and use that for the
6982 		 * wiring.  Note that the underlying objects do not
6983 		 * change during a clip.
6984 		 */
6985 		tmp_entry = *entry;
6986 
6987 		/*
6988 		 * The in_transition state guarantees that the entry
6989 		 * (or entries for this range, if a split occurred) will be
6990 		 * there when the map lock is acquired for the second time.
6991 		 */
6992 		vm_map_unlock(map);
6993 
6994 		if (!user_wire && cur_thread != THREAD_NULL) {
6995 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
6996 		} else {
6997 			interruptible_state = THREAD_UNINT;
6998 		}
6999 
7000 		if (map_pmap) {
7001 			rc = vm_fault_wire(map,
7002 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7003 			    physpage_p);
7004 		} else {
7005 			rc = vm_fault_wire(map,
7006 			    &tmp_entry, caller_prot, tag, map->pmap,
7007 			    tmp_entry.vme_start,
7008 			    physpage_p);
7009 		}
7010 
7011 		if (!user_wire && cur_thread != THREAD_NULL) {
7012 			thread_interrupt_level(interruptible_state);
7013 		}
7014 
7015 		vm_map_lock(map);
7016 
7017 		if (last_timestamp + 1 != map->timestamp) {
7018 			/*
7019 			 * Find the entry again.  It could have been clipped
7020 			 * after we unlocked the map.
7021 			 */
7022 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7023 			    &first_entry)) {
7024 				panic("vm_map_wire: re-lookup failed");
7025 			}
7026 
7027 			entry = first_entry;
7028 		}
7029 
7030 		last_timestamp = map->timestamp;
7031 
7032 		while ((entry != vm_map_to_entry(map)) &&
7033 		    (entry->vme_start < tmp_entry.vme_end)) {
7034 			assert(entry->in_transition);
7035 			entry->in_transition = FALSE;
7036 			if (entry->needs_wakeup) {
7037 				entry->needs_wakeup = FALSE;
7038 				need_wakeup = TRUE;
7039 			}
7040 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
7041 				subtract_wire_counts(map, entry, user_wire);
7042 			}
7043 			entry = entry->vme_next;
7044 		}
7045 
7046 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
7047 			goto done;
7048 		}
7049 
7050 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7051 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
7052 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7053 			/* found a "new" hole */
7054 			s = tmp_entry.vme_end;
7055 			rc = KERN_INVALID_ADDRESS;
7056 			goto done;
7057 		}
7058 
7059 		s = entry->vme_start;
7060 	} /* end while loop through map entries */
7061 
7062 done:
7063 	if (rc == KERN_SUCCESS) {
7064 		/* repair any damage we may have made to the VM map */
7065 		vm_map_simplify_range(map, start, end);
7066 	}
7067 
7068 	vm_map_unlock(map);
7069 
7070 	/*
7071 	 * wake up anybody waiting on entries we wired.
7072 	 */
7073 	if (need_wakeup) {
7074 		vm_map_entry_wakeup(map);
7075 	}
7076 
7077 	if (rc != KERN_SUCCESS) {
7078 		/* undo what has been wired so far */
7079 		vm_map_unwire_nested(map, start, s, user_wire,
7080 		    map_pmap, pmap_addr);
7081 		if (physpage_p) {
7082 			*physpage_p = 0;
7083 		}
7084 	}
7085 
7086 	return rc;
7087 }
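/*
 * vm_map_wire_nested() above follows a pattern used throughout this file:
 * clip the entry to the wired range, take the wire counts, mark the entry
 * "in_transition", copy it into "tmp_entry", drop the map lock around the
 * actual vm_fault_wire() work, then retake the lock and compare
 * "last_timestamp + 1" against map->timestamp to decide whether the entry
 * must be looked up again (it may have been clipped while unlocked) before
 * clearing "in_transition", waking any waiters, and rolling back the wire
 * counts on failure.
 */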
7088 
7089 static __attribute__((always_inline, warn_unused_result))
7090 kern_return_t
7091 vm_map_wire_sanitize(
7092 	vm_map_t                map,
7093 	vm_map_offset_ut        start_u,
7094 	vm_map_offset_ut        end_u,
7095 	vm_prot_ut              prot_u,
7096 	vm_sanitize_caller_t    vm_sanitize_caller,
7097 	vm_map_offset_t        *start,
7098 	vm_map_offset_t        *end,
7099 	vm_map_size_t          *size,
7100 	vm_prot_t              *prot)
7101 {
7102 	kern_return_t   kr;
7103 
7104 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7105 	    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7106 	    size);
7107 	if (__improbable(kr != KERN_SUCCESS)) {
7108 		return kr;
7109 	}
7110 
7111 	kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7112 	if (__improbable(kr != KERN_SUCCESS)) {
7113 		return kr;
7114 	}
7115 
7116 	return KERN_SUCCESS;
7117 }
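/*
 * In vm_map_wire_sanitize() above, the "_ut" parameters are the unsafe,
 * caller-provided values; they are converted exactly once into checked
 * vm_map_offset_t/vm_prot_t values before any wiring code looks at them.
 * VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS presumably lets a zero-length
 * request be reported as a success rather than an error.
 */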
7118 
7119 /*
7120  * Sanitizing wrapper: validate the inputs, then call vm_map_wire_nested().
7121  */
7122 kern_return_t
7123 vm_map_wire_impl(
7124 	vm_map_t                map,
7125 	vm_map_offset_ut        start_u,
7126 	vm_map_offset_ut        end_u,
7127 	vm_prot_ut              prot_u,
7128 	vm_tag_t                tag,
7129 	boolean_t               user_wire,
7130 	ppnum_t                *physpage_p,
7131 	vm_sanitize_caller_t    vm_sanitize_caller)
7132 {
7133 	vm_map_offset_t start, end;
7134 	vm_map_size_t   size;
7135 	vm_prot_t       prot;
7136 	kern_return_t   kr;
7137 
7138 	/*
7139 	 * Sanitize any input parameters that are addr/size/prot/inherit
7140 	 */
7141 	kr = vm_map_wire_sanitize(map,
7142 	    start_u,
7143 	    end_u,
7144 	    prot_u,
7145 	    vm_sanitize_caller,
7146 	    &start,
7147 	    &end,
7148 	    &size,
7149 	    &prot);
7150 	if (__improbable(kr != KERN_SUCCESS)) {
7151 		if (physpage_p) {
7152 			*physpage_p = 0;
7153 		}
7154 		return vm_sanitize_get_kr(kr);
7155 	}
7156 
7157 	return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7158 	           PMAP_NULL, 0, physpage_p);
7159 }
7160 
7161 kern_return_t
7162 vm_map_wire_external(
7163 	vm_map_t                map,
7164 	vm_map_offset_ut        start_u,
7165 	vm_map_offset_ut        end_u,
7166 	vm_prot_ut              prot_u,
7167 	boolean_t               user_wire)
7168 {
7169 	vm_tag_t tag = vm_tag_bt();
7170 
7171 	return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7172 }
7173 
7174 kern_return_t
7175 vm_map_wire_kernel(
7176 	vm_map_t                map,
7177 	vm_map_offset_ut        start_u,
7178 	vm_map_offset_ut        end_u,
7179 	vm_prot_ut              prot_u,
7180 	vm_tag_t                tag,
7181 	boolean_t               user_wire)
7182 {
7183 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7184 	           user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7185 }
7186 
7187 #if XNU_PLATFORM_MacOSX
7188 
7189 kern_return_t
7190 vm_map_wire_and_extract(
7191 	vm_map_t                map,
7192 	vm_map_offset_ut        start_u,
7193 	vm_prot_ut              prot_u,
7194 	boolean_t               user_wire,
7195 	ppnum_t                *physpage_p)
7196 {
7197 	vm_tag_t         tag    = vm_tag_bt();
7198 	vm_map_size_ut   size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7199 	vm_map_offset_ut end_u  = vm_sanitize_compute_ut_end(start_u, size_u);
7200 
7201 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7202 	           user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7203 }
7204 
7205 #endif /* XNU_PLATFORM_MacOSX */
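/*
 * Rough usage sketch for vm_map_wire_and_extract() (illustrative only,
 * not code from this file): a kernel caller that needs the physical page
 * backing a single page could do something like the following, assuming
 * "start_u" and "prot_u" are already-wrapped "_ut" values (e.g. produced
 * by a vm_sanitize_wrap_*() helper or arriving via MIG):
 *
 *	ppnum_t         phys_page;
 *	kern_return_t   kr;
 *
 *	kr = vm_map_wire_and_extract(map, start_u, prot_u,
 *	    FALSE, &phys_page);
 *	if (kr == KERN_SUCCESS && phys_page != 0) {
 *		... use phys_page, then unwire the page when done ...
 *	}
 */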
7206 
7207 static kern_return_t
7208 vm_map_unwire_nested(
7209 	vm_map_t                map,
7210 	vm_map_offset_t         start,
7211 	vm_map_offset_t         end,
7212 	boolean_t               user_wire,
7213 	pmap_t                  map_pmap,
7214 	vm_map_offset_t         pmap_addr)
7215 {
7216 	vm_map_entry_t          entry;
7217 	struct vm_map_entry     *first_entry, tmp_entry;
7218 	boolean_t               need_wakeup;
7219 	boolean_t               main_map = FALSE;
7220 	unsigned int            last_timestamp;
7221 
7222 	VM_MAP_RANGE_CHECK(map, start, end);
7223 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7224 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7225 
7226 	if (start == end) {
7227 		/* We unwired what the caller asked for: zero pages */
7228 		return KERN_SUCCESS;
7229 	}
7230 
7231 	vm_map_lock(map);
7232 	if (map_pmap == NULL) {
7233 		main_map = TRUE;
7234 	}
7235 	last_timestamp = map->timestamp;
7236 
7237 	if (vm_map_lookup_entry(map, start, &first_entry)) {
7238 		entry = first_entry;
7239 		/*
7240 		 * vm_map_clip_start will be done later.
7241 		 * We don't want to unnest any nested sub maps here !
7242 		 */
7243 	} else {
7244 		if (!user_wire) {
7245 			panic("vm_map_unwire: start not found");
7246 		}
7247 		/*	Start address is not in map. */
7248 		vm_map_unlock(map);
7249 		return KERN_INVALID_ADDRESS;
7250 	}
7251 
7252 	if (entry->superpage_size) {
7253 		/* superpages are always wired */
7254 		vm_map_unlock(map);
7255 		return KERN_INVALID_ADDRESS;
7256 	}
7257 
7258 	need_wakeup = FALSE;
7259 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7260 		if (entry->in_transition) {
7261 			/*
7262 			 * 1)
7263 			 * Another thread is wiring down this entry.  If the
7264 			 * entry were not being wired by that other thread, we
7265 			 * would be unwiring an unwired entry, which is not
7266 			 * permitted.  And if we wait for it, we will end up
7267 			 * unwiring memory we did not wire.
7268 			 *
7269 			 * 2)
7270 			 * Another thread is unwiring this entry.  We did not
7271 			 * have a reference to it, because if we did, this
7272 			 * entry would not be getting unwired now.
7273 			 */
7274 			if (!user_wire) {
7275 				/*
7276 				 * XXX FBDP
7277 				 * This could happen:  there could be some
7278 				 * overlapping vslock/vsunlock operations
7279 				 * going on.
7280 				 * We should probably just wait and retry,
7281 				 * but then we have to be careful that this
7282 				 * entry could get "simplified" after
7283 				 * "in_transition" gets unset and before
7284 				 * we re-lookup the entry, so we would
7285 				 * have to re-clip the entry to avoid
7286 				 * re-unwiring what we have already unwired...
7287 				 * See vm_map_wire_nested().
7288 				 *
7289 				 * Or we could just ignore "in_transition"
7290 				 * here and proceed to decrement the wired
7291 				 * count(s) on this entry.  That should be fine
7292 				 * as long as "wired_count" doesn't drop all
7293 				 * the way to 0 (and we should panic if THAT
7294 				 * happens).
7295 				 */
7296 				panic("vm_map_unwire: in_transition entry");
7297 			}
7298 
7299 			entry = entry->vme_next;
7300 			continue;
7301 		}
7302 
7303 		if (entry->is_sub_map) {
7304 			vm_map_offset_t sub_start;
7305 			vm_map_offset_t sub_end;
7306 			vm_map_offset_t local_end;
7307 			pmap_t          pmap;
7308 
7309 			vm_map_clip_start(map, entry, start);
7310 			vm_map_clip_end(map, entry, end);
7311 
7312 			sub_start = VME_OFFSET(entry);
7313 			sub_end = entry->vme_end - entry->vme_start;
7314 			sub_end += VME_OFFSET(entry);
7315 			local_end = entry->vme_end;
7316 			if (map_pmap == NULL) {
7317 				if (entry->use_pmap) {
7318 					pmap = VME_SUBMAP(entry)->pmap;
7319 					pmap_addr = sub_start;
7320 				} else {
7321 					pmap = map->pmap;
7322 					pmap_addr = start;
7323 				}
7324 				if (entry->wired_count == 0 ||
7325 				    (user_wire && entry->user_wired_count == 0)) {
7326 					if (!user_wire) {
7327 						panic("vm_map_unwire: entry is unwired");
7328 					}
7329 					entry = entry->vme_next;
7330 					continue;
7331 				}
7332 
7333 				/*
7334 				 * Check for holes
7335 				 * Holes: Next entry should be contiguous unless
7336 				 * this is the end of the region.
7337 				 */
7338 				if (((entry->vme_end < end) &&
7339 				    ((entry->vme_next == vm_map_to_entry(map)) ||
7340 				    (entry->vme_next->vme_start
7341 				    > entry->vme_end)))) {
7342 					if (!user_wire) {
7343 						panic("vm_map_unwire: non-contiguous region");
7344 					}
7345 /*
7346  *                                       entry = entry->vme_next;
7347  *                                       continue;
7348  */
7349 				}
7350 
7351 				subtract_wire_counts(map, entry, user_wire);
7352 
7353 				if (entry->wired_count != 0) {
7354 					entry = entry->vme_next;
7355 					continue;
7356 				}
7357 
7358 				entry->in_transition = TRUE;
7359 				tmp_entry = *entry;/* see comment in vm_map_wire() */
7360 
7361 				/*
7362 				 * We can unlock the map now. The in_transition state
7363 				 * guarantees existence of the entry.
7364 				 */
7365 				vm_map_unlock(map);
7366 				vm_map_unwire_nested(VME_SUBMAP(entry),
7367 				    sub_start, sub_end, user_wire, pmap, pmap_addr);
7368 				vm_map_lock(map);
7369 
7370 				if (last_timestamp + 1 != map->timestamp) {
7371 					/*
7372 					 * Find the entry again.  It could have been
7373 					 * clipped or deleted after we unlocked the map.
7374 					 */
7375 					if (!vm_map_lookup_entry(map,
7376 					    tmp_entry.vme_start,
7377 					    &first_entry)) {
7378 						if (!user_wire) {
7379 							panic("vm_map_unwire: re-lookup failed");
7380 						}
7381 						entry = first_entry->vme_next;
7382 					} else {
7383 						entry = first_entry;
7384 					}
7385 				}
7386 				last_timestamp = map->timestamp;
7387 
7388 				/*
7389 				 * clear transition bit for all constituent entries
7390 				 * that were in the original entry (saved in
7391 				 * tmp_entry).  Also check for waiters.
7392 				 */
7393 				while ((entry != vm_map_to_entry(map)) &&
7394 				    (entry->vme_start < tmp_entry.vme_end)) {
7395 					assert(entry->in_transition);
7396 					entry->in_transition = FALSE;
7397 					if (entry->needs_wakeup) {
7398 						entry->needs_wakeup = FALSE;
7399 						need_wakeup = TRUE;
7400 					}
7401 					entry = entry->vme_next;
7402 				}
7403 				continue;
7404 			} else {
7405 				tmp_entry = *entry;
7406 				vm_map_unlock(map);
7407 				vm_map_unwire_nested(VME_SUBMAP(entry),
7408 				    sub_start, sub_end, user_wire, map_pmap,
7409 				    pmap_addr);
7410 				vm_map_lock(map);
7411 
7412 				if (last_timestamp + 1 != map->timestamp) {
7413 					/*
7414 					 * Find the entry again.  It could have been
7415 					 * clipped or deleted after we unlocked the map.
7416 					 */
7417 					if (!vm_map_lookup_entry(map,
7418 					    tmp_entry.vme_start,
7419 					    &first_entry)) {
7420 						if (!user_wire) {
7421 							panic("vm_map_unwire: re-lookup failed");
7422 						}
7423 						entry = first_entry->vme_next;
7424 					} else {
7425 						entry = first_entry;
7426 					}
7427 				}
7428 				last_timestamp = map->timestamp;
7429 			}
7430 		}
7431 
7432 
7433 		if ((entry->wired_count == 0) ||
7434 		    (user_wire && entry->user_wired_count == 0)) {
7435 			if (!user_wire) {
7436 				panic("vm_map_unwire: entry is unwired");
7437 			}
7438 
7439 			entry = entry->vme_next;
7440 			continue;
7441 		}
7442 
7443 		assert(entry->wired_count > 0 &&
7444 		    (!user_wire || entry->user_wired_count > 0));
7445 
7446 		vm_map_clip_start(map, entry, start);
7447 		vm_map_clip_end(map, entry, end);
7448 
7449 		/*
7450 		 * Check for holes
7451 		 * Holes: Next entry should be contiguous unless
7452 		 *	  this is the end of the region.
7453 		 */
7454 		if (((entry->vme_end < end) &&
7455 		    ((entry->vme_next == vm_map_to_entry(map)) ||
7456 		    (entry->vme_next->vme_start > entry->vme_end)))) {
7457 			if (!user_wire) {
7458 				panic("vm_map_unwire: non-contiguous region");
7459 			}
7460 			entry = entry->vme_next;
7461 			continue;
7462 		}
7463 
7464 		subtract_wire_counts(map, entry, user_wire);
7465 
7466 		if (entry->wired_count != 0) {
7467 			entry = entry->vme_next;
7468 			continue;
7469 		}
7470 
7471 		if (entry->zero_wired_pages) {
7472 			entry->zero_wired_pages = FALSE;
7473 		}
7474 
7475 		entry->in_transition = TRUE;
7476 		tmp_entry = *entry;     /* see comment in vm_map_wire() */
7477 
7478 		/*
7479 		 * We can unlock the map now. The in_transition state
7480 		 * guarantees existence of the entry.
7481 		 */
7482 		vm_map_unlock(map);
7483 		if (map_pmap) {
7484 			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7485 			    pmap_addr, tmp_entry.vme_end);
7486 		} else {
7487 			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7488 			    tmp_entry.vme_start, tmp_entry.vme_end);
7489 		}
7490 		vm_map_lock(map);
7491 
7492 		if (last_timestamp + 1 != map->timestamp) {
7493 			/*
7494 			 * Find the entry again.  It could have been clipped
7495 			 * or deleted after we unlocked the map.
7496 			 */
7497 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7498 			    &first_entry)) {
7499 				if (!user_wire) {
7500 					panic("vm_map_unwire: re-lookup failed");
7501 				}
7502 				entry = first_entry->vme_next;
7503 			} else {
7504 				entry = first_entry;
7505 			}
7506 		}
7507 		last_timestamp = map->timestamp;
7508 
7509 		/*
7510 		 * clear transition bit for all constituent entries that
7511 		 * were in the original entry (saved in tmp_entry).  Also
7512 		 * check for waiters.
7513 		 */
7514 		while ((entry != vm_map_to_entry(map)) &&
7515 		    (entry->vme_start < tmp_entry.vme_end)) {
7516 			assert(entry->in_transition);
7517 			entry->in_transition = FALSE;
7518 			if (entry->needs_wakeup) {
7519 				entry->needs_wakeup = FALSE;
7520 				need_wakeup = TRUE;
7521 			}
7522 			entry = entry->vme_next;
7523 		}
7524 	}
7525 
7526 	/*
7527 	 * We might have fragmented the address space when we wired this
7528 	 * range of addresses.  Attempt to re-coalesce these VM map entries
7529 	 * with their neighbors now that they're no longer wired.
7530 	 * Under some circumstances, address space fragmentation can
7531 	 * prevent VM object shadow chain collapsing, which can cause
7532 	 * swap space leaks.
7533 	 */
7534 	vm_map_simplify_range(map, start, end);
7535 
7536 	vm_map_unlock(map);
7537 	/*
7538 	 * wake up anybody waiting on entries that we have unwired.
7539 	 */
7540 	if (need_wakeup) {
7541 		vm_map_entry_wakeup(map);
7542 	}
7543 	return KERN_SUCCESS;
7544 }
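/*
 * Note that the error handling in vm_map_unwire_nested() above is
 * deliberately asymmetric: when a user-level unwire (user_wire == TRUE)
 * hits an entry that is not wired, a hole, or a failed re-lookup, the
 * entry is skipped or an error is returned, but the same conditions on a
 * kernel-initiated unwire indicate a bug in the caller and panic instead.
 */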
7545 
7546 kern_return_t
7547 vm_map_unwire(
7548 	vm_map_t                map,
7549 	vm_map_offset_ut        start_u,
7550 	vm_map_offset_ut        end_u,
7551 	boolean_t               user_wire)
7552 {
7553 	return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7554 	           VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7555 }
7556 
7557 static __attribute__((always_inline, warn_unused_result))
7558 kern_return_t
7559 vm_map_unwire_sanitize(
7560 	vm_map_t                map,
7561 	vm_map_offset_ut        start_u,
7562 	vm_map_offset_ut        end_u,
7563 	vm_sanitize_caller_t    vm_sanitize_caller,
7564 	vm_map_offset_t        *start,
7565 	vm_map_offset_t        *end,
7566 	vm_map_size_t          *size)
7567 {
7568 	return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7569 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7570 	           size);
7571 }
7572 
7573 kern_return_t
7574 vm_map_unwire_impl(
7575 	vm_map_t                map,
7576 	vm_map_offset_ut        start_u,
7577 	vm_map_offset_ut        end_u,
7578 	boolean_t               user_wire,
7579 	vm_sanitize_caller_t    vm_sanitize_caller)
7580 {
7581 	vm_map_offset_t start, end;
7582 	vm_map_size_t   size;
7583 	kern_return_t   kr;
7584 
7585 	/*
7586 	 * Sanitize any input parameters that are addr/size/prot/inherit
7587 	 */
7588 	kr = vm_map_unwire_sanitize(
7589 		map,
7590 		start_u,
7591 		end_u,
7592 		vm_sanitize_caller,
7593 		&start,
7594 		&end,
7595 		&size);
7596 	if (__improbable(kr != KERN_SUCCESS)) {
7597 		return vm_sanitize_get_kr(kr);
7598 	}
7599 
7600 	return vm_map_unwire_nested(map, start, end,
7601 	           user_wire, (pmap_t)NULL, 0);
7602 }
7603 
7604 
7605 /*
7606  *	vm_map_entry_zap:	[ internal use only ]
7607  *
7608  *	Remove the entry from the target map
7609  *	and put it on a zap list.
7610  */
7611 static void
7612 vm_map_entry_zap(
7613 	vm_map_t                map,
7614 	vm_map_entry_t          entry,
7615 	vm_map_zap_t            zap)
7616 {
7617 	vm_map_offset_t s, e;
7618 
7619 	s = entry->vme_start;
7620 	e = entry->vme_end;
7621 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7622 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7623 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7624 		assert(page_aligned(s));
7625 		assert(page_aligned(e));
7626 	}
7627 	if (entry->map_aligned == TRUE) {
7628 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7629 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7630 	}
7631 	assert(entry->wired_count == 0);
7632 	assert(entry->user_wired_count == 0);
7633 	assert(!entry->vme_permanent);
7634 
7635 	vm_map_store_entry_unlink(map, entry, false);
7636 	map->size -= e - s;
7637 
7638 	vm_map_zap_append(zap, entry);
7639 }
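/*
 * vm_map_entry_zap() only unlinks the entry and adjusts the map size;
 * freeing the entry (and dropping its object or submap reference) is left
 * to whoever consumes the zap list, typically after the map lock has been
 * dropped.
 */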
7640 
7641 static void
7642 vm_map_submap_pmap_clean(
7643 	vm_map_t        map,
7644 	vm_map_offset_t start,
7645 	vm_map_offset_t end,
7646 	vm_map_t        sub_map,
7647 	vm_map_offset_t offset)
7648 {
7649 	vm_map_offset_t submap_start;
7650 	vm_map_offset_t submap_end;
7651 	vm_map_size_t   remove_size;
7652 	vm_map_entry_t  entry;
7653 
7654 	submap_end = offset + (end - start);
7655 	submap_start = offset;
7656 
7657 	vm_map_lock_read(sub_map);
7658 	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7659 		remove_size = (entry->vme_end - entry->vme_start);
7660 		if (offset > entry->vme_start) {
7661 			remove_size -= offset - entry->vme_start;
7662 		}
7663 
7664 
7665 		if (submap_end < entry->vme_end) {
7666 			remove_size -=
7667 			    entry->vme_end - submap_end;
7668 		}
7669 		if (entry->is_sub_map) {
7670 			vm_map_submap_pmap_clean(
7671 				sub_map,
7672 				start,
7673 				start + remove_size,
7674 				VME_SUBMAP(entry),
7675 				VME_OFFSET(entry));
7676 		} else {
7677 			if (map->mapped_in_other_pmaps &&
7678 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7679 			    VME_OBJECT(entry) != NULL) {
7680 				vm_object_pmap_protect_options(
7681 					VME_OBJECT(entry),
7682 					(VME_OFFSET(entry) +
7683 					offset -
7684 					entry->vme_start),
7685 					remove_size,
7686 					PMAP_NULL,
7687 					PAGE_SIZE,
7688 					entry->vme_start,
7689 					VM_PROT_NONE,
7690 					PMAP_OPTIONS_REMOVE);
7691 			} else {
7692 				pmap_remove(map->pmap,
7693 				    (addr64_t)start,
7694 				    (addr64_t)(start + remove_size));
7695 			}
7696 		}
7697 	}
7698 
7699 	entry = entry->vme_next;
7700 
7701 	while ((entry != vm_map_to_entry(sub_map))
7702 	    && (entry->vme_start < submap_end)) {
7703 		remove_size = (entry->vme_end - entry->vme_start);
7704 		if (submap_end < entry->vme_end) {
7705 			remove_size -= entry->vme_end - submap_end;
7706 		}
7707 		if (entry->is_sub_map) {
7708 			vm_map_submap_pmap_clean(
7709 				sub_map,
7710 				(start + entry->vme_start) - offset,
7711 				((start + entry->vme_start) - offset) + remove_size,
7712 				VME_SUBMAP(entry),
7713 				VME_OFFSET(entry));
7714 		} else {
7715 			if (map->mapped_in_other_pmaps &&
7716 			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7717 			    VME_OBJECT(entry) != NULL) {
7718 				vm_object_pmap_protect_options(
7719 					VME_OBJECT(entry),
7720 					VME_OFFSET(entry),
7721 					remove_size,
7722 					PMAP_NULL,
7723 					PAGE_SIZE,
7724 					entry->vme_start,
7725 					VM_PROT_NONE,
7726 					PMAP_OPTIONS_REMOVE);
7727 			} else {
7728 				pmap_remove(map->pmap,
7729 				    (addr64_t)((start + entry->vme_start)
7730 				    - offset),
7731 				    (addr64_t)(((start + entry->vme_start)
7732 				    - offset) + remove_size));
7733 			}
7734 		}
7735 		entry = entry->vme_next;
7736 	}
7737 	vm_map_unlock_read(sub_map);
7738 	return;
7739 }
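/*
 * In both loops of vm_map_submap_pmap_clean() above, the choice between
 * vm_object_pmap_protect_options(... PMAP_OPTIONS_REMOVE) and a plain
 * pmap_remove() depends on whether the parent map may be mapped in other
 * pmaps: in that case the removal has to go through the VM object so that
 * every pmap mapping these pages gets cleaned, not just map->pmap.
 */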
7740 
7741 /*
7742  *     virt_memory_guard_ast:
7743  *
7744  *     Handle the AST callout for a virtual memory guard.
7745  *	   raise an EXC_GUARD exception and terminate the task
7746  *     if configured to do so.
7747  */
7748 void
7749 virt_memory_guard_ast(
7750 	thread_t thread,
7751 	mach_exception_data_type_t code,
7752 	mach_exception_data_type_t subcode)
7753 {
7754 	task_t task = get_threadtask(thread);
7755 	assert(task != kernel_task);
7756 	assert(task == current_task());
7757 	kern_return_t sync_exception_result;
7758 	uint32_t behavior;
7759 
7760 	behavior = task->task_exc_guard;
7761 
7762 	/* Is delivery enabled */
7763 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7764 		return;
7765 	}
7766 
7767 	/* If only once, make sure we're that once */
7768 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7769 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7770 
7771 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7772 			break;
7773 		}
7774 		behavior = task->task_exc_guard;
7775 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7776 			return;
7777 		}
7778 	}
7779 
7780 	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7781 	/* Raise exception synchronously and see if handler claimed it */
7782 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7783 
7784 	if (fatal) {
7785 		/*
7786 		 * If Synchronous EXC_GUARD delivery was successful then
7787 		 * kill the process and return, else kill the process
7788 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
7789 		 */
7790 
7791 
7792 		int flags = PX_DEBUG_NO_HONOR;
7793 		exception_info_t info = {
7794 			.os_reason = OS_REASON_GUARD,
7795 			.exception_type = EXC_GUARD,
7796 			.mx_code = code,
7797 			.mx_subcode = subcode
7798 		};
7799 
7800 		if (sync_exception_result == KERN_SUCCESS) {
7801 			flags |= PX_PSIGNAL;
7802 		}
7803 		exit_with_mach_exception(current_proc(), info, flags);
7804 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7805 		/*
7806 		 * If the synchronous EXC_GUARD delivery was not successful,
7807 		 * raise a simulated crash.
7808 		 */
7809 		if (sync_exception_result != KERN_SUCCESS) {
7810 			task_violated_guard(code, subcode, NULL, FALSE);
7811 		}
7812 	}
7813 }
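/*
 * Summary of the task_exc_guard bits consulted by virt_memory_guard_ast():
 * TASK_EXC_GUARD_VM_DELIVER gates delivery entirely, TASK_EXC_GUARD_VM_ONCE
 * disarms delivery after the first guard exception, TASK_EXC_GUARD_VM_FATAL
 * turns the violation into process termination, and TASK_EXC_GUARD_VM_CORPSE
 * falls back to a simulated crash when the synchronous exception was not
 * claimed by a handler.
 */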
7814 
7815 /*
7816  *     vm_map_guard_exception:
7817  *
7818  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7819  *
7820  *     Right now, we do this when we find nothing mapped, or a
7821  *     gap in the mapping when a user address space deallocate
7822  *     was requested. We report the address of the first gap found.
7823  */
7824 static void
7825 vm_map_guard_exception(
7826 	vm_map_offset_t gap_start,
7827 	unsigned reason)
7828 {
7829 	mach_exception_code_t code = 0;
7830 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7831 	unsigned int target = 0; /* should we pass in pid associated with map? */
7832 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7833 	boolean_t fatal = FALSE;
7834 
7835 	task_t task = current_task_early();
7836 
7837 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7838 	if (task == NULL || task == kernel_task) {
7839 		return;
7840 	}
7841 
7842 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7843 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7844 	EXC_GUARD_ENCODE_TARGET(code, target);
7845 
7846 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7847 		fatal = TRUE;
7848 	}
7849 	thread_guard_violation(current_thread(), code, subcode, fatal);
7850 }
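/*
 * The resulting EXC_GUARD code carries GUARD_TYPE_VIRT_MEMORY as its type,
 * the caller-supplied reason as its flavor and a (currently unused) target,
 * while the subcode carries the address of the first gap found; whether the
 * violation is fatal comes from the task's task_exc_guard settings.
 */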
7851 
7852 static kern_return_t
7853 vm_map_delete_submap_recurse(
7854 	vm_map_t submap,
7855 	vm_map_offset_t submap_start,
7856 	vm_map_offset_t submap_end)
7857 {
7858 	vm_map_entry_t submap_entry;
7859 
7860 	/*
7861 	 * Verify that the submap does not contain any "permanent" entries
7862 	 * within the specified range. We permit TPRO ranges to be overwritten
7863 	 * as we only reach this path if TPRO const protection is disabled for a
7864 	 * given map.
7865 	 *
7866 	 * We do not care about gaps.
7867 	 */
7868 
7869 	vm_map_lock(submap);
7870 
7871 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7872 		submap_entry = submap_entry->vme_next;
7873 	}
7874 
7875 	for (;
7876 	    submap_entry != vm_map_to_entry(submap) &&
7877 	    submap_entry->vme_start < submap_end;
7878 	    submap_entry = submap_entry->vme_next) {
7879 		if (submap_entry->vme_permanent
7880 #ifdef __arm64e__
7881 		    /* allow TPRO submap entries to be overwritten */
7882 		    && !submap_entry->used_for_tpro
7883 #endif
7884 		    ) {
7885 			/* "permanent" entry -> fail */
7886 			vm_map_unlock(submap);
7887 			return KERN_PROTECTION_FAILURE;
7888 		}
7889 	}
7890 	/* no "permanent" entries in the range -> success */
7891 	vm_map_unlock(submap);
7892 	return KERN_SUCCESS;
7893 }
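/*
 * vm_map_delete_submap_recurse() is a pre-flight check: it does not remove
 * anything from the submap itself, it only lets vm_map_delete() veto the
 * removal of a "permanent" parent entry when the nested range still
 * contains permanent (non-TPRO) mappings.
 */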
7894 
7895 __abortlike
7896 static void
7897 __vm_map_delete_misaligned_panic(
7898 	vm_map_t                map,
7899 	vm_map_offset_t         start,
7900 	vm_map_offset_t         end)
7901 {
7902 	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
7903 	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
7904 }
7905 
7906 __abortlike
7907 static void
7908 __vm_map_delete_failed_panic(
7909 	vm_map_t                map,
7910 	vm_map_offset_t         start,
7911 	vm_map_offset_t         end,
7912 	kern_return_t           kr)
7913 {
7914 	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
7915 	    map, (uint64_t)start, (uint64_t)end, kr);
7916 }
7917 
7918 __abortlike
7919 static void
7920 __vm_map_delete_gap_panic(
7921 	vm_map_t                map,
7922 	vm_map_offset_t         where,
7923 	vm_map_offset_t         start,
7924 	vm_map_offset_t         end)
7925 {
7926 	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
7927 	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
7928 }
7929 
7930 __abortlike
7931 static void
7932 __vm_map_delete_permanent_panic(
7933 	vm_map_t                map,
7934 	vm_map_offset_t         start,
7935 	vm_map_offset_t         end,
7936 	vm_map_entry_t          entry)
7937 {
7938 	panic("vm_map_delete(%p,0x%llx,0x%llx): "
7939 	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
7940 	    map, (uint64_t)start, (uint64_t)end, entry,
7941 	    (uint64_t)entry->vme_start,
7942 	    (uint64_t)entry->vme_end);
7943 }
7944 
7945 __options_decl(vm_map_delete_state_t, uint32_t, {
7946 	VMDS_NONE               = 0x0000,
7947 
7948 	VMDS_FOUND_GAP          = 0x0001,
7949 	VMDS_GAPS_OK            = 0x0002,
7950 
7951 	VMDS_KERNEL_PMAP        = 0x0004,
7952 	VMDS_NEEDS_LOOKUP       = 0x0008,
7953 	VMDS_NEEDS_WAKEUP       = 0x0010,
7954 	VMDS_KERNEL_KMEMPTR     = 0x0020
7955 });
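/*
 * Rough meaning of the vm_map_delete_state_t bits as used below:
 * VMDS_GAPS_OK is set when the map is being torn down and holes in the
 * range are tolerated, VMDS_FOUND_GAP records that a hole was found,
 * VMDS_KERNEL_PMAP marks a deletion in the kernel map (where gaps and
 * permanent entries panic), VMDS_NEEDS_LOOKUP forces a fresh entry lookup
 * after the map lock was dropped, VMDS_NEEDS_WAKEUP remembers that waiters
 * must be woken, and VMDS_KERNEL_KMEMPTR identifies kernel pointer ranges
 * that need kmem slot validation.
 */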
7956 
7957 /*
7958  * vm_map_clamp_to_pmap(map, start, end)
7959  *
7960  * Modify *start and *end so they fall within the bounds of map->pmap.
7961  */
7962 #if MACH_ASSERT
7963 static void
7964 vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
7965 {
7966 	vm_map_address_t min;
7967 	vm_map_address_t max;
7968 
7969 #if __x86_64__
7970 	/* x86_64 struct pmap does not have min and max fields */
7971 	if (map->pmap == kernel_pmap) {
7972 		min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
7973 		max = VM_MAX_KERNEL_ADDRESS;
7974 	} else {
7975 		min = VM_MAP_MIN_ADDRESS;
7976 		max = VM_MAP_MAX_ADDRESS;
7977 	}
7978 #else
7979 	min = map->pmap->min;
7980 	max = map->pmap->max;
7981 #endif
7982 
7983 	if (*start < min) {
7984 		*start = min;
7985 	} else if (*start > max) {
7986 		*start = max;
7987 	}
7988 	if (*end < min) {
7989 		*end = min;
7990 	} else if (*end > max) {
7991 		*end = max;
7992 	}
7993 }
7994 #endif
7995 
7996 int vm_log_map_delete_permanent_prot_none = 0;
7997 /*
7998  *	vm_map_delete:	[ internal use only ]
7999  *
8000  *	Deallocates the given address range from the target map.
8001  *	Removes all user wirings. Unwires one kernel wiring if
8002  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
8003  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
8004  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8005  *
8006  *
8007  *	When the map is a kernel map, then any error in removing mappings
8008  *	will lead to a panic so that clients do not have to repeat the panic
8009  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
8010  *	is also passed, then KERN_ABORTED will not lead to a panic.
8011  *
8012  *	This routine is called with map locked and leaves map locked.
8013  */
8014 static kmem_return_t
8015 vm_map_delete(
8016 	vm_map_t                map,
8017 	vm_map_offset_t         start,
8018 	vm_map_offset_t         end,
8019 	vmr_flags_t             flags,
8020 	kmem_guard_t            guard,
8021 	vm_map_zap_t            zap_list)
8022 {
8023 	vm_map_entry_t          entry, next;
8024 	int                     interruptible;
8025 	vm_map_offset_t         gap_start = 0;
8026 	vm_map_offset_t         clear_in_transition_end = 0;
8027 	__unused vm_map_offset_t save_start = start;
8028 	__unused vm_map_offset_t save_end = end;
8029 	vm_map_delete_state_t   state = VMDS_NONE;
8030 	kmem_return_t           ret = { };
8031 	vm_map_range_id_t       range_id = 0;
8032 	struct kmem_page_meta  *meta = NULL;
8033 	uint32_t                size_idx, slot_idx;
8034 	struct mach_vm_range    slot;
8035 
8036 	if (vm_map_pmap(map) == kernel_pmap) {
8037 		state |= VMDS_KERNEL_PMAP;
8038 		range_id = kmem_addr_get_range(start, end - start);
8039 		if (kmem_is_ptr_range(range_id)) {
8040 			state |= VMDS_KERNEL_KMEMPTR;
8041 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8042 			    &size_idx, &slot);
8043 		}
8044 	}
8045 
8046 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8047 		state |= VMDS_GAPS_OK;
8048 	}
8049 
8050 	if (map->corpse_source &&
8051 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8052 	    !map->terminated) {
8053 		/*
8054 		 * The map is being used for corpse-related diagnostics.
8055 		 * So skip any entry removal to avoid perturbing the map state.
8056 		 * The cleanup will happen in task_terminate_internal after the
8057 		 * call to task_port_no_senders.
8058 		 */
8059 		goto out;
8060 	}
8061 
8062 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8063 	    THREAD_ABORTSAFE : THREAD_UNINT;
8064 
8065 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8066 	    (start & VM_MAP_PAGE_MASK(map))) {
8067 		__vm_map_delete_misaligned_panic(map, start, end);
8068 	}
8069 
8070 	if ((state & VMDS_GAPS_OK) == 0) {
8071 		/*
8072 		 * If the map isn't terminated then all deletions must have
8073 		 * no gaps, and be within the [min, max) of the map.
8074 		 *
8075 		 * We got here without VM_MAP_RANGE_CHECK() being called,
8076 		 * and hence must validate bounds manually.
8077 		 *
8078 		 * It is worth noting that because vm_deallocate() will
8079 		 * round_page() the deallocation size, it's possible for "end"
8080 		 * to be 0 here due to overflow. We hence must treat it as being
8081 		 * beyond vm_map_max(map).
8082 		 *
8083 		 * Similarly, end < start means some wrap-around happened,
8084 		 * which should cause an error or panic.
8085 		 */
8086 		if (end == 0 || end > vm_map_max(map)) {
8087 			state |= VMDS_FOUND_GAP;
8088 			gap_start = vm_map_max(map);
8089 			if (state & VMDS_KERNEL_PMAP) {
8090 				__vm_map_delete_gap_panic(map,
8091 				    gap_start, start, end);
8092 			}
8093 			goto out;
8094 		}
8095 
8096 		if (end < start) {
8097 			if (state & VMDS_KERNEL_PMAP) {
8098 				__vm_map_delete_gap_panic(map,
8099 				    vm_map_max(map), start, end);
8100 			}
8101 			ret.kmr_return = KERN_INVALID_ARGUMENT;
8102 			goto out;
8103 		}
8104 
8105 		if (start < vm_map_min(map)) {
8106 			state |= VMDS_FOUND_GAP;
8107 			gap_start = start;
8108 			if (state & VMDS_KERNEL_PMAP) {
8109 				__vm_map_delete_gap_panic(map,
8110 				    gap_start, start, end);
8111 			}
8112 			goto out;
8113 		}
8114 	} else {
8115 		/*
8116 		 * If the map is terminated, we must accept start/end
8117 		 * being beyond the boundaries of the map as this is
8118 		 * how some of the mappings like commpage mappings
8119 		 * can be destroyed (they're outside of those bounds).
8120 		 *
8121 		 * end < start is still something we can't cope with,
8122 		 * so just bail.
8123 		 */
8124 		if (end < start) {
8125 			goto out;
8126 		}
8127 	}
8128 
8129 
8130 	/*
8131 	 *	Find the start of the region.
8132 	 *
8133 	 *	If in a superpage, extend the range
8134 	 *	to include the start of the mapping.
8135 	 */
8136 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8137 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8138 			start = SUPERPAGE_ROUND_DOWN(start);
8139 		} else {
8140 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8141 			break;
8142 		}
8143 	}
8144 
8145 	if (entry->superpage_size) {
8146 		end = SUPERPAGE_ROUND_UP(end);
8147 	}
8148 
8149 	/*
8150 	 *	Step through all entries in this region
8151 	 */
8152 	for (vm_map_offset_t s = start; s < end;) {
8153 		/*
8154 		 * At this point, we have deleted all the memory entries
8155 		 * in [start, s) and are proceeding with the [s, end) range.
8156 		 *
8157 		 * This loop might drop the map lock, and it is possible that
8158 		 * some memory was already reallocated within [start, s)
8159 		 * and we don't want to mess with those entries.
8160 		 *
8161 		 * Some of those entries could even have been re-assembled
8162 		 * with an entry after "s" (in vm_map_simplify_entry()), so
8163 		 * we may have to vm_map_clip_start() again.
8164 		 *
8165 		 * When clear_in_transition_end is set, we had marked
8166 		 * [start, clear_in_transition_end) as "in_transition"
8167 		 * during a previous iteration and we need to clear it.
8168 		 */
8169 
8170 		/*
8171 		 * Step 1: If needed (because we dropped locks),
8172 		 *         lookup the entry again.
8173 		 *
8174 		 *         If we're coming back from unwiring (Step 5),
8175 		 *         we also need to mark the entries as no longer
8176 		 *         in transition after that.
8177 		 */
8178 
8179 		if (state & VMDS_NEEDS_LOOKUP) {
8180 			state &= ~VMDS_NEEDS_LOOKUP;
8181 
8182 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8183 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8184 			}
8185 
8186 			if (state & VMDS_KERNEL_KMEMPTR) {
8187 				kmem_validate_slot(s, meta, size_idx, slot_idx);
8188 			}
8189 		}
8190 
8191 		if (clear_in_transition_end) {
8192 			for (vm_map_entry_t it = entry;
8193 			    it != vm_map_to_entry(map) &&
8194 			    it->vme_start < clear_in_transition_end;
8195 			    it = it->vme_next) {
8196 				assert(it->in_transition);
8197 				it->in_transition = FALSE;
8198 				if (it->needs_wakeup) {
8199 					it->needs_wakeup = FALSE;
8200 					state |= VMDS_NEEDS_WAKEUP;
8201 				}
8202 			}
8203 
8204 			clear_in_transition_end = 0;
8205 		}
8206 
8207 
8208 		/*
8209 		 * Step 2: Perform various policy checks
8210 		 *         before we do _anything_ to this entry.
8211 		 */
8212 
8213 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8214 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8215 				/*
8216 				 * Either we found a gap already,
8217 				 * or we are tearing down a map,
8218 				 * keep going.
8219 				 */
8220 			} else if (state & VMDS_KERNEL_PMAP) {
8221 				__vm_map_delete_gap_panic(map, s, start, end);
8222 			} else if (s < end) {
8223 				state |= VMDS_FOUND_GAP;
8224 				gap_start = s;
8225 			}
8226 
8227 			if (entry == vm_map_to_entry(map) ||
8228 			    end <= entry->vme_start) {
8229 				break;
8230 			}
8231 
8232 			s = entry->vme_start;
8233 		}
8234 
8235 		if (state & VMDS_KERNEL_PMAP) {
8236 			/*
8237 			 * In the kernel map and its submaps,
8238 			 * permanent entries never die, even
8239 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8240 			 */
8241 			if (entry->vme_permanent) {
8242 				__vm_map_delete_permanent_panic(map, start, end, entry);
8243 			}
8244 
8245 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8246 				end = entry->vme_end;
8247 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8248 			}
8249 
8250 			/*
8251 			 * In the kernel map and its submaps,
8252 			 * the removal of an atomic/guarded entry is strict.
8253 			 *
8254 			 * An atomic entry is processed only if it was
8255 			 * specifically targeted.
8256 			 *
8257 			 * We might have deleted non-atomic entries before
8258 			 * we reach this point, however...
8259 			 */
8260 			kmem_entry_validate_guard(map, entry,
8261 			    start, end - start, guard);
8262 		}
8263 
8264 		/*
8265 		 * Step 2.1: handle "permanent" and "submap" entries
8266 		 * *before* clipping to avoid triggering some unnecessary
8267 		 * un-nesting of the shared region.
8268 		 */
8269 		if (entry->vme_permanent && entry->is_sub_map) {
8270 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8271 			/*
8272 			 * Un-mapping a "permanent" mapping of a user-space
8273 			 * submap is not allowed unless...
8274 			 */
8275 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8276 				/*
8277 				 * a. explicitly requested by the kernel caller.
8278 				 */
8279 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8280 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8281 			    developer_mode_state()) {
8282 				/*
8283 				 * b. we're in "developer" mode (for
8284 				 *    breakpoints, dtrace probes, ...).
8285 				 */
8286 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8287 			} else if (map->terminated) {
8288 				/*
8289 				 * c. this is the final address space cleanup.
8290 				 */
8291 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8292 			} else {
8293 				vm_map_offset_t submap_start, submap_end;
8294 				kern_return_t submap_kr;
8295 
8296 				/*
8297 				 * Check if there are any "permanent" mappings
8298 				 * in this range in the submap.
8299 				 */
8300 				if (entry->in_transition) {
8301 					/* can that even happen ? */
8302 					goto in_transition;
8303 				}
8304 				/* compute the clipped range in the submap */
8305 				submap_start = s - entry->vme_start;
8306 				submap_start += VME_OFFSET(entry);
8307 				submap_end = end - entry->vme_start;
8308 				submap_end += VME_OFFSET(entry);
8309 				submap_kr = vm_map_delete_submap_recurse(
8310 					VME_SUBMAP(entry),
8311 					submap_start,
8312 					submap_end);
8313 				if (submap_kr != KERN_SUCCESS) {
8314 					/*
8315 					 * There are some "permanent" mappings
8316 					 * in the submap: we are not allowed
8317 					 * to remove this range.
8318 					 */
8319 					printf("%d[%s] removing permanent submap entry "
8320 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8321 					    proc_selfpid(),
8322 					    (get_bsdtask_info(current_task())
8323 					    ? proc_name_address(get_bsdtask_info(current_task()))
8324 					    : "?"), entry,
8325 					    (uint64_t)entry->vme_start,
8326 					    (uint64_t)entry->vme_end,
8327 					    entry->protection,
8328 					    entry->max_protection);
8329 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8330 					    vm_map_entry_t, entry,
8331 					    vm_map_offset_t, entry->vme_start,
8332 					    vm_map_offset_t, entry->vme_end,
8333 					    vm_prot_t, entry->protection,
8334 					    vm_prot_t, entry->max_protection,
8335 					    int, VME_ALIAS(entry));
8336 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8337 					goto out;
8338 				}
8339 				/* no permanent mappings: proceed */
8340 			}
8341 		}
8342 
8343 		/*
8344 		 * Step 3: Perform any clipping needed.
8345 		 *
8346 		 *         After this, "entry" starts at "s", ends before "end"
8347 		 */
8348 
8349 		if (entry->vme_start < s) {
8350 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8351 			    entry->map_aligned &&
8352 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8353 				/*
8354 				 * The entry will no longer be map-aligned
8355 				 * after clipping and the caller said it's OK.
8356 				 */
8357 				entry->map_aligned = FALSE;
8358 			}
8359 			vm_map_clip_start(map, entry, s);
8360 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8361 		}
8362 
8363 		if (end < entry->vme_end) {
8364 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8365 			    entry->map_aligned &&
8366 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8367 				/*
8368 				 * The entry will no longer be map-aligned
8369 				 * after clipping and the caller said it's OK.
8370 				 */
8371 				entry->map_aligned = FALSE;
8372 			}
8373 			vm_map_clip_end(map, entry, end);
8374 		}
8375 
8376 		if (entry->vme_permanent && entry->is_sub_map) {
8377 			/*
8378 			 * We already went through step 2.1 which did not deny
8379 			 * the removal of this "permanent" and "is_sub_map"
8380 			 * entry.
8381 			 * Now that we've clipped what we actually want to
8382 			 * delete, undo the "permanent" part to allow the
8383 			 * removal to proceed.
8384 			 */
8385 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8386 			    vm_map_entry_t, entry,
8387 			    vm_map_offset_t, entry->vme_start,
8388 			    vm_map_offset_t, entry->vme_end,
8389 			    vm_prot_t, entry->protection,
8390 			    vm_prot_t, entry->max_protection,
8391 			    int, VME_ALIAS(entry));
8392 			entry->vme_permanent = false;
8393 		}
8394 
8395 		assert(s == entry->vme_start);
8396 		assert(entry->vme_end <= end);
8397 
8398 
8399 		/*
8400 		 * Step 4: If the entry is in flux, wait for this to resolve.
8401 		 */
8402 
8403 		if (entry->in_transition) {
8404 			wait_result_t wait_result;
8405 
8406 in_transition:
8407 			/*
8408 			 * Another thread is wiring/unwiring this entry.
8409 			 * Let the other thread know we are waiting.
8410 			 */
8411 
8412 			entry->needs_wakeup = TRUE;
8413 
8414 			/*
8415 			 * wake up anybody waiting on entries that we have
8416 			 * already unwired/deleted.
8417 			 */
8418 			if (state & VMDS_NEEDS_WAKEUP) {
8419 				vm_map_entry_wakeup(map);
8420 				state &= ~VMDS_NEEDS_WAKEUP;
8421 			}
8422 
8423 			wait_result = vm_map_entry_wait(map, interruptible);
8424 
8425 			if (interruptible &&
8426 			    wait_result == THREAD_INTERRUPTED) {
8427 				/*
8428 				 * We do not clear the needs_wakeup flag,
8429 				 * since we cannot tell if we were the only one.
8430 				 */
8431 				ret.kmr_return = KERN_ABORTED;
8432 				return ret;
8433 			}
8434 
8435 			/*
8436 			 * The entry could have been clipped or it
8437 			 * may not exist anymore.  Look it up again.
8438 			 */
8439 			state |= VMDS_NEEDS_LOOKUP;
8440 			continue;
8441 		}
8442 
8443 
8444 		/*
8445 		 * Step 5: Handle wiring
8446 		 */
8447 
8448 		if (entry->wired_count) {
8449 			struct vm_map_entry tmp_entry;
8450 			boolean_t           user_wire;
8451 			unsigned int        last_timestamp;
8452 
8453 			user_wire = entry->user_wired_count > 0;
8454 
8455 			/*
8456 			 *      Remove a kernel wiring if requested
8457 			 */
8458 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8459 				entry->wired_count--;
8460 				vme_btref_consider_and_put(entry);
8461 			}
8462 
8463 			/*
8464 			 *	Remove all user wirings for proper accounting
8465 			 */
8466 			while (entry->user_wired_count) {
8467 				subtract_wire_counts(map, entry, user_wire);
8468 			}
8469 
8470 			/*
8471 			 * All our DMA I/O operations in IOKit are currently
8472 			 * done by wiring through the map entries of the task
8473 			 * requesting the I/O.
8474 			 *
8475 			 * Because of this, we must always wait for kernel wirings
8476 			 * to go away on the entries before deleting them.
8477 			 *
8478 			 * Any caller who wants to actually remove a kernel wiring
8479 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8480 			 * properly remove one wiring instead of blasting through
8481 			 * them all.
8482 			 */
8483 			if (entry->wired_count != 0) {
8484 				assert(map != kernel_map);
8485 				/*
8486 				 * Cannot continue.  Typical case is when
8487 				 * a user thread has physical I/O pending
8488 				 * on this page.  Either wait for the
8489 				 * kernel wiring to go away or return an
8490 				 * error.
8491 				 */
8492 				wait_result_t wait_result;
8493 
8494 				entry->needs_wakeup = TRUE;
8495 				wait_result = vm_map_entry_wait(map,
8496 				    interruptible);
8497 
8498 				if (interruptible &&
8499 				    wait_result == THREAD_INTERRUPTED) {
8500 					/*
8501 					 * We do not clear the
8502 					 * needs_wakeup flag, since we
8503 					 * cannot tell if we were the
8504 					 * only one.
8505 					 */
8506 					ret.kmr_return = KERN_ABORTED;
8507 					return ret;
8508 				}
8509 
8510 
8511 				/*
8512 				 * The entry could have been clipped or
8513 				 * it may not exist anymore.  Look it
8514 				 * up again.
8515 				 */
8516 				state |= VMDS_NEEDS_LOOKUP;
8517 				continue;
8518 			}
8519 
8520 			/*
8521 			 * We can unlock the map now.
8522 			 *
8523 			 * The entry might be split once we unlock the map,
8524 			 * but we need the range as defined by this entry
8525 			 * to be stable. So we must make a local copy.
8526 			 *
8527 			 * The underlying objects do not change during clips,
8528 			 * and the in_transition state guarantees existence
8529 			 * of the entry.
8530 			 */
8531 			last_timestamp = map->timestamp;
8532 			entry->in_transition = TRUE;
8533 			tmp_entry = *entry;
8534 			vm_map_unlock(map);
8535 
8536 			if (tmp_entry.is_sub_map) {
8537 				vm_map_t sub_map;
8538 				vm_map_offset_t sub_start, sub_end;
8539 				pmap_t pmap;
8540 				vm_map_offset_t pmap_addr;
8541 
8542 
8543 				sub_map = VME_SUBMAP(&tmp_entry);
8544 				sub_start = VME_OFFSET(&tmp_entry);
8545 				sub_end = sub_start + (tmp_entry.vme_end -
8546 				    tmp_entry.vme_start);
8547 				if (tmp_entry.use_pmap) {
8548 					pmap = sub_map->pmap;
8549 					pmap_addr = tmp_entry.vme_start;
8550 				} else {
8551 					pmap = map->pmap;
8552 					pmap_addr = tmp_entry.vme_start;
8553 				}
8554 				(void) vm_map_unwire_nested(sub_map,
8555 				    sub_start, sub_end,
8556 				    user_wire,
8557 				    pmap, pmap_addr);
8558 			} else {
8559 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8560 				vm_map_offset_t max_end;
8561 
8562 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8563 					max_end = end - VM_MAP_PAGE_SIZE(map);
8564 					if (entry_end > max_end) {
8565 						entry_end = max_end;
8566 					}
8567 				}
8568 
8569 				if (tmp_entry.vme_kernel_object) {
8570 					pmap_protect_options(
8571 						map->pmap,
8572 						tmp_entry.vme_start,
8573 						entry_end,
8574 						VM_PROT_NONE,
8575 						PMAP_OPTIONS_REMOVE,
8576 						NULL);
8577 				}
8578 				vm_fault_unwire(map, &tmp_entry,
8579 				    tmp_entry.vme_kernel_object, map->pmap,
8580 				    tmp_entry.vme_start, entry_end);
8581 			}
8582 
8583 			vm_map_lock(map);
8584 
8585 			/*
8586 			 * Unwiring happened, we can now go back to deleting
8587 			 * them (after we clear the in_transition bit for the range).
8588 			 */
8589 			if (last_timestamp + 1 != map->timestamp) {
8590 				state |= VMDS_NEEDS_LOOKUP;
8591 			}
8592 			clear_in_transition_end = tmp_entry.vme_end;
8593 			continue;
8594 		}
8595 
8596 		assert(entry->wired_count == 0);
8597 		assert(entry->user_wired_count == 0);
8598 
8599 
8600 		/*
8601 		 * Step 6: Entry is unwired and ready for us to delete !
8602 		 */
8603 
8604 		if (!entry->vme_permanent) {
8605 			/*
8606 			 * Typical case: the entry really shouldn't be permanent
8607 			 */
8608 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8609 		    (entry->protection & VM_PROT_EXECUTE) &&
8610 		    developer_mode_state()) {
8611 			/*
8612 			 * Allow debuggers to undo executable mappings
8613 			 * when developer mode is on.
8614 			 */
8615 #if 0
8616 			printf("FBDP %d[%s] removing permanent executable entry "
8617 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8618 			    proc_selfpid(),
8619 			    (current_task()->bsd_info
8620 			    ? proc_name_address(current_task()->bsd_info)
8621 			    : "?"), entry,
8622 			    (uint64_t)entry->vme_start,
8623 			    (uint64_t)entry->vme_end,
8624 			    entry->protection,
8625 			    entry->max_protection);
8626 #endif
8627 			entry->vme_permanent = FALSE;
8628 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8629 #if 0
8630 			printf("FBDP %d[%s] removing permanent entry "
8631 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8632 			    proc_selfpid(),
8633 			    (current_task()->bsd_info
8634 			    ? proc_name_address(current_task()->bsd_info)
8635 			    : "?"), entry,
8636 			    (uint64_t)entry->vme_start,
8637 			    (uint64_t)entry->vme_end,
8638 			    entry->protection,
8639 			    entry->max_protection);
8640 #endif
8641 			entry->vme_permanent = FALSE;
8642 #if CODE_SIGNING_MONITOR
8643 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8644 			entry->vme_permanent = FALSE;
8645 
8646 			printf("%d[%s] %s(0x%llx,0x%llx): "
8647 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8648 			    "prot 0x%x/0x%x\n",
8649 			    proc_selfpid(),
8650 			    (get_bsdtask_info(current_task())
8651 			    ? proc_name_address(get_bsdtask_info(current_task()))
8652 			    : "?"),
8653 			    __FUNCTION__,
8654 			    (uint64_t)start,
8655 			    (uint64_t)end,
8656 			    (uint64_t)entry->vme_start,
8657 			    (uint64_t)entry->vme_end,
8658 			    entry->protection,
8659 			    entry->max_protection);
8660 #endif
8661 		} else {
8662 			DTRACE_VM6(vm_map_delete_permanent,
8663 			    vm_map_entry_t, entry,
8664 			    vm_map_offset_t, entry->vme_start,
8665 			    vm_map_offset_t, entry->vme_end,
8666 			    vm_prot_t, entry->protection,
8667 			    vm_prot_t, entry->max_protection,
8668 			    int, VME_ALIAS(entry));
8669 		}
8670 
8671 		if (entry->is_sub_map) {
8672 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8673 			    "map %p (%d) entry %p submap %p (%d)\n",
8674 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8675 			    VME_SUBMAP(entry),
8676 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8677 			if (entry->use_pmap) {
8678 #ifndef NO_NESTED_PMAP
8679 				int pmap_flags;
8680 
8681 				if (map->terminated) {
8682 					/*
8683 					 * This is the final cleanup of the
8684 					 * address space being terminated.
8685 					 * No new mappings are expected and
8686 					 * we don't really need to unnest the
8687 					 * shared region (and lose the "global"
8688 					 * pmap mappings, if applicable).
8689 					 *
8690 					 * Tell the pmap layer that we're
8691 					 * "clean" wrt nesting.
8692 					 */
8693 					pmap_flags = PMAP_UNNEST_CLEAN;
8694 				} else {
8695 					/*
8696 					 * We're unmapping part of the nested
8697 					 * shared region, so we can't keep the
8698 					 * nested pmap.
8699 					 */
8700 					pmap_flags = 0;
8701 				}
8702 				pmap_unnest_options(
8703 					map->pmap,
8704 					(addr64_t)entry->vme_start,
8705 					entry->vme_end - entry->vme_start,
8706 					pmap_flags);
8707 #endif  /* NO_NESTED_PMAP */
8708 				if (map->mapped_in_other_pmaps &&
8709 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8710 					/* clean up parent map/maps */
8711 					vm_map_submap_pmap_clean(
8712 						map, entry->vme_start,
8713 						entry->vme_end,
8714 						VME_SUBMAP(entry),
8715 						VME_OFFSET(entry));
8716 				}
8717 			} else {
8718 				vm_map_submap_pmap_clean(
8719 					map, entry->vme_start, entry->vme_end,
8720 					VME_SUBMAP(entry),
8721 					VME_OFFSET(entry));
8722 			}
8723 		} else if (entry->vme_kernel_object ||
8724 		    VME_OBJECT(entry) == compressor_object) {
8725 			/*
8726 			 * nothing to do
8727 			 */
8728 		} else if (map->mapped_in_other_pmaps &&
8729 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8730 			vm_object_pmap_protect_options(
8731 				VME_OBJECT(entry), VME_OFFSET(entry),
8732 				entry->vme_end - entry->vme_start,
8733 				PMAP_NULL,
8734 				PAGE_SIZE,
8735 				entry->vme_start,
8736 				VM_PROT_NONE,
8737 				PMAP_OPTIONS_REMOVE);
8738 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8739 		    (state & VMDS_KERNEL_PMAP)) {
8740 			/* Remove translations associated
8741 			 * with this range unless the entry
8742 			 * does not have an object.  For the
8743 			 * kernel map or a descendant, remove
8744 			 * them regardless, since the platform
8745 			 * could potentially create "backdoor"
8746 			 * mappings invisible to the VM.
8747 			 * It is expected that objectless,
8748 			 * non-kernel ranges do not have such
8749 			 * VM-invisible translations.
8750 			 */
8751 			vm_map_address_t remove_start = entry->vme_start;
8752 			vm_map_address_t remove_end = entry->vme_end;
8753 #if MACH_ASSERT
8754 			/*
8755 			 * Prevent panics in pmap_remove() from some vm test code
8756 			 * which uses virtual address ranges that pmap disallows.
8757 			 */
8758 			if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8759 				vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8760 			}
8761 #endif /* MACH_ASSERT */
8762 			pmap_remove(map->pmap, remove_start, remove_end);
8763 		}
8764 
8765 #if DEBUG
8766 		/*
8767 		 * All pmap mappings for this map entry must have been
8768 		 * cleared by now.
8769 		 */
8770 		assert(pmap_is_empty(map->pmap,
8771 		    entry->vme_start,
8772 		    entry->vme_end));
8773 #endif /* DEBUG */
8774 
8775 		if (entry->iokit_acct) {
8776 			/* alternate accounting */
8777 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8778 			    vm_map_t, map,
8779 			    vm_map_offset_t, entry->vme_start,
8780 			    vm_map_offset_t, entry->vme_end,
8781 			    int, VME_ALIAS(entry));
8782 			vm_map_iokit_unmapped_region(map,
8783 			    (entry->vme_end -
8784 			    entry->vme_start));
8785 			entry->iokit_acct = FALSE;
8786 			entry->use_pmap = FALSE;
8787 		}
8788 
8789 		/* move "s" forward */
8790 		s    = entry->vme_end;
8791 		next = entry->vme_next;
8792 		if (!entry->map_aligned) {
8793 			vm_map_offset_t rounded_s;
8794 
8795 			/*
8796 			 * Skip artificial gap due to mis-aligned entry
8797 			 * on devices with a page size smaller than the
8798 			 * map's page size (e.g. a 16k task on a 4k device).
8799 			 */
8800 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8801 			if (next == vm_map_to_entry(map)) {
8802 				s = rounded_s;
8803 			} else if (s < rounded_s) {
8804 				s = MIN(rounded_s, next->vme_start);
8805 			}
8806 		}
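		/*
		 * Editorial worked example (illustrative values): on a
		 * 16k map running over 4k hardware, an entry ending at
		 * s = 0x13000 gives rounded_s = 0x14000; if the next
		 * entry starts at 0x13800, s becomes
		 * MIN(0x14000, 0x13800) = 0x13800, so the sub-page gap
		 * is not reported as a deallocation gap.
		 */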
8807 		ret.kmr_size += s - entry->vme_start;
8808 
8809 		if (entry->vme_permanent) {
8810 			/*
8811 			 * A permanent entry can not be removed, so leave it
8812 			 * in place but remove all access permissions.
8813 			 */
8814 			if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8815 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8816 				    __FUNCTION__, __LINE__,
8817 				    proc_selfpid(),
8818 				    (get_bsdtask_info(current_task())
8819 				    ? proc_name_address(get_bsdtask_info(current_task()))
8820 				    : "?"),
8821 				    map,
8822 				    entry,
8823 				    (uint64_t)entry->vme_start,
8824 				    (uint64_t)entry->vme_end,
8825 				    entry->is_sub_map,
8826 				    entry->protection,
8827 				    entry->max_protection);
8828 			}
8829 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8830 			    vm_map_entry_t, entry,
8831 			    vm_map_offset_t, entry->vme_start,
8832 			    vm_map_offset_t, entry->vme_end,
8833 			    vm_prot_t, entry->protection,
8834 			    vm_prot_t, entry->max_protection,
8835 			    int, VME_ALIAS(entry));
8836 			entry->protection = VM_PROT_NONE;
8837 			entry->max_protection = VM_PROT_NONE;
8838 #ifdef __arm64e__
8839 			entry->used_for_tpro = FALSE;
8840 #endif
8841 		} else {
8842 			vm_map_entry_zap(map, entry, zap_list);
8843 		}
8844 
8845 		entry = next;
8846 		next  = VM_MAP_ENTRY_NULL;
8847 
8848 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8849 			unsigned int last_timestamp = map->timestamp++;
8850 
8851 			if (lck_rw_lock_yield_exclusive(&map->lock,
8852 			    LCK_RW_YIELD_ANY_WAITER)) {
8853 				if (last_timestamp != map->timestamp + 1) {
8854 					state |= VMDS_NEEDS_LOOKUP;
8855 				}
8856 			} else {
8857 				/* we didn't yield, undo our change */
8858 				map->timestamp--;
8859 			}
8860 		}
8861 	}
8862 
8863 	if (map->wait_for_space) {
8864 		thread_wakeup((event_t) map);
8865 	}
8866 
8867 	if (state & VMDS_NEEDS_WAKEUP) {
8868 		vm_map_entry_wakeup(map);
8869 	}
8870 
8871 out:
8872 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8873 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8874 	}
8875 
8876 	if (state & VMDS_KERNEL_KMEMPTR) {
8877 		kmem_free_space(start, end, range_id, &slot);
8878 	}
8879 
8880 	if (state & VMDS_FOUND_GAP) {
8881 		DTRACE_VM3(kern_vm_deallocate_gap,
8882 		    vm_map_offset_t, gap_start,
8883 		    vm_map_offset_t, save_start,
8884 		    vm_map_offset_t, save_end);
8885 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8886 			ret.kmr_return = KERN_INVALID_VALUE;
8887 		} else {
8888 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8889 		}
8890 	}
8891 
8892 	return ret;
8893 }
8894 
8895 kmem_return_t
8896 vm_map_remove_and_unlock(
8897 	vm_map_t        map,
8898 	vm_map_offset_t start,
8899 	vm_map_offset_t end,
8900 	vmr_flags_t     flags,
8901 	kmem_guard_t    guard)
8902 {
8903 	kmem_return_t ret;
8904 	VM_MAP_ZAP_DECLARE(zap);
8905 
8906 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8907 	vm_map_unlock(map);
8908 
8909 	vm_map_zap_dispose(&zap);
8910 
8911 	return ret;
8912 }
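
/*
 * Illustrative sketch (editorial, not part of the original source):
 * the caller is expected to hold the map lock, which this routine
 * consumes, e.g.
 *
 *	vm_map_lock(map);
 *	kmr = vm_map_remove_and_unlock(map, start, end,
 *	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
 *
 * vm_map_remove_guard() below is exactly this pattern.
 */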
8913 
8914 /*
8915  *	vm_map_remove_guard:
8916  *
8917  *	Remove the given address range from the target map.
8918  *	This is the exported form of vm_map_delete.
8919  */
8920 kmem_return_t
8921 vm_map_remove_guard(
8922 	vm_map_t        map,
8923 	vm_map_offset_t start,
8924 	vm_map_offset_t end,
8925 	vmr_flags_t     flags,
8926 	kmem_guard_t    guard)
8927 {
8928 	vm_map_lock(map);
8929 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8930 }
8931 
8932 /*
8933  *	vm_map_terminate:
8934  *
8935  *	Clean out a task's map.
8936  */
8937 kern_return_t
8938 vm_map_terminate(
8939 	vm_map_t        map)
8940 {
8941 	vm_map_lock(map);
8942 	map->terminated = TRUE;
8943 	vm_map_disable_hole_optimization(map);
8944 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8945 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8946 	return KERN_SUCCESS;
8947 }
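
/*
 * Editorial note: setting map->terminated before the final removal is
 * what allows vm_map_delete() above to tear down "permanent" entries
 * (Steps 2.1 and 6) and to pass PMAP_UNNEST_CLEAN when dismantling
 * nested submaps during address space teardown.
 */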
8948 
8949 /*
8950  *	Routine:	vm_map_copy_allocate
8951  *
8952  *	Description:
8953  *		Allocates and initializes a map copy object.
8954  */
8955 static vm_map_copy_t
8956 vm_map_copy_allocate(uint16_t type)
8957 {
8958 	vm_map_copy_t new_copy;
8959 
8960 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8961 	new_copy->type = type;
8962 	if (type == VM_MAP_COPY_ENTRY_LIST) {
8963 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8964 		vm_map_store_init(&new_copy->cpy_hdr);
8965 	}
8966 	return new_copy;
8967 }
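
/*
 * Illustrative sketch (editorial, not part of the original source):
 * an entry-list copy object allocated here is eventually disposed of
 * with vm_map_copy_discard() below, e.g.
 *
 *	vm_map_copy_t c = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
 *	...	(link entries, set c->offset and c->size)
 *	vm_map_copy_discard(c);
 */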
8968 
8969 /*
8970  *	Routine:	vm_map_copy_discard
8971  *
8972  *	Description:
8973  *		Dispose of a map copy object (returned by
8974  *		vm_map_copyin).
8975  */
8976 void
8977 vm_map_copy_discard(
8978 	vm_map_copy_t   copy)
8979 {
8980 	if (copy == VM_MAP_COPY_NULL) {
8981 		return;
8982 	}
8983 
8984 	/*
8985 	 * Assert that the vm_map_copy is coming from the right
8986 	 * zone and hasn't been forged
8987 	 */
8988 	vm_map_copy_require(copy);
8989 
8990 	switch (copy->type) {
8991 	case VM_MAP_COPY_ENTRY_LIST:
8992 		while (vm_map_copy_first_entry(copy) !=
8993 		    vm_map_copy_to_entry(copy)) {
8994 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
8995 
8996 			vm_map_copy_entry_unlink(copy, entry);
8997 			if (entry->is_sub_map) {
8998 				vm_map_deallocate(VME_SUBMAP(entry));
8999 			} else {
9000 				vm_object_deallocate(VME_OBJECT(entry));
9001 			}
9002 			vm_map_copy_entry_dispose(entry);
9003 		}
9004 		break;
9005 	case VM_MAP_COPY_KERNEL_BUFFER:
9006 
9007 		/*
9008 		 * The vm_map_copy_t and possibly the data buffer were
9009 		 * allocated by a single call to kalloc_data(), i.e. the
9010 		 * vm_map_copy_t was not allocated out of the zone.
9011 		 */
9012 		if (copy->size > msg_ool_size_small || copy->offset) {
9013 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9014 			    (long long)copy->size, (long long)copy->offset);
9015 		}
9016 		kfree_data(copy->cpy_kdata, copy->size);
9017 	}
9018 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9019 }
9020 
9021 #if XNU_PLATFORM_MacOSX
9022 
9023 __exported
9024 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9025 
9026 /*
9027  *	Routine:	vm_map_copy_copy
9028  *
9029  *	Description:
9030  *			Move the information in a map copy object to
9031  *			a new map copy object, leaving the old one
9032  *			empty.
9033  *
9034  *			This is used by kernel routines that need
9035  *			to look at out-of-line data (in copyin form)
9036  *			before deciding whether to return SUCCESS.
9037  *			If the routine returns FAILURE, the original
9038  *			copy object will be deallocated; therefore,
9039  *			these routines must make a copy of the copy
9040  *			object and leave the original empty so that
9041  *			deallocation will not fail.
9042  */
9043 vm_map_copy_t
9044 vm_map_copy_copy(
9045 	vm_map_copy_t   copy)
9046 {
9047 	vm_map_copy_t   new_copy;
9048 
9049 	if (copy == VM_MAP_COPY_NULL) {
9050 		return VM_MAP_COPY_NULL;
9051 	}
9052 
9053 	/*
9054 	 * Assert that the vm_map_copy is coming from the right
9055 	 * zone and hasn't been forged
9056 	 */
9057 	vm_map_copy_require(copy);
9058 
9059 	/*
9060 	 * Allocate a new copy object, and copy the information
9061 	 * from the old one into it.
9062 	 */
9063 
9064 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9065 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9066 #if __has_feature(ptrauth_calls)
9067 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9068 		new_copy->cpy_kdata = copy->cpy_kdata;
9069 	}
9070 #endif
9071 
9072 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9073 		/*
9074 		 * The links in the entry chain must be
9075 		 * changed to point to the new copy object.
9076 		 */
9077 		vm_map_copy_first_entry(copy)->vme_prev
9078 		        = vm_map_copy_to_entry(new_copy);
9079 		vm_map_copy_last_entry(copy)->vme_next
9080 		        = vm_map_copy_to_entry(new_copy);
9081 	}
9082 
9083 	/*
9084 	 * Change the old copy object into one that contains
9085 	 * nothing to be deallocated.
9086 	 */
9087 	bzero(copy, sizeof(struct vm_map_copy));
9088 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9089 
9090 	/*
9091 	 * Return the new object.
9092 	 */
9093 	return new_copy;
9094 }
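
/*
 * Illustrative sketch (editorial, not part of the original source):
 * a routine inspecting out-of-line data snapshots the copy object so
 * that the original, now left empty, can still be deallocated safely
 * by its caller if the routine fails, e.g.
 *
 *	vm_map_copy_t mine = vm_map_copy_copy(copy);
 *	kr = examine_data(mine);	(hypothetical consumer)
 *	if (kr != KERN_SUCCESS) {
 *		vm_map_copy_discard(mine);
 *		return kr;	(caller discards the empty "copy")
 *	}
 */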
9095 
9096 #endif /* XNU_PLATFORM_MacOSX */
9097 
9098 static boolean_t
9099 vm_map_entry_is_overwritable(
9100 	vm_map_t        dst_map __unused,
9101 	vm_map_entry_t  entry)
9102 {
9103 	if (!(entry->protection & VM_PROT_WRITE)) {
9104 		/* can't overwrite if not writable */
9105 		return FALSE;
9106 	}
9107 #if !__x86_64__
9108 	if (entry->used_for_jit &&
9109 	    vm_map_cs_enforcement(dst_map) &&
9110 	    !dst_map->cs_debugged) {
9111 		/*
9112 		 * Can't overwrite a JIT region while cs_enforced
9113 		 * and not cs_debugged.
9114 		 */
9115 		return FALSE;
9116 	}
9117 
9118 #if __arm64e__
9119 	/* Do not allow overwriting HW-assisted TPRO entries */
9120 	if (entry->used_for_tpro) {
9121 		return FALSE;
9122 	}
9123 #endif /* __arm64e__ */
9124 
9125 	if (entry->vme_permanent) {
9126 		if (entry->is_sub_map) {
9127 			/*
9128 			 * We can't tell if the submap contains "permanent"
9129 			 * entries within the range targeted by the caller.
9130 			 * The caller will have to check for that with
9131 			 * vm_map_overwrite_submap_recurse() for example.
9132 			 */
9133 		} else {
9134 			/*
9135 			 * Do not allow overwriting of a "permanent"
9136 			 * entry.
9137 			 */
9138 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9139 			    vm_map_entry_t, entry,
9140 			    vm_map_offset_t, entry->vme_start,
9141 			    vm_map_offset_t, entry->vme_end,
9142 			    vm_prot_t, entry->protection,
9143 			    vm_prot_t, entry->max_protection,
9144 			    int, VME_ALIAS(entry));
9145 			return FALSE;
9146 		}
9147 	}
9148 #endif /* !__x86_64__ */
9149 
9150 	if (entry->is_sub_map) {
9151 		/* remember not to assume every entry has a VM object... */
9152 	}
9153 
9154 	return TRUE;
9155 }
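
/*
 * Editorial note: both callers in this file,
 * vm_map_overwrite_submap_recurse() and vm_map_copy_overwrite_nested(),
 * pair this check with an explicit VM_PROT_WRITE test on the same
 * entry before overwriting it.
 */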
9156 
9157 static kern_return_t
9158 vm_map_overwrite_submap_recurse(
9159 	vm_map_t        dst_map,
9160 	vm_map_offset_t dst_addr,
9161 	vm_map_size_t   dst_size)
9162 {
9163 	vm_map_offset_t dst_end;
9164 	vm_map_entry_t  tmp_entry;
9165 	vm_map_entry_t  entry;
9166 	kern_return_t   result;
9167 	boolean_t       encountered_sub_map = FALSE;
9168 
9169 
9170 
9171 	/*
9172 	 *	Verify that the destination is all writeable
9173 	 *	initially.  We have to trunc the destination
9174 	 *	address and round the copy size or we'll end up
9175 	 *	splitting entries in strange ways.
9176 	 */
9177 
9178 	dst_end = vm_map_round_page(dst_addr + dst_size,
9179 	    VM_MAP_PAGE_MASK(dst_map));
9180 	vm_map_lock(dst_map);
9181 
9182 start_pass_1:
9183 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9184 		vm_map_unlock(dst_map);
9185 		return KERN_INVALID_ADDRESS;
9186 	}
9187 
9188 	vm_map_clip_start(dst_map,
9189 	    tmp_entry,
9190 	    vm_map_trunc_page(dst_addr,
9191 	    VM_MAP_PAGE_MASK(dst_map)));
9192 	if (tmp_entry->is_sub_map) {
9193 		/* clipping did unnest if needed */
9194 		assert(!tmp_entry->use_pmap);
9195 	}
9196 
9197 	for (entry = tmp_entry;;) {
9198 		vm_map_entry_t  next;
9199 
9200 		next = entry->vme_next;
9201 		while (entry->is_sub_map) {
9202 			vm_map_offset_t sub_start;
9203 			vm_map_offset_t sub_end;
9204 			vm_map_offset_t local_end;
9205 
9206 			if (entry->in_transition) {
9207 				/*
9208 				 * Say that we are waiting, and wait for entry.
9209 				 */
9210 				entry->needs_wakeup = TRUE;
9211 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9212 
9213 				goto start_pass_1;
9214 			}
9215 
9216 			encountered_sub_map = TRUE;
9217 			sub_start = VME_OFFSET(entry);
9218 
9219 			if (entry->vme_end < dst_end) {
9220 				sub_end = entry->vme_end;
9221 			} else {
9222 				sub_end = dst_end;
9223 			}
9224 			sub_end -= entry->vme_start;
9225 			sub_end += VME_OFFSET(entry);
9226 			local_end = entry->vme_end;
9227 			vm_map_unlock(dst_map);
9228 
9229 			result = vm_map_overwrite_submap_recurse(
9230 				VME_SUBMAP(entry),
9231 				sub_start,
9232 				sub_end - sub_start);
9233 
9234 			if (result != KERN_SUCCESS) {
9235 				return result;
9236 			}
9237 			if (dst_end <= entry->vme_end) {
9238 				return KERN_SUCCESS;
9239 			}
9240 			vm_map_lock(dst_map);
9241 			if (!vm_map_lookup_entry(dst_map, local_end,
9242 			    &tmp_entry)) {
9243 				vm_map_unlock(dst_map);
9244 				return KERN_INVALID_ADDRESS;
9245 			}
9246 			entry = tmp_entry;
9247 			next = entry->vme_next;
9248 		}
9249 		assert(!entry->is_sub_map);
9250 
9251 		if (!(entry->protection & VM_PROT_WRITE)) {
9252 			vm_map_unlock(dst_map);
9253 			return KERN_PROTECTION_FAILURE;
9254 		}
9255 
9256 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9257 			vm_map_unlock(dst_map);
9258 			return KERN_PROTECTION_FAILURE;
9259 		}
9260 
9261 		/*
9262 		 *	If the entry is in transition, we must wait
9263 		 *	for it to exit that state.  Anything could happen
9264 		 *	when we unlock the map, so start over.
9265 		 */
9266 		if (entry->in_transition) {
9267 			/*
9268 			 * Say that we are waiting, and wait for entry.
9269 			 */
9270 			entry->needs_wakeup = TRUE;
9271 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9272 
9273 			goto start_pass_1;
9274 		}
9275 
9276 /*
9277  *		our range is contained completely within this map entry
9278  */
9279 		if (dst_end <= entry->vme_end) {
9280 			vm_map_unlock(dst_map);
9281 			return KERN_SUCCESS;
9282 		}
9283 /*
9284  *		check that range specified is contiguous region
9285  */
9286 		if ((next == vm_map_to_entry(dst_map)) ||
9287 		    (next->vme_start != entry->vme_end)) {
9288 			vm_map_unlock(dst_map);
9289 			return KERN_INVALID_ADDRESS;
9290 		}
9291 
9292 		/*
9293 		 *	Check for permanent objects in the destination.
9294 		 */
9295 		assert(!entry->is_sub_map);
9296 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9297 		    ((!VME_OBJECT(entry)->internal) ||
9298 		    (VME_OBJECT(entry)->true_share))) {
9299 			if (encountered_sub_map) {
9300 				vm_map_unlock(dst_map);
9301 				return KERN_FAILURE;
9302 			}
9303 		}
9304 
9305 
9306 		entry = next;
9307 	}/* for */
9308 	vm_map_unlock(dst_map);
9309 	return KERN_SUCCESS;
9310 }
9311 
9312 /*
9313  *	Routine:	vm_map_copy_overwrite
9314  *
9315  *	Description:
9316  *		Copy the memory described by the map copy
9317  *		object (copy; returned by vm_map_copyin) onto
9318  *		the specified destination region (dst_map, dst_addr).
9319  *		The destination must be writeable.
9320  *
9321  *		Unlike vm_map_copyout, this routine actually
9322  *		writes over previously-mapped memory.  If the
9323  *		previous mapping was to a permanent (user-supplied)
9324  *		memory object, it is preserved.
9325  *
9326  *		The attributes (protection and inheritance) of the
9327  *		destination region are preserved.
9328  *
9329  *		If successful, consumes the copy object.
9330  *		Otherwise, the caller is responsible for it.
9331  *
9332  *	Implementation notes:
9333  *		To overwrite aligned temporary virtual memory, it is
9334  *		sufficient to remove the previous mapping and insert
9335  *		the new copy.  This replacement is done either on
9336  *		the whole region (if no permanent virtual memory
9337  *		objects are embedded in the destination region) or
9338  *		in individual map entries.
9339  *
9340  *		To overwrite permanent virtual memory, it is necessary
9341  *		to copy each page, as the external memory management
9342  *		interface currently does not provide any optimizations.
9343  *
9344  *		Unaligned memory also has to be copied.  It is possible
9345  *		to use 'vm_trickery' to copy the aligned data.  This is
9346  *		not done but not hard to implement.
9347  *
9348  *		Once a page of permanent memory has been overwritten,
9349  *		it is impossible to interrupt this function; otherwise,
9350  *		the call would be neither atomic nor location-independent.
9351  *		The kernel-state portion of a user thread must be
9352  *		interruptible.
9353  *
9354  *		It may be expensive to forward all requests that might
9355  *		overwrite permanent memory (vm_write, vm_copy) to
9356  *		uninterruptible kernel threads.  This routine may be
9357  *		called by interruptible threads; however, success is
9358  *		not guaranteed -- if the request cannot be performed
9359  *		atomically and interruptibly, an error indication is
9360  *		returned.
9361  *
9362  *		Callers of this function must call vm_map_copy_require on
9363  *		previously created vm_map_copy_t or pass a newly created
9364  *		one to ensure that it hasn't been forged.
9365  */
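
/*
 * Illustrative sketch (editorial, not part of the original source):
 * a typical caller pairs vm_map_copyin() with the exported
 * vm_map_copy_overwrite() further below, discarding the copy object
 * itself only on failure, e.g.
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_copy_overwrite(dst_map, dst_addr, copy,
 *		    len, FALSE);
 *		if (kr != KERN_SUCCESS) {
 *			vm_map_copy_discard(copy);
 *		}
 *	}
 */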
9366 static kern_return_t
9367 vm_map_copy_overwrite_nested(
9368 	vm_map_t                dst_map,
9369 	vm_map_address_t        dst_addr,
9370 	vm_map_copy_t           copy,
9371 	boolean_t               interruptible,
9372 	pmap_t                  pmap,
9373 	boolean_t               discard_on_success)
9374 {
9375 	vm_map_offset_t         dst_end;
9376 	vm_map_entry_t          tmp_entry;
9377 	vm_map_entry_t          entry;
9378 	kern_return_t           kr;
9379 	boolean_t               aligned = TRUE;
9380 	boolean_t               contains_permanent_objects = FALSE;
9381 	boolean_t               encountered_sub_map = FALSE;
9382 	vm_map_offset_t         base_addr;
9383 	vm_map_size_t           copy_size;
9384 	vm_map_size_t           total_size;
9385 	uint16_t                copy_page_shift;
9386 
9387 	/*
9388 	 *	Check for special kernel buffer allocated
9389 	 *	by new_ipc_kmsg_copyin.
9390 	 */
9391 
9392 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9393 		kr = vm_map_copyout_kernel_buffer(
9394 			dst_map, &dst_addr,
9395 			copy, copy->size, TRUE, discard_on_success);
9396 		return kr;
9397 	}
9398 
9399 	/*
9400 	 *      Only works for entry lists at the moment.  Will
9401 	 *	support page lists later.
9402 	 */
9403 
9404 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9405 
9406 	if (copy->size == 0) {
9407 		if (discard_on_success) {
9408 			vm_map_copy_discard(copy);
9409 		}
9410 		return KERN_SUCCESS;
9411 	}
9412 
9413 	copy_page_shift = copy->cpy_hdr.page_shift;
9414 
9415 	/*
9416 	 *	Verify that the destination is all writeable
9417 	 *	initially.  We have to trunc the destination
9418 	 *	address and round the copy size or we'll end up
9419 	 *	splitting entries in strange ways.
9420 	 */
9421 
9422 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9423 	    VM_MAP_PAGE_MASK(dst_map)) ||
9424 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9425 	    VM_MAP_PAGE_MASK(dst_map)) ||
9426 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9427 	    VM_MAP_PAGE_MASK(dst_map)) ||
9428 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9429 		aligned = FALSE;
9430 		dst_end = vm_map_round_page(dst_addr + copy->size,
9431 		    VM_MAP_PAGE_MASK(dst_map));
9432 	} else {
9433 		dst_end = dst_addr + copy->size;
9434 	}
9435 
9436 	vm_map_lock(dst_map);
9437 
9438 	/* LP64todo - remove this check when vm_map_commpage64()
9439 	 * no longer has to stuff in a map_entry for the commpage
9440 	 * above the map's max_offset.
9441 	 */
9442 	if (dst_addr >= dst_map->max_offset) {
9443 		vm_map_unlock(dst_map);
9444 		return KERN_INVALID_ADDRESS;
9445 	}
9446 
9447 start_pass_1:
9448 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9449 		vm_map_unlock(dst_map);
9450 		return KERN_INVALID_ADDRESS;
9451 	}
9452 	vm_map_clip_start(dst_map,
9453 	    tmp_entry,
9454 	    vm_map_trunc_page(dst_addr,
9455 	    VM_MAP_PAGE_MASK(dst_map)));
9456 	for (entry = tmp_entry;;) {
9457 		vm_map_entry_t  next = entry->vme_next;
9458 
9459 		while (entry->is_sub_map) {
9460 			vm_map_offset_t sub_start;
9461 			vm_map_offset_t sub_end;
9462 			vm_map_offset_t local_end;
9463 
9464 			if (entry->in_transition) {
9465 				/*
9466 				 * Say that we are waiting, and wait for entry.
9467 				 */
9468 				entry->needs_wakeup = TRUE;
9469 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9470 
9471 				goto start_pass_1;
9472 			}
9473 
9474 			local_end = entry->vme_end;
9475 			if (!(entry->needs_copy)) {
9476 				/* if needs_copy we are a COW submap */
9477 				/* in such a case we just replace so */
9478 				/* there is no need for the          */
9479 				/* following check.                  */
9480 				encountered_sub_map = TRUE;
9481 				sub_start = VME_OFFSET(entry);
9482 
9483 				if (entry->vme_end < dst_end) {
9484 					sub_end = entry->vme_end;
9485 				} else {
9486 					sub_end = dst_end;
9487 				}
9488 				sub_end -= entry->vme_start;
9489 				sub_end += VME_OFFSET(entry);
9490 				vm_map_unlock(dst_map);
9491 
9492 				kr = vm_map_overwrite_submap_recurse(
9493 					VME_SUBMAP(entry),
9494 					sub_start,
9495 					sub_end - sub_start);
9496 				if (kr != KERN_SUCCESS) {
9497 					return kr;
9498 				}
9499 				vm_map_lock(dst_map);
9500 			}
9501 
9502 			if (dst_end <= entry->vme_end) {
9503 				goto start_overwrite;
9504 			}
9505 			if (!vm_map_lookup_entry(dst_map, local_end,
9506 			    &entry)) {
9507 				vm_map_unlock(dst_map);
9508 				return KERN_INVALID_ADDRESS;
9509 			}
9510 			next = entry->vme_next;
9511 		}
9512 		assert(!entry->is_sub_map);
9513 
9514 		if (!(entry->protection & VM_PROT_WRITE)) {
9515 			vm_map_unlock(dst_map);
9516 			return KERN_PROTECTION_FAILURE;
9517 		}
9518 
9519 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9520 			vm_map_unlock(dst_map);
9521 			return KERN_PROTECTION_FAILURE;
9522 		}
9523 
9524 		/*
9525 		 *	If the entry is in transition, we must wait
9526 		 *	for it to exit that state.  Anything could happen
9527 		 *	when we unlock the map, so start over.
9528 		 */
9529 		if (entry->in_transition) {
9530 			/*
9531 			 * Say that we are waiting, and wait for entry.
9532 			 */
9533 			entry->needs_wakeup = TRUE;
9534 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9535 
9536 			goto start_pass_1;
9537 		}
9538 
9539 /*
9540  *		our range is contained completely within this map entry
9541  */
9542 		if (dst_end <= entry->vme_end) {
9543 			break;
9544 		}
9545 /*
9546  *		check that range specified is contiguous region
9547  */
9548 		if ((next == vm_map_to_entry(dst_map)) ||
9549 		    (next->vme_start != entry->vme_end)) {
9550 			vm_map_unlock(dst_map);
9551 			return KERN_INVALID_ADDRESS;
9552 		}
9553 
9554 
9555 		/*
9556 		 *	Check for permanent objects in the destination.
9557 		 */
9558 		assert(!entry->is_sub_map);
9559 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9560 		    ((!VME_OBJECT(entry)->internal) ||
9561 		    (VME_OBJECT(entry)->true_share))) {
9562 			contains_permanent_objects = TRUE;
9563 		}
9564 
9565 		entry = next;
9566 	}/* for */
9567 
9568 start_overwrite:
9569 	/*
9570 	 *	If there are permanent objects in the destination, then
9571 	 *	the copy cannot be interrupted.
9572 	 */
9573 
9574 	if (interruptible && contains_permanent_objects) {
9575 		vm_map_unlock(dst_map);
9576 		return KERN_FAILURE;   /* XXX */
9577 	}
9578 
9579 	/*
9580 	 *
9581 	 *	Make a second pass, overwriting the data
9582 	 *	At the beginning of each loop iteration,
9583 	 *	the next entry to be overwritten is "tmp_entry"
9584 	 *	(initially, the value returned from the lookup above),
9585 	 *	and the starting address expected in that entry
9586 	 *	is "start".
9587 	 */
9588 
9589 	total_size = copy->size;
9590 	if (encountered_sub_map) {
9591 		copy_size = 0;
9592 		/* re-calculate tmp_entry since we've had the map */
9593 		/* unlocked */
9594 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9595 			vm_map_unlock(dst_map);
9596 			return KERN_INVALID_ADDRESS;
9597 		}
9598 	} else {
9599 		copy_size = copy->size;
9600 	}
9601 
9602 	base_addr = dst_addr;
9603 	while (TRUE) {
9604 		/* deconstruct the copy object and do in parts */
9605 		/* only in sub_map, interruptible case */
9606 		vm_map_entry_t  copy_entry;
9607 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9608 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9609 		int             nentries;
9610 		int             remaining_entries = 0;
9611 		vm_map_offset_t new_offset = 0;
9612 
9613 		for (entry = tmp_entry; copy_size == 0;) {
9614 			vm_map_entry_t  next;
9615 
9616 			next = entry->vme_next;
9617 
9618 			/* tmp_entry and base address are moved along */
9619 			/* each time we encounter a sub-map.  Otherwise */
9620 			/* entry can outpace tmp_entry, and the copy_size */
9621 			/* may reflect the distance between them. */
9622 			/* If the current entry is found to be in transition, */
9623 			/* we will start over at the beginning or at the last */
9624 			/* encounter of a submap, as dictated by base_addr, */
9625 			/* and we will zero copy_size accordingly. */
9626 			if (entry->in_transition) {
9627 				/*
9628 				 * Say that we are waiting, and wait for entry.
9629 				 */
9630 				entry->needs_wakeup = TRUE;
9631 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9632 
9633 				if (!vm_map_lookup_entry(dst_map, base_addr,
9634 				    &tmp_entry)) {
9635 					vm_map_unlock(dst_map);
9636 					return KERN_INVALID_ADDRESS;
9637 				}
9638 				copy_size = 0;
9639 				entry = tmp_entry;
9640 				continue;
9641 			}
9642 			if (entry->is_sub_map) {
9643 				vm_map_offset_t sub_start;
9644 				vm_map_offset_t sub_end;
9645 				vm_map_offset_t local_end;
9646 
9647 				if (entry->needs_copy) {
9648 					/* if this is a COW submap */
9649 					/* just back the range with an */
9650 					/* anonymous entry */
9651 					assert(!entry->vme_permanent);
9652 					if (entry->vme_end < dst_end) {
9653 						sub_end = entry->vme_end;
9654 					} else {
9655 						sub_end = dst_end;
9656 					}
9657 					if (entry->vme_start < base_addr) {
9658 						sub_start = base_addr;
9659 					} else {
9660 						sub_start = entry->vme_start;
9661 					}
9662 					vm_map_clip_end(
9663 						dst_map, entry, sub_end);
9664 					vm_map_clip_start(
9665 						dst_map, entry, sub_start);
9666 					assert(!entry->use_pmap);
9667 					assert(!entry->iokit_acct);
9668 					entry->use_pmap = TRUE;
9669 					vm_map_deallocate(VME_SUBMAP(entry));
9670 					assert(!entry->vme_permanent);
9671 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9672 					VME_OFFSET_SET(entry, 0);
9673 					entry->is_shared = FALSE;
9674 					entry->needs_copy = FALSE;
9675 					entry->protection = VM_PROT_DEFAULT;
9676 					entry->max_protection = VM_PROT_ALL;
9677 					entry->wired_count = 0;
9678 					entry->user_wired_count = 0;
9679 					if (entry->inheritance
9680 					    == VM_INHERIT_SHARE) {
9681 						entry->inheritance = VM_INHERIT_COPY;
9682 					}
9683 					continue;
9684 				}
9685 				/* first take care of any non-sub_map */
9686 				/* entries to send */
9687 				if (base_addr < entry->vme_start) {
9688 					/* stuff to send */
9689 					copy_size =
9690 					    entry->vme_start - base_addr;
9691 					break;
9692 				}
9693 				sub_start = VME_OFFSET(entry);
9694 
9695 				if (entry->vme_end < dst_end) {
9696 					sub_end = entry->vme_end;
9697 				} else {
9698 					sub_end = dst_end;
9699 				}
9700 				sub_end -= entry->vme_start;
9701 				sub_end += VME_OFFSET(entry);
9702 				local_end = entry->vme_end;
9703 				vm_map_unlock(dst_map);
9704 				copy_size = sub_end - sub_start;
9705 
9706 				/* adjust the copy object */
9707 				if (total_size > copy_size) {
9708 					vm_map_size_t   local_size = 0;
9709 					vm_map_size_t   entry_size;
9710 
9711 					nentries = 1;
9712 					new_offset = copy->offset;
9713 					copy_entry = vm_map_copy_first_entry(copy);
9714 					while (copy_entry !=
9715 					    vm_map_copy_to_entry(copy)) {
9716 						entry_size = copy_entry->vme_end -
9717 						    copy_entry->vme_start;
9718 						if ((local_size < copy_size) &&
9719 						    ((local_size + entry_size)
9720 						    >= copy_size)) {
9721 							vm_map_copy_clip_end(copy,
9722 							    copy_entry,
9723 							    copy_entry->vme_start +
9724 							    (copy_size - local_size));
9725 							entry_size = copy_entry->vme_end -
9726 							    copy_entry->vme_start;
9727 							local_size += entry_size;
9728 							new_offset += entry_size;
9729 						}
9730 						if (local_size >= copy_size) {
9731 							next_copy = copy_entry->vme_next;
9732 							copy_entry->vme_next =
9733 							    vm_map_copy_to_entry(copy);
9734 							previous_prev =
9735 							    copy->cpy_hdr.links.prev;
9736 							copy->cpy_hdr.links.prev = copy_entry;
9737 							copy->size = copy_size;
9738 							remaining_entries =
9739 							    copy->cpy_hdr.nentries;
9740 							remaining_entries -= nentries;
9741 							copy->cpy_hdr.nentries = nentries;
9742 							break;
9743 						} else {
9744 							local_size += entry_size;
9745 							new_offset += entry_size;
9746 							nentries++;
9747 						}
9748 						copy_entry = copy_entry->vme_next;
9749 					}
9750 				}
9751 
9752 				if ((entry->use_pmap) && (pmap == NULL)) {
9753 					kr = vm_map_copy_overwrite_nested(
9754 						VME_SUBMAP(entry),
9755 						sub_start,
9756 						copy,
9757 						interruptible,
9758 						VME_SUBMAP(entry)->pmap,
9759 						TRUE);
9760 				} else if (pmap != NULL) {
9761 					kr = vm_map_copy_overwrite_nested(
9762 						VME_SUBMAP(entry),
9763 						sub_start,
9764 						copy,
9765 						interruptible, pmap,
9766 						TRUE);
9767 				} else {
9768 					kr = vm_map_copy_overwrite_nested(
9769 						VME_SUBMAP(entry),
9770 						sub_start,
9771 						copy,
9772 						interruptible,
9773 						dst_map->pmap,
9774 						TRUE);
9775 				}
9776 				if (kr != KERN_SUCCESS) {
9777 					if (next_copy != NULL) {
9778 						copy->cpy_hdr.nentries +=
9779 						    remaining_entries;
9780 						copy->cpy_hdr.links.prev->vme_next =
9781 						    next_copy;
9782 						copy->cpy_hdr.links.prev
9783 						        = previous_prev;
9784 						copy->size = total_size;
9785 					}
9786 					return kr;
9787 				}
9788 				if (dst_end <= local_end) {
9789 					return KERN_SUCCESS;
9790 				}
9791 				/* otherwise copy no longer exists, it was */
9792 				/* destroyed after successful copy_overwrite */
9793 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9794 				copy->offset = new_offset;
9795 				copy->cpy_hdr.page_shift = copy_page_shift;
9796 
9797 				total_size -= copy_size;
9798 				copy_size = 0;
9799 				/* put back remainder of copy in container */
9800 				if (next_copy != NULL) {
9801 					copy->cpy_hdr.nentries = remaining_entries;
9802 					copy->cpy_hdr.links.next = next_copy;
9803 					copy->cpy_hdr.links.prev = previous_prev;
9804 					copy->size = total_size;
9805 					next_copy->vme_prev =
9806 					    vm_map_copy_to_entry(copy);
9807 					next_copy = NULL;
9808 				}
9809 				base_addr = local_end;
9810 				vm_map_lock(dst_map);
9811 				if (!vm_map_lookup_entry(dst_map,
9812 				    local_end, &tmp_entry)) {
9813 					vm_map_unlock(dst_map);
9814 					return KERN_INVALID_ADDRESS;
9815 				}
9816 				entry = tmp_entry;
9817 				continue;
9818 			}
9819 			assert(!entry->is_sub_map);
9820 
9821 			if (dst_end <= entry->vme_end) {
9822 				copy_size = dst_end - base_addr;
9823 				break;
9824 			}
9825 
9826 			if ((next == vm_map_to_entry(dst_map)) ||
9827 			    (next->vme_start != entry->vme_end)) {
9828 				vm_map_unlock(dst_map);
9829 				return KERN_INVALID_ADDRESS;
9830 			}
9831 
9832 			entry = next;
9833 		}/* for */
9834 
9835 		next_copy = NULL;
9836 		nentries = 1;
9837 
9838 		/* adjust the copy object */
9839 		if (total_size > copy_size) {
9840 			vm_map_size_t   local_size = 0;
9841 			vm_map_size_t   entry_size;
9842 
9843 			new_offset = copy->offset;
9844 			copy_entry = vm_map_copy_first_entry(copy);
9845 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9846 				entry_size = copy_entry->vme_end -
9847 				    copy_entry->vme_start;
9848 				if ((local_size < copy_size) &&
9849 				    ((local_size + entry_size)
9850 				    >= copy_size)) {
9851 					vm_map_copy_clip_end(copy, copy_entry,
9852 					    copy_entry->vme_start +
9853 					    (copy_size - local_size));
9854 					entry_size = copy_entry->vme_end -
9855 					    copy_entry->vme_start;
9856 					local_size += entry_size;
9857 					new_offset += entry_size;
9858 				}
9859 				if (local_size >= copy_size) {
9860 					next_copy = copy_entry->vme_next;
9861 					copy_entry->vme_next =
9862 					    vm_map_copy_to_entry(copy);
9863 					previous_prev =
9864 					    copy->cpy_hdr.links.prev;
9865 					copy->cpy_hdr.links.prev = copy_entry;
9866 					copy->size = copy_size;
9867 					remaining_entries =
9868 					    copy->cpy_hdr.nentries;
9869 					remaining_entries -= nentries;
9870 					copy->cpy_hdr.nentries = nentries;
9871 					break;
9872 				} else {
9873 					local_size += entry_size;
9874 					new_offset += entry_size;
9875 					nentries++;
9876 				}
9877 				copy_entry = copy_entry->vme_next;
9878 			}
9879 		}
9880 
9881 		if (aligned) {
9882 			pmap_t  local_pmap;
9883 
9884 			if (pmap) {
9885 				local_pmap = pmap;
9886 			} else {
9887 				local_pmap = dst_map->pmap;
9888 			}
9889 
9890 			if ((kr =  vm_map_copy_overwrite_aligned(
9891 				    dst_map, tmp_entry, copy,
9892 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9893 				if (next_copy != NULL) {
9894 					copy->cpy_hdr.nentries +=
9895 					    remaining_entries;
9896 					copy->cpy_hdr.links.prev->vme_next =
9897 					    next_copy;
9898 					copy->cpy_hdr.links.prev =
9899 					    previous_prev;
9900 					copy->size += copy_size;
9901 				}
9902 				return kr;
9903 			}
9904 			vm_map_unlock(dst_map);
9905 		} else {
9906 			/*
9907 			 * Performance gain:
9908 			 *
9909 			 * if the copy and dst address are misaligned but the same
9910 			 * offset within the page we can copy_not_aligned the
9911 			 * misaligned parts and copy aligned the rest.  If they are
9912 			 * aligned but len is unaligned we simply need to copy
9913 			 * the end bit unaligned.  We'll need to split the misaligned
9914 			 * bits of the region in this case!
9915 			 */
9916 			/* ALWAYS UNLOCKS THE dst_map MAP */
9917 			kr = vm_map_copy_overwrite_unaligned(
9918 				dst_map,
9919 				tmp_entry,
9920 				copy,
9921 				base_addr,
9922 				discard_on_success);
9923 			if (kr != KERN_SUCCESS) {
9924 				if (next_copy != NULL) {
9925 					copy->cpy_hdr.nentries +=
9926 					    remaining_entries;
9927 					copy->cpy_hdr.links.prev->vme_next =
9928 					    next_copy;
9929 					copy->cpy_hdr.links.prev =
9930 					    previous_prev;
9931 					copy->size += copy_size;
9932 				}
9933 				return kr;
9934 			}
9935 		}
9936 		total_size -= copy_size;
9937 		if (total_size == 0) {
9938 			break;
9939 		}
9940 		base_addr += copy_size;
9941 		copy_size = 0;
9942 		copy->offset = new_offset;
9943 		if (next_copy != NULL) {
9944 			copy->cpy_hdr.nentries = remaining_entries;
9945 			copy->cpy_hdr.links.next = next_copy;
9946 			copy->cpy_hdr.links.prev = previous_prev;
9947 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9948 			copy->size = total_size;
9949 		}
9950 		vm_map_lock(dst_map);
9951 		while (TRUE) {
9952 			if (!vm_map_lookup_entry(dst_map,
9953 			    base_addr, &tmp_entry)) {
9954 				vm_map_unlock(dst_map);
9955 				return KERN_INVALID_ADDRESS;
9956 			}
9957 			if (tmp_entry->in_transition) {
9958 				entry->needs_wakeup = TRUE;
9959 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9960 			} else {
9961 				break;
9962 			}
9963 		}
9964 		vm_map_clip_start(dst_map,
9965 		    tmp_entry,
9966 		    vm_map_trunc_page(base_addr,
9967 		    VM_MAP_PAGE_MASK(dst_map)));
9968 
9969 		entry = tmp_entry;
9970 	} /* while */
9971 
9972 	/*
9973 	 *	Throw away the vm_map_copy object
9974 	 */
9975 	if (discard_on_success) {
9976 		vm_map_copy_discard(copy);
9977 	}
9978 
9979 	return KERN_SUCCESS;
9980 }/* vm_map_copy_overwrite */
9981 
9982 static __attribute__((always_inline, warn_unused_result))
9983 kern_return_t
9984 vm_map_copy_addr_size_sanitize(
9985 	vm_map_t                map,
9986 	vm_map_offset_ut        addr_u,
9987 	vm_map_size_ut          size_u,
9988 	vm_sanitize_caller_t    vm_sanitize_caller,
9989 	vm_map_offset_t        *addr,
9990 	vm_map_offset_t        *end,
9991 	vm_map_size_t          *size)
9992 {
9993 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
9994 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
9995 
9996 
9997 	return vm_sanitize_addr_size(addr_u, size_u,
9998 	           vm_sanitize_caller, map,
9999 	           flags,
10000 	           addr, end, size);
10001 }
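
/*
 * Editorial note: this wrapper validates the caller-supplied
 * address/size for vm_map_copy_overwrite() below while keeping the
 * unaligned values (VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES), since
 * unaligned overwrites are handled explicitly further down.
 */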
10002 
10003 kern_return_t
10004 vm_map_copy_overwrite(
10005 	vm_map_t                dst_map,
10006 	vm_map_offset_ut        dst_addr_u,
10007 	vm_map_copy_t           copy,
10008 	vm_map_size_ut          copy_size_u,
10009 	boolean_t               interruptible)
10010 {
10011 	vm_map_offset_t dst_addr, dst_end;
10012 	vm_map_size_t   copy_size;
10013 	vm_map_size_t   head_size, tail_size;
10014 	vm_map_copy_t   head_copy, tail_copy;
10015 	vm_map_offset_t head_addr, tail_addr;
10016 	vm_map_entry_t  entry;
10017 	kern_return_t   kr;
10018 	vm_map_offset_t effective_page_mask, effective_page_size;
10019 	uint16_t        copy_page_shift;
10020 
10021 	head_size = 0;
10022 	tail_size = 0;
10023 	head_copy = NULL;
10024 	tail_copy = NULL;
10025 	head_addr = 0;
10026 	tail_addr = 0;
10027 
10028 	/*
10029 	 *	Check for null copy object.
10030 	 */
10031 	if (copy == VM_MAP_COPY_NULL) {
10032 		return KERN_SUCCESS;
10033 	}
10034 
10035 	/*
10036 	 * Sanitize any input parameters that are addr/size/prot/inherit
10037 	 */
10038 	kr = vm_map_copy_addr_size_sanitize(
10039 		dst_map,
10040 		dst_addr_u,
10041 		copy_size_u,
10042 		VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
10043 		&dst_addr,
10044 		&dst_end,
10045 		&copy_size);
10046 	if (__improbable(kr != KERN_SUCCESS)) {
10047 		return vm_sanitize_get_kr(kr);
10048 	}
10049 
10050 	/*
10051 	 * Assert that the vm_map_copy is coming from the right
10052 	 * zone and hasn't been forged
10053 	 */
10054 	vm_map_copy_require(copy);
10055 
10056 	if (interruptible ||
10057 	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
10058 		/*
10059 		 * We can't split the "copy" map if we're interruptible
10060 		 * or if we don't have a "copy" map...
10061 		 */
10062 blunt_copy:
10063 		kr = vm_map_copy_overwrite_nested(dst_map,
10064 		    dst_addr,
10065 		    copy,
10066 		    interruptible,
10067 		    (pmap_t) NULL,
10068 		    TRUE);
10069 		if (kr) {
10070 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10071 		}
10072 		return kr;
10073 	}
10074 
10075 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10076 	if (copy_page_shift < PAGE_SHIFT ||
10077 	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10078 		goto blunt_copy;
10079 	}
10080 
10081 	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10082 		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10083 	} else {
10084 		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10085 		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10086 		    effective_page_mask);
10087 	}
10088 	effective_page_size = effective_page_mask + 1;
10089 
10090 	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10091 		/*
10092 		 * Too small to bother with optimizing...
10093 		 */
10094 		goto blunt_copy;
10095 	}
10096 
10097 	if ((dst_addr & effective_page_mask) !=
10098 	    (copy->offset & effective_page_mask)) {
10099 		/*
10100 		 * Incompatible mis-alignment of source and destination...
10101 		 */
10102 		goto blunt_copy;
10103 	}
10104 
10105 	/*
10106 	 * Proper alignment or identical mis-alignment at the beginning.
10107 	 * Let's try and do a small unaligned copy first (if needed)
10108 	 * and then an aligned copy for the rest.
10109 	 */
10110 	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10111 		head_addr = dst_addr;
10112 		head_size = (effective_page_size -
10113 		    (copy->offset & effective_page_mask));
10114 		head_size = MIN(head_size, copy_size);
10115 	}
10116 	if (!vm_map_page_aligned(copy->offset + copy_size,
10117 	    effective_page_mask)) {
10118 		/*
10119 		 * Mis-alignment at the end.
10120 		 * Do an aligned copy up to the last page and
10121 		 * then an unaligned copy for the remaining bytes.
10122 		 */
10123 		tail_size = ((copy->offset + copy_size) &
10124 		    effective_page_mask);
10125 		tail_size = MIN(tail_size, copy_size);
10126 		tail_addr = dst_addr + copy_size - tail_size;
10127 		assert(tail_addr >= head_addr + head_size);
10128 	}
10129 	assert(head_size + tail_size <= copy_size);
10130 
10131 	if (head_size + tail_size == copy_size) {
10132 		/*
10133 		 * It's all unaligned, no optimization possible...
10134 		 */
10135 		goto blunt_copy;
10136 	}
10137 
10138 	/*
10139 	 * Can't optimize if there are any submaps in the
10140 	 * destination due to the way we free the "copy" map
10141 	 * progressively in vm_map_copy_overwrite_nested()
10142 	 * in that case.
10143 	 */
10144 	vm_map_lock_read(dst_map);
10145 	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10146 		vm_map_unlock_read(dst_map);
10147 		goto blunt_copy;
10148 	}
10149 	for (;
10150 	    (entry != vm_map_to_entry(dst_map) &&
10151 	    entry->vme_start < dst_addr + copy_size);
10152 	    entry = entry->vme_next) {
10153 		if (entry->is_sub_map) {
10154 			vm_map_unlock_read(dst_map);
10155 			goto blunt_copy;
10156 		}
10157 	}
10158 	vm_map_unlock_read(dst_map);
10159 
10160 	if (head_size) {
10161 		/*
10162 		 * Unaligned copy of the first "head_size" bytes, to reach
10163 		 * a page boundary.
10164 		 */
10165 
10166 		/*
10167 		 * Extract "head_copy" out of "copy".
10168 		 */
10169 		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10170 		head_copy->cpy_hdr.entries_pageable =
10171 		    copy->cpy_hdr.entries_pageable;
10172 		head_copy->cpy_hdr.page_shift = copy_page_shift;
10173 
10174 		entry = vm_map_copy_first_entry(copy);
10175 		if (entry->vme_end < copy->offset + head_size) {
10176 			head_size = entry->vme_end - copy->offset;
10177 		}
10178 
10179 		head_copy->offset = copy->offset;
10180 		head_copy->size = head_size;
10181 		copy->offset += head_size;
10182 		copy->size -= head_size;
10183 		copy_size -= head_size;
10184 		assert(copy_size > 0);
10185 
10186 		vm_map_copy_clip_end(copy, entry, copy->offset);
10187 		vm_map_copy_entry_unlink(copy, entry);
10188 		vm_map_copy_entry_link(head_copy,
10189 		    vm_map_copy_to_entry(head_copy),
10190 		    entry);
10191 
10192 		/*
10193 		 * Do the unaligned copy.
10194 		 */
10195 		kr = vm_map_copy_overwrite_nested(dst_map,
10196 		    head_addr,
10197 		    head_copy,
10198 		    interruptible,
10199 		    (pmap_t) NULL,
10200 		    FALSE);
10201 		if (kr != KERN_SUCCESS) {
10202 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10203 			goto done;
10204 		}
10205 	}
10206 
10207 	if (tail_size) {
10208 		/*
10209 		 * Extract "tail_copy" out of "copy".
10210 		 */
10211 		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10212 		tail_copy->cpy_hdr.entries_pageable =
10213 		    copy->cpy_hdr.entries_pageable;
10214 		tail_copy->cpy_hdr.page_shift = copy_page_shift;
10215 
10216 		tail_copy->offset = copy->offset + copy_size - tail_size;
10217 		tail_copy->size = tail_size;
10218 
10219 		copy->size -= tail_size;
10220 		copy_size -= tail_size;
10221 		assert(copy_size > 0);
10222 
10223 		entry = vm_map_copy_last_entry(copy);
10224 		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10225 		entry = vm_map_copy_last_entry(copy);
10226 		vm_map_copy_entry_unlink(copy, entry);
10227 		vm_map_copy_entry_link(tail_copy,
10228 		    vm_map_copy_last_entry(tail_copy),
10229 		    entry);
10230 	}
10231 
10232 	/*
10233 	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10234 	 * we want to avoid TOCTOU issues w.r.t copy->size but
10235 	 * we don't need to change vm_map_copy_overwrite_nested()
10236 	 * and all other vm_map_copy_overwrite variants.
10237 	 *
10238 	 * So we assign the original copy_size that was passed into
10239 	 * this routine back to copy.
10240 	 *
10241 	 * This use of local 'copy_size' passed into this routine is
10242 	 * to try and protect against TOCTOU attacks where the kernel
10243 	 * has been exploited. We don't expect this to be an issue
10244 	 * during normal system operation.
10245 	 */
10246 	assertf(copy->size == copy_size,
10247 	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10248 	copy->size = copy_size;
10249 
10250 	/*
10251 	 * Copy most (or possibly all) of the data.
10252 	 */
10253 	kr = vm_map_copy_overwrite_nested(dst_map,
10254 	    dst_addr + head_size,
10255 	    copy,
10256 	    interruptible,
10257 	    (pmap_t) NULL,
10258 	    FALSE);
10259 	if (kr != KERN_SUCCESS) {
10260 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10261 		goto done;
10262 	}
10263 
10264 	if (tail_size) {
10265 		kr = vm_map_copy_overwrite_nested(dst_map,
10266 		    tail_addr,
10267 		    tail_copy,
10268 		    interruptible,
10269 		    (pmap_t) NULL,
10270 		    FALSE);
10271 		if (kr) {
10272 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10273 		}
10274 	}
10275 
10276 done:
10277 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10278 	if (kr == KERN_SUCCESS) {
10279 		/*
10280 		 * Discard all the copy maps.
10281 		 */
10282 		if (head_copy) {
10283 			vm_map_copy_discard(head_copy);
10284 			head_copy = NULL;
10285 		}
10286 		vm_map_copy_discard(copy);
10287 		if (tail_copy) {
10288 			vm_map_copy_discard(tail_copy);
10289 			tail_copy = NULL;
10290 		}
10291 	} else {
10292 		/*
10293 		 * Re-assemble the original copy map.
10294 		 */
10295 		if (head_copy) {
10296 			entry = vm_map_copy_first_entry(head_copy);
10297 			vm_map_copy_entry_unlink(head_copy, entry);
10298 			vm_map_copy_entry_link(copy,
10299 			    vm_map_copy_to_entry(copy),
10300 			    entry);
10301 			copy->offset -= head_size;
10302 			copy->size += head_size;
10303 			vm_map_copy_discard(head_copy);
10304 			head_copy = NULL;
10305 		}
10306 		if (tail_copy) {
10307 			entry = vm_map_copy_last_entry(tail_copy);
10308 			vm_map_copy_entry_unlink(tail_copy, entry);
10309 			vm_map_copy_entry_link(copy,
10310 			    vm_map_copy_last_entry(copy),
10311 			    entry);
10312 			copy->size += tail_size;
10313 			vm_map_copy_discard(tail_copy);
10314 			tail_copy = NULL;
10315 		}
10316 	}
10317 	return kr;
10318 }
10319 
10320 
10321 /*
10322  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10323  *
 10324  *	Description:
10325  *	Physically copy unaligned data
10326  *
10327  *	Implementation:
10328  *	Unaligned parts of pages have to be physically copied.  We use
10329  *	a modified form of vm_fault_copy (which understands none-aligned
 10330  *	a modified form of vm_fault_copy (which understands non-aligned
10331  *	much memory in one go as possibly, however vm_fault_copy copies
 10332  *	much memory in one go as possible; however, vm_fault_copy copies
 10333  *	within one memory object, so we have to find the smallest of "amount left",
10334  *	unaligned data we don't need to split regions, therefore the source
10335  *	(copy) object should be one map entry, the target range may be split
 10336  *	(copy) object should be one map entry; the target range may be split
 10337  *	over multiple map entries, however.  In any event we are pessimistic
10338  *
10339  *	Callers of this function must call vm_map_copy_require on
10340  *	previously created vm_map_copy_t or pass a newly created
10341  *	one to ensure that it hasn't been forged.
10342  *
10343  *	Assumptions:
10344  *	dst_map is locked on entry and is return locked on success,
10345  *	unlocked on error.
10346  */
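/*
 * Illustrative sketch (hypothetical local names, not part of the build):
 * each pass of the loop below copies the smallest of the three limits
 * described above before handing that amount to vm_fault_copy():
 *
 *	vm_map_size_t chunk;
 *
 *	chunk = MIN(dst_size, src_size);	(room left in the dst / src entry)
 *	chunk = MIN(chunk, amount_left);	(bytes still to be copied)
 */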
10347 
10348 static kern_return_t
10349 vm_map_copy_overwrite_unaligned(
10350 	vm_map_t        dst_map,
10351 	vm_map_entry_t  entry,
10352 	vm_map_copy_t   copy,
10353 	vm_map_offset_t start,
10354 	boolean_t       discard_on_success)
10355 {
10356 	vm_map_entry_t          copy_entry;
10357 	vm_map_entry_t          copy_entry_next;
10358 	vm_map_version_t        version;
10359 	vm_object_t             dst_object;
10360 	vm_object_offset_t      dst_offset;
10361 	vm_object_offset_t      src_offset;
10362 	vm_object_offset_t      entry_offset;
10363 	vm_map_offset_t         entry_end;
10364 	vm_map_size_t           src_size,
10365 	    dst_size,
10366 	    copy_size,
10367 	    amount_left;
10368 	kern_return_t           kr = KERN_SUCCESS;
10369 
10370 
10371 	copy_entry = vm_map_copy_first_entry(copy);
10372 
10373 	vm_map_lock_write_to_read(dst_map);
10374 
10375 	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10376 	amount_left = copy->size;
10377 /*
10378  *	unaligned so we never clipped this entry, we need the offset into
 10379  *	unaligned, so we never clipped this entry; we need the offset into
 10380  *	the vm_object, not just the data.
10381 	while (amount_left > 0) {
10382 		if (entry == vm_map_to_entry(dst_map)) {
10383 			vm_map_unlock_read(dst_map);
10384 			return KERN_INVALID_ADDRESS;
10385 		}
10386 
10387 		/* "start" must be within the current map entry */
10388 		assert((start >= entry->vme_start) && (start < entry->vme_end));
10389 
10390 		/*
10391 		 *	Check protection again
10392 		 */
10393 		if (!(entry->protection & VM_PROT_WRITE)) {
10394 			vm_map_unlock_read(dst_map);
10395 			return KERN_PROTECTION_FAILURE;
10396 		}
10397 		if (entry->is_sub_map) {
10398 			/* not implemented... */
10399 			vm_map_unlock_read(dst_map);
10400 			return KERN_INVALID_ARGUMENT;
10401 		}
10402 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10403 			vm_map_unlock_read(dst_map);
10404 			return KERN_PROTECTION_FAILURE;
10405 		}
10406 		/*
10407 		 *	If the entry is in transition, we must wait
10408 		 *	for it to exit that state.  Anything could happen
10409 		 *	when we unlock the map, so start over.
10410 		 */
10411 		if (entry->in_transition) {
10412 			/*
10413 			 * Say that we are waiting, and wait for entry.
10414 			 */
10415 			entry->needs_wakeup = TRUE;
10416 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10417 
10418 			goto RetryLookup;
10419 		}
10420 
10421 		dst_offset = start - entry->vme_start;
10422 
10423 		dst_size = entry->vme_end - start;
10424 
10425 		src_size = copy_entry->vme_end -
10426 		    (copy_entry->vme_start + src_offset);
10427 
10428 		if (dst_size < src_size) {
10429 /*
10430  *			we can only copy dst_size bytes before
10431  *			we have to get the next destination entry
10432  */
10433 			copy_size = dst_size;
10434 		} else {
10435 /*
10436  *			we can only copy src_size bytes before
10437  *			we have to get the next source copy entry
10438  */
10439 			copy_size = src_size;
10440 		}
10441 
10442 		if (copy_size > amount_left) {
10443 			copy_size = amount_left;
10444 		}
10445 /*
 10446  *		Entry needs copy: create a shadow object for the
 10447  *		copy-on-write region.
10448  */
10449 		assert(!entry->is_sub_map);
10450 		if (entry->needs_copy) {
10451 			if (vm_map_lock_read_to_write(dst_map)) {
10452 				vm_map_lock_read(dst_map);
10453 				goto RetryLookup;
10454 			}
10455 			VME_OBJECT_SHADOW(entry,
10456 			    (vm_map_size_t)(entry->vme_end
10457 			    - entry->vme_start),
10458 			    vm_map_always_shadow(dst_map));
10459 			entry->needs_copy = FALSE;
10460 			vm_map_lock_write_to_read(dst_map);
10461 		}
10462 		dst_object = VME_OBJECT(entry);
10463 /*
10464  *		unlike with the virtual (aligned) copy we're going
10465  *		to fault on it therefore we need a target object.
10466  */
10467 		if (dst_object == VM_OBJECT_NULL) {
10468 			if (vm_map_lock_read_to_write(dst_map)) {
10469 				vm_map_lock_read(dst_map);
10470 				goto RetryLookup;
10471 			}
10472 			dst_object = vm_object_allocate((vm_map_size_t)
10473 			    entry->vme_end - entry->vme_start);
10474 			VME_OBJECT_SET(entry, dst_object, false, 0);
10475 			VME_OFFSET_SET(entry, 0);
10476 			assert(entry->use_pmap);
10477 			vm_map_lock_write_to_read(dst_map);
10478 		}
10479 /*
10480  *		Take an object reference and unlock map. The "entry" may
10481  *		disappear or change when the map is unlocked.
10482  */
10483 		vm_object_reference(dst_object);
10484 		version.main_timestamp = dst_map->timestamp;
10485 		entry_offset = VME_OFFSET(entry);
10486 		entry_end = entry->vme_end;
10487 		vm_map_unlock_read(dst_map);
10488 /*
10489  *		Copy as much as possible in one pass
10490  */
10491 		kr = vm_fault_copy(
10492 			VME_OBJECT(copy_entry),
10493 			VME_OFFSET(copy_entry) + src_offset,
10494 			&copy_size,
10495 			dst_object,
10496 			entry_offset + dst_offset,
10497 			dst_map,
10498 			&version,
10499 			THREAD_UNINT );
10500 
10501 		start += copy_size;
10502 		src_offset += copy_size;
10503 		amount_left -= copy_size;
10504 /*
10505  *		Release the object reference
10506  */
10507 		vm_object_deallocate(dst_object);
10508 /*
10509  *		If a hard error occurred, return it now
10510  */
10511 		if (kr != KERN_SUCCESS) {
10512 			return kr;
10513 		}
10514 
10515 		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10516 		    || amount_left == 0) {
10517 /*
10518  *			all done with this copy entry, dispose.
10519  */
10520 			copy_entry_next = copy_entry->vme_next;
10521 
10522 			if (discard_on_success) {
10523 				vm_map_copy_entry_unlink(copy, copy_entry);
10524 				assert(!copy_entry->is_sub_map);
10525 				vm_object_deallocate(VME_OBJECT(copy_entry));
10526 				vm_map_copy_entry_dispose(copy_entry);
10527 			}
10528 
10529 			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10530 			    amount_left) {
10531 /*
10532  *				not finished copying but run out of source
10533  */
10534 				return KERN_INVALID_ADDRESS;
10535 			}
10536 
10537 			copy_entry = copy_entry_next;
10538 
10539 			src_offset = 0;
10540 		}
10541 
10542 		if (amount_left == 0) {
10543 			return KERN_SUCCESS;
10544 		}
10545 
10546 		vm_map_lock_read(dst_map);
10547 		if (version.main_timestamp == dst_map->timestamp) {
10548 			if (start == entry_end) {
10549 /*
10550  *				destination region is split.  Use the version
10551  *				information to avoid a lookup in the normal
10552  *				case.
10553  */
10554 				entry = entry->vme_next;
10555 /*
10556  *				should be contiguous. Fail if we encounter
10557  *				a hole in the destination.
10558  */
10559 				if (start != entry->vme_start) {
10560 					vm_map_unlock_read(dst_map);
10561 					return KERN_INVALID_ADDRESS;
10562 				}
10563 			}
10564 		} else {
10565 /*
10566  *			Map version check failed.
10567  *			we must lookup the entry because somebody
10568  *			might have changed the map behind our backs.
10569  */
10570 RetryLookup:
10571 			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10572 				vm_map_unlock_read(dst_map);
10573 				return KERN_INVALID_ADDRESS;
10574 			}
10575 		}
10576 	}/* while */
10577 
10578 	return KERN_SUCCESS;
10579 }/* vm_map_copy_overwrite_unaligned */
10580 
10581 /*
10582  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10583  *
10584  *	Description:
10585  *	Does all the vm_trickery possible for whole pages.
10586  *
10587  *	Implementation:
10588  *
10589  *	If there are no permanent objects in the destination,
10590  *	and the source and destination map entry zones match,
10591  *	and the destination map entry is not shared,
10592  *	then the map entries can be deleted and replaced
10593  *	with those from the copy.  The following code is the
10594  *	basic idea of what to do, but there are lots of annoying
10595  *	little details about getting protection and inheritance
10596  *	right.  Should add protection, inheritance, and sharing checks
10597  *	to the above pass and make sure that no wiring is involved.
10598  *
10599  *	Callers of this function must call vm_map_copy_require on
10600  *	previously created vm_map_copy_t or pass a newly created
10601  *	one to ensure that it hasn't been forged.
10602  */
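/*
 * Rough shape of the per-entry decision made below (illustrative only;
 * the JIT, TPRO, permanence and pmap-policy checks are elided here):
 *
 *	if (dst entry is unshared, anonymous and has no special semantics)
 *		swap the copy entry's VM object into the dst entry (virtual copy);
 *	else
 *		slow_copy: physically copy the pages with vm_fault_copy();
 */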
10603 
10604 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10605 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10606 int vm_map_copy_overwrite_aligned_src_large = 0;
10607 
10608 static kern_return_t
10609 vm_map_copy_overwrite_aligned(
10610 	vm_map_t        dst_map,
10611 	vm_map_entry_t  tmp_entry,
10612 	vm_map_copy_t   copy,
10613 	vm_map_offset_t start,
10614 	__unused pmap_t pmap)
10615 {
10616 	vm_object_t     object;
10617 	vm_map_entry_t  copy_entry;
10618 	vm_map_size_t   copy_size;
10619 	vm_map_size_t   size;
10620 	vm_map_entry_t  entry;
10621 
10622 	while ((copy_entry = vm_map_copy_first_entry(copy))
10623 	    != vm_map_copy_to_entry(copy)) {
10624 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10625 
10626 		entry = tmp_entry;
10627 
10628 		if (entry->is_sub_map) {
10629 			/* unnested when clipped earlier */
10630 			assert(!entry->use_pmap);
10631 		}
10632 		if (entry == vm_map_to_entry(dst_map)) {
10633 			vm_map_unlock(dst_map);
10634 			return KERN_INVALID_ADDRESS;
10635 		}
10636 		size = (entry->vme_end - entry->vme_start);
10637 		/*
10638 		 *	Make sure that no holes popped up in the
10639 		 *	address map, and that the protection is
10640 		 *	still valid, in case the map was unlocked
10641 		 *	earlier.
10642 		 */
10643 
10644 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10645 		    && !entry->needs_copy)) {
10646 			vm_map_unlock(dst_map);
10647 			return KERN_INVALID_ADDRESS;
10648 		}
10649 		assert(entry != vm_map_to_entry(dst_map));
10650 
10651 		/*
10652 		 *	Check protection again
10653 		 */
10654 
10655 		if (!(entry->protection & VM_PROT_WRITE)) {
10656 			vm_map_unlock(dst_map);
10657 			return KERN_PROTECTION_FAILURE;
10658 		}
10659 
10660 		if (entry->is_sub_map) {
10661 			/* not properly implemented */
10662 			vm_map_unlock(dst_map);
10663 			return KERN_PROTECTION_FAILURE;
10664 		}
10665 
10666 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10667 			vm_map_unlock(dst_map);
10668 			return KERN_PROTECTION_FAILURE;
10669 		}
10670 
10671 		/*
10672 		 *	If the entry is in transition, we must wait
10673 		 *	for it to exit that state.  Anything could happen
10674 		 *	when we unlock the map, so start over.
10675 		 */
10676 		if (entry->in_transition) {
10677 			/*
10678 			 * Say that we are waiting, and wait for entry.
10679 			 */
10680 			entry->needs_wakeup = TRUE;
10681 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10682 
10683 			goto RetryLookup;
10684 		}
10685 
10686 		/*
10687 		 *	Adjust to source size first
10688 		 */
10689 
10690 		if (copy_size < size) {
10691 			if (entry->map_aligned &&
10692 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10693 			    VM_MAP_PAGE_MASK(dst_map))) {
10694 				/* no longer map-aligned */
10695 				entry->map_aligned = FALSE;
10696 			}
10697 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10698 			size = copy_size;
10699 		}
10700 
10701 		/*
10702 		 *	Adjust to destination size
10703 		 */
10704 
10705 		if (size < copy_size) {
10706 			vm_map_copy_clip_end(copy, copy_entry,
10707 			    copy_entry->vme_start + size);
10708 			copy_size = size;
10709 		}
10710 
10711 		assert((entry->vme_end - entry->vme_start) == size);
10712 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10713 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10714 
10715 		/*
10716 		 *	If the destination contains temporary unshared memory,
10717 		 *	we can perform the copy by throwing it away and
10718 		 *	installing the source data.
10719 		 *
10720 		 *	Exceptions for mappings with special semantics:
10721 		 *	+ "permanent" entries,
10722 		 *	+ JIT regions,
10723 		 *	+ TPRO regions,
10724 		 *      + pmap-specific protection policies,
10725 		 *	+ VM objects with COPY_NONE copy strategy.
10726 		 */
10727 
10728 		object = VME_OBJECT(entry);
10729 		if ((!entry->is_shared &&
10730 		    !entry->vme_permanent &&
10731 		    !entry->used_for_jit &&
10732 #if __arm64e__
10733 		    !entry->used_for_tpro &&
10734 #endif /* __arm64e__ */
10735 		    !(entry->protection & VM_PROT_EXECUTE) &&
10736 		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10737 		    ((object == VM_OBJECT_NULL) ||
10738 		    (object->internal &&
10739 		    !object->true_share &&
10740 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10741 		    entry->needs_copy) {
10742 			vm_object_t     old_object = VME_OBJECT(entry);
10743 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10744 			vm_object_offset_t      offset;
10745 
10746 			assert(!entry->is_sub_map);
10747 			/*
10748 			 * Ensure that the source and destination aren't
10749 			 * identical
10750 			 */
10751 			if (old_object == VME_OBJECT(copy_entry) &&
10752 			    old_offset == VME_OFFSET(copy_entry)) {
10753 				vm_map_copy_entry_unlink(copy, copy_entry);
10754 				vm_map_copy_entry_dispose(copy_entry);
10755 
10756 				if (old_object != VM_OBJECT_NULL) {
10757 					vm_object_deallocate(old_object);
10758 				}
10759 
10760 				start = tmp_entry->vme_end;
10761 				tmp_entry = tmp_entry->vme_next;
10762 				continue;
10763 			}
10764 
10765 #if XNU_TARGET_OS_OSX
10766 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10767 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10768 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10769 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10770 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10771 				/*
10772 				 * Virtual vs. Physical copy tradeoff #1.
10773 				 *
10774 				 * Copying only a few pages out of a large
10775 				 * object:  do a physical copy instead of
10776 				 * a virtual copy, to avoid possibly keeping
10777 				 * the entire large object alive because of
10778 				 * those few copy-on-write pages.
10779 				 */
10780 				vm_map_copy_overwrite_aligned_src_large++;
10781 				goto slow_copy;
10782 			}
10783 #endif /* XNU_TARGET_OS_OSX */
10784 
10785 			if ((dst_map->pmap != kernel_pmap) &&
10786 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10787 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10788 				vm_object_t new_object, new_shadow;
10789 
10790 				/*
10791 				 * We're about to map something over a mapping
10792 				 * established by malloc()...
10793 				 */
10794 				new_object = VME_OBJECT(copy_entry);
10795 				if (new_object != VM_OBJECT_NULL) {
10796 					vm_object_lock_shared(new_object);
10797 				}
10798 				while (new_object != VM_OBJECT_NULL &&
10799 #if XNU_TARGET_OS_OSX
10800 				    !new_object->true_share &&
10801 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10802 #endif /* XNU_TARGET_OS_OSX */
10803 				    new_object->internal) {
10804 					new_shadow = new_object->shadow;
10805 					if (new_shadow == VM_OBJECT_NULL) {
10806 						break;
10807 					}
10808 					vm_object_lock_shared(new_shadow);
10809 					vm_object_unlock(new_object);
10810 					new_object = new_shadow;
10811 				}
10812 				if (new_object != VM_OBJECT_NULL) {
10813 					if (!new_object->internal) {
10814 						/*
10815 						 * The new mapping is backed
10816 						 * by an external object.  We
10817 						 * don't want malloc'ed memory
10818 						 * to be replaced with such a
10819 						 * non-anonymous mapping, so
10820 						 * let's go off the optimized
10821 						 * path...
10822 						 */
10823 						vm_map_copy_overwrite_aligned_src_not_internal++;
10824 						vm_object_unlock(new_object);
10825 						goto slow_copy;
10826 					}
10827 #if XNU_TARGET_OS_OSX
10828 					if (new_object->true_share ||
10829 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10830 						/*
10831 						 * Same if there's a "true_share"
10832 						 * object in the shadow chain, or
10833 						 * an object with a non-default
10834 						 * (SYMMETRIC) copy strategy.
10835 						 */
10836 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10837 						vm_object_unlock(new_object);
10838 						goto slow_copy;
10839 					}
10840 #endif /* XNU_TARGET_OS_OSX */
10841 					vm_object_unlock(new_object);
10842 				}
10843 				/*
10844 				 * The new mapping is still backed by
10845 				 * anonymous (internal) memory, so it's
10846 				 * OK to substitute it for the original
10847 				 * malloc() mapping.
10848 				 */
10849 			}
10850 
10851 			if (old_object != VM_OBJECT_NULL) {
10852 				assert(!entry->vme_permanent);
10853 				if (entry->is_sub_map) {
10854 					if (entry->use_pmap) {
10855 #ifndef NO_NESTED_PMAP
10856 						pmap_unnest(dst_map->pmap,
10857 						    (addr64_t)entry->vme_start,
10858 						    entry->vme_end - entry->vme_start);
10859 #endif  /* NO_NESTED_PMAP */
10860 						if (dst_map->mapped_in_other_pmaps) {
10861 							/* clean up parent */
10862 							/* map/maps */
10863 							vm_map_submap_pmap_clean(
10864 								dst_map, entry->vme_start,
10865 								entry->vme_end,
10866 								VME_SUBMAP(entry),
10867 								VME_OFFSET(entry));
10868 						}
10869 					} else {
10870 						vm_map_submap_pmap_clean(
10871 							dst_map, entry->vme_start,
10872 							entry->vme_end,
10873 							VME_SUBMAP(entry),
10874 							VME_OFFSET(entry));
10875 					}
10876 					vm_map_deallocate(VME_SUBMAP(entry));
10877 				} else {
10878 					if (dst_map->mapped_in_other_pmaps) {
10879 						vm_object_pmap_protect_options(
10880 							VME_OBJECT(entry),
10881 							VME_OFFSET(entry),
10882 							entry->vme_end
10883 							- entry->vme_start,
10884 							PMAP_NULL,
10885 							PAGE_SIZE,
10886 							entry->vme_start,
10887 							VM_PROT_NONE,
10888 							PMAP_OPTIONS_REMOVE);
10889 					} else {
10890 						pmap_remove_options(
10891 							dst_map->pmap,
10892 							(addr64_t)(entry->vme_start),
10893 							(addr64_t)(entry->vme_end),
10894 							PMAP_OPTIONS_REMOVE);
10895 					}
10896 					vm_object_deallocate(old_object);
10897 				}
10898 			}
10899 
10900 			if (entry->iokit_acct) {
10901 				/* keep using iokit accounting */
10902 				entry->use_pmap = FALSE;
10903 			} else {
10904 				/* use pmap accounting */
10905 				entry->use_pmap = TRUE;
10906 			}
10907 			assert(!entry->vme_permanent);
10908 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10909 			object = VME_OBJECT(entry);
10910 			entry->needs_copy = copy_entry->needs_copy;
10911 			entry->wired_count = 0;
10912 			entry->user_wired_count = 0;
10913 			offset = VME_OFFSET(copy_entry);
10914 			VME_OFFSET_SET(entry, offset);
10915 
10916 			vm_map_copy_entry_unlink(copy, copy_entry);
10917 			vm_map_copy_entry_dispose(copy_entry);
10918 
10919 			/*
 10920 			 * We could try to push pages into the pmap at this point, BUT
 10921 			 * this optimization only saved on average 2 us per page if ALL
 10922 			 * the pages in the source were currently mapped and ALL the
 10923 			 * pages in the dest were touched; if fewer than 2/3 of the pages
 10924 			 * were touched, this optimization actually cost more cycles.  It
 10925 			 * also puts a lot of pressure on the pmap layer w.r.t. mapping structures.
10926 			 */
10927 
10928 			/*
10929 			 *	Set up for the next iteration.  The map
10930 			 *	has not been unlocked, so the next
10931 			 *	address should be at the end of this
10932 			 *	entry, and the next map entry should be
10933 			 *	the one following it.
10934 			 */
10935 
10936 			start = tmp_entry->vme_end;
10937 			tmp_entry = tmp_entry->vme_next;
10938 		} else {
10939 			vm_map_version_t        version;
10940 			vm_object_t             dst_object;
10941 			vm_object_offset_t      dst_offset;
10942 			kern_return_t           r;
10943 
10944 slow_copy:
10945 			if (entry->needs_copy) {
10946 				VME_OBJECT_SHADOW(entry,
10947 				    (entry->vme_end -
10948 				    entry->vme_start),
10949 				    vm_map_always_shadow(dst_map));
10950 				entry->needs_copy = FALSE;
10951 			}
10952 
10953 			dst_object = VME_OBJECT(entry);
10954 			dst_offset = VME_OFFSET(entry);
10955 
10956 			/*
10957 			 *	Take an object reference, and record
10958 			 *	the map version information so that the
10959 			 *	map can be safely unlocked.
10960 			 */
10961 
10962 			if (dst_object == VM_OBJECT_NULL) {
10963 				/*
10964 				 * We would usually have just taken the
10965 				 * optimized path above if the destination
10966 				 * object has not been allocated yet.  But we
10967 				 * now disable that optimization if the copy
10968 				 * entry's object is not backed by anonymous
10969 				 * memory to avoid replacing malloc'ed
10970 				 * (i.e. re-usable) anonymous memory with a
10971 				 * not-so-anonymous mapping.
10972 				 * So we have to handle this case here and
10973 				 * allocate a new VM object for this map entry.
10974 				 */
10975 				dst_object = vm_object_allocate(
10976 					entry->vme_end - entry->vme_start);
10977 				dst_offset = 0;
10978 				VME_OBJECT_SET(entry, dst_object, false, 0);
10979 				VME_OFFSET_SET(entry, dst_offset);
10980 				assert(entry->use_pmap);
10981 			}
10982 
10983 			vm_object_reference(dst_object);
10984 
10985 			/* account for unlock bumping up timestamp */
10986 			version.main_timestamp = dst_map->timestamp + 1;
10987 
10988 			vm_map_unlock(dst_map);
10989 
10990 			/*
10991 			 *	Copy as much as possible in one pass
10992 			 */
10993 
10994 			copy_size = size;
10995 			r = vm_fault_copy(
10996 				VME_OBJECT(copy_entry),
10997 				VME_OFFSET(copy_entry),
10998 				&copy_size,
10999 				dst_object,
11000 				dst_offset,
11001 				dst_map,
11002 				&version,
11003 				THREAD_UNINT );
11004 
11005 			/*
11006 			 *	Release the object reference
11007 			 */
11008 
11009 			vm_object_deallocate(dst_object);
11010 
11011 			/*
11012 			 *	If a hard error occurred, return it now
11013 			 */
11014 
11015 			if (r != KERN_SUCCESS) {
11016 				return r;
11017 			}
11018 
11019 			if (copy_size != 0) {
11020 				/*
11021 				 *	Dispose of the copied region
11022 				 */
11023 
11024 				vm_map_copy_clip_end(copy, copy_entry,
11025 				    copy_entry->vme_start + copy_size);
11026 				vm_map_copy_entry_unlink(copy, copy_entry);
11027 				vm_object_deallocate(VME_OBJECT(copy_entry));
11028 				vm_map_copy_entry_dispose(copy_entry);
11029 			}
11030 
11031 			/*
11032 			 *	Pick up in the destination map where we left off.
11033 			 *
11034 			 *	Use the version information to avoid a lookup
11035 			 *	in the normal case.
11036 			 */
11037 
11038 			start += copy_size;
11039 			vm_map_lock(dst_map);
11040 			if (version.main_timestamp == dst_map->timestamp &&
11041 			    copy_size != 0) {
11042 				/* We can safely use saved tmp_entry value */
11043 
11044 				if (tmp_entry->map_aligned &&
11045 				    !VM_MAP_PAGE_ALIGNED(
11046 					    start,
11047 					    VM_MAP_PAGE_MASK(dst_map))) {
11048 					/* no longer map-aligned */
11049 					tmp_entry->map_aligned = FALSE;
11050 				}
11051 				vm_map_clip_end(dst_map, tmp_entry, start);
11052 				tmp_entry = tmp_entry->vme_next;
11053 			} else {
11054 				/* Must do lookup of tmp_entry */
11055 
11056 RetryLookup:
11057 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11058 					vm_map_unlock(dst_map);
11059 					return KERN_INVALID_ADDRESS;
11060 				}
11061 				if (tmp_entry->map_aligned &&
11062 				    !VM_MAP_PAGE_ALIGNED(
11063 					    start,
11064 					    VM_MAP_PAGE_MASK(dst_map))) {
11065 					/* no longer map-aligned */
11066 					tmp_entry->map_aligned = FALSE;
11067 				}
11068 				vm_map_clip_start(dst_map, tmp_entry, start);
11069 			}
11070 		}
11071 	}/* while */
11072 
11073 	return KERN_SUCCESS;
11074 }/* vm_map_copy_overwrite_aligned */
11075 
11076 /*
11077  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
11078  *
11079  *	Description:
11080  *		Copy in data to a kernel buffer from space in the
11081  *		source map. The original space may be optionally
11082  *		deallocated.
11083  *
11084  *		If successful, returns a new copy object.
11085  */
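/*
 * Hypothetical in-file caller sketch (error handling elided); requests
 * larger than msg_ool_size_small are rejected with KERN_INVALID_ARGUMENT:
 *
 *	vm_map_copy_t copy;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin_kernel_buffer(current_map(), uaddr, len,
 *	    FALSE, &copy);	(FALSE: leave the source mapping in place)
 *	if (kr == KERN_SUCCESS)
 *		... pass "copy" along, e.g. to vm_map_copyout() ...
 */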
11086 static kern_return_t
11087 vm_map_copyin_kernel_buffer(
11088 	vm_map_t        src_map,
11089 	vm_map_offset_t src_addr,
11090 	vm_map_size_t   len,
11091 	boolean_t       src_destroy,
11092 	vm_map_copy_t   *copy_result)
11093 {
11094 	kern_return_t kr;
11095 	vm_map_copy_t copy;
11096 	void *kdata;
11097 
11098 	if (len > msg_ool_size_small) {
11099 		return KERN_INVALID_ARGUMENT;
11100 	}
11101 
11102 	kdata = kalloc_data(len, Z_WAITOK);
11103 	if (kdata == NULL) {
11104 		return KERN_RESOURCE_SHORTAGE;
11105 	}
11106 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11107 	if (kr != KERN_SUCCESS) {
11108 		kfree_data(kdata, len);
11109 		return kr;
11110 	}
11111 
11112 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11113 	copy->cpy_kdata = kdata;
11114 	copy->size = len;
11115 	copy->offset = 0;
11116 
11117 	if (src_destroy) {
11118 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11119 
11120 		if (src_map == kernel_map) {
11121 			flags |= VM_MAP_REMOVE_KUNWIRE;
11122 		}
11123 
11124 		(void)vm_map_remove_guard(src_map,
11125 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11126 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11127 		    flags, KMEM_GUARD_NONE);
11128 	}
11129 
11130 	*copy_result = copy;
11131 	return KERN_SUCCESS;
11132 }
11133 
11134 /*
11135  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
11136  *
11137  *	Description:
11138  *		Copy out data from a kernel buffer into space in the
 11139  *		destination map. The space may optionally be dynamically
11140  *		allocated.
11141  *
11142  *		If successful, consumes the copy object.
11143  *		Otherwise, the caller is responsible for it.
11144  *
11145  *		Callers of this function must call vm_map_copy_require on
11146  *		previously created vm_map_copy_t or pass a newly created
11147  *		one to ensure that it hasn't been forged.
11148  */
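/*
 * Behavior sketch (illustrative, not authoritative): with overwrite == FALSE
 * the routine below first vm_map_enter()s fresh space and reports it through
 * *addr; with overwrite == TRUE it copies onto the caller-supplied *addr.
 * A hypothetical in-file caller allocating new space:
 *
 *	vm_map_address_t addr = 0;
 *
 *	kr = vm_map_copyout_kernel_buffer(dst_map, &addr, copy, copy->size,
 *	    FALSE, TRUE);	(allocate new space; consume the copy on success)
 */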
11149 static int vm_map_copyout_kernel_buffer_failures = 0;
11150 static kern_return_t
11151 vm_map_copyout_kernel_buffer(
11152 	vm_map_t                map,
11153 	vm_map_address_t        *addr,  /* IN/OUT */
11154 	vm_map_copy_t           copy,
11155 	vm_map_size_t           copy_size,
11156 	boolean_t               overwrite,
11157 	boolean_t               consume_on_success)
11158 {
11159 	kern_return_t kr = KERN_SUCCESS;
11160 	thread_t thread = current_thread();
11161 
11162 	assert(copy->size == copy_size);
11163 
11164 	/*
11165 	 * check for corrupted vm_map_copy structure
11166 	 */
11167 	if (copy_size > msg_ool_size_small || copy->offset) {
11168 		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11169 		    (long long)copy->size, (long long)copy->offset);
11170 	}
11171 
11172 	if (!overwrite) {
11173 		/*
11174 		 * Allocate space in the target map for the data
11175 		 */
11176 		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11177 
11178 		if (map == kernel_map) {
11179 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11180 		}
11181 
11182 		*addr = 0;
11183 		kr = vm_map_enter(map,
11184 		    addr,
11185 		    vm_map_round_page(copy_size,
11186 		    VM_MAP_PAGE_MASK(map)),
11187 		    (vm_map_offset_t) 0,
11188 		    vmk_flags,
11189 		    VM_OBJECT_NULL,
11190 		    (vm_object_offset_t) 0,
11191 		    FALSE,
11192 		    VM_PROT_DEFAULT,
11193 		    VM_PROT_ALL,
11194 		    VM_INHERIT_DEFAULT);
11195 		if (kr != KERN_SUCCESS) {
11196 			return kr;
11197 		}
11198 #if KASAN
11199 		if (map->pmap == kernel_pmap) {
11200 			kasan_notify_address(*addr, copy->size);
11201 		}
11202 #endif
11203 	}
11204 
11205 	/*
11206 	 * Copyout the data from the kernel buffer to the target map.
11207 	 */
11208 	if (thread->map == map) {
11209 		/*
11210 		 * If the target map is the current map, just do
11211 		 * the copy.
11212 		 */
11213 		assert((vm_size_t)copy_size == copy_size);
11214 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11215 			kr = KERN_INVALID_ADDRESS;
11216 		}
11217 	} else {
11218 		vm_map_t oldmap;
11219 
11220 		/*
11221 		 * If the target map is another map, assume the
11222 		 * target's address space identity for the duration
11223 		 * of the copy.
11224 		 */
11225 		vm_map_reference(map);
11226 		oldmap = vm_map_switch(map);
11227 
11228 		assert((vm_size_t)copy_size == copy_size);
11229 		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11230 			vm_map_copyout_kernel_buffer_failures++;
11231 			kr = KERN_INVALID_ADDRESS;
11232 		}
11233 
11234 		(void) vm_map_switch(oldmap);
11235 		vm_map_deallocate(map);
11236 	}
11237 
11238 	if (kr != KERN_SUCCESS) {
11239 		/* the copy failed, clean up */
11240 		if (!overwrite) {
11241 			/*
11242 			 * Deallocate the space we allocated in the target map.
11243 			 */
11244 			(void) vm_map_remove(map,
11245 			    vm_map_trunc_page(*addr,
11246 			    VM_MAP_PAGE_MASK(map)),
11247 			    vm_map_round_page((*addr +
11248 			    vm_map_round_page(copy_size,
11249 			    VM_MAP_PAGE_MASK(map))),
11250 			    VM_MAP_PAGE_MASK(map)));
11251 			*addr = 0;
11252 		}
11253 	} else {
 11254 		/* copy was successful, discard the copy structure */
11255 		if (consume_on_success) {
11256 			kfree_data(copy->cpy_kdata, copy_size);
11257 			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11258 		}
11259 	}
11260 
11261 	return kr;
11262 }
11263 
11264 /*
11265  *	Routine:	vm_map_copy_insert      [internal use only]
11266  *
11267  *	Description:
11268  *		Link a copy chain ("copy") into a map at the
11269  *		specified location (after "where").
11270  *
11271  *		Callers of this function must call vm_map_copy_require on
11272  *		previously created vm_map_copy_t or pass a newly created
11273  *		one to ensure that it hasn't been forged.
11274  *	Side effects:
11275  *		The copy chain is destroyed.
11276  */
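/*
 * Usage sketch: vm_map_copyout_internal() below calls this once destination
 * space has been reserved, e.g.
 *
 *	vm_map_copy_insert(dst_map, last, copy);
 *
 * after which "copy" must not be referenced again, since its header is freed.
 */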
11277 static void
11278 vm_map_copy_insert(
11279 	vm_map_t        map,
11280 	vm_map_entry_t  after_where,
11281 	vm_map_copy_t   copy)
11282 {
11283 	vm_map_entry_t  entry;
11284 
11285 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11286 		entry = vm_map_copy_first_entry(copy);
11287 		vm_map_copy_entry_unlink(copy, entry);
11288 		vm_map_store_entry_link(map, after_where, entry,
11289 		    VM_MAP_KERNEL_FLAGS_NONE);
11290 		after_where = entry;
11291 	}
11292 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11293 }
11294 
11295 /*
11296  * Callers of this function must call vm_map_copy_require on
11297  * previously created vm_map_copy_t or pass a newly created
11298  * one to ensure that it hasn't been forged.
11299  */
11300 void
11301 vm_map_copy_remap(
11302 	vm_map_t        map,
11303 	vm_map_entry_t  where,
11304 	vm_map_copy_t   copy,
11305 	vm_map_offset_t adjustment,
11306 	vm_prot_t       cur_prot,
11307 	vm_prot_t       max_prot,
11308 	vm_inherit_t    inheritance)
11309 {
11310 	vm_map_entry_t  copy_entry, new_entry;
11311 
11312 	for (copy_entry = vm_map_copy_first_entry(copy);
11313 	    copy_entry != vm_map_copy_to_entry(copy);
11314 	    copy_entry = copy_entry->vme_next) {
11315 		/* get a new VM map entry for the map */
11316 		new_entry = vm_map_entry_create(map);
11317 		/* copy the "copy entry" to the new entry */
11318 		vm_map_entry_copy(map, new_entry, copy_entry);
11319 		/* adjust "start" and "end" */
11320 		new_entry->vme_start += adjustment;
11321 		new_entry->vme_end += adjustment;
11322 		/* clear some attributes */
11323 		new_entry->inheritance = inheritance;
11324 		new_entry->protection = cur_prot;
11325 		new_entry->max_protection = max_prot;
11326 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11327 		/* take an extra reference on the entry's "object" */
11328 		if (new_entry->is_sub_map) {
11329 			assert(!new_entry->use_pmap); /* not nested */
11330 			vm_map_reference(VME_SUBMAP(new_entry));
11331 		} else {
11332 			vm_object_reference(VME_OBJECT(new_entry));
11333 		}
11334 		/* insert the new entry in the map */
11335 		vm_map_store_entry_link(map, where, new_entry,
11336 		    VM_MAP_KERNEL_FLAGS_NONE);
11337 		/* continue inserting the "copy entries" after the new entry */
11338 		where = new_entry;
11339 	}
11340 }
11341 
11342 
11343 /*
11344  * Returns true if *size matches (or is in the range of) copy->size.
11345  * Upon returning true, the *size field is updated with the actual size of the
11346  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11347  */
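/*
 * Hypothetical caller sketch: validate a caller-supplied size against the
 * copy object before mapping it, then use the possibly-updated value:
 *
 *	vm_map_size_t size = requested_size;
 *
 *	if (!vm_map_copy_validate_size(dst_map, copy, &size))
 *		return KERN_FAILURE;
 *	(for entry-list copies, "size" now holds the actual copy->size)
 */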
11348 boolean_t
11349 vm_map_copy_validate_size(
11350 	vm_map_t                dst_map,
11351 	vm_map_copy_t           copy,
11352 	vm_map_size_t           *size)
11353 {
11354 	if (copy == VM_MAP_COPY_NULL) {
11355 		return FALSE;
11356 	}
11357 
11358 	/*
11359 	 * Assert that the vm_map_copy is coming from the right
11360 	 * zone and hasn't been forged
11361 	 */
11362 	vm_map_copy_require(copy);
11363 
11364 	vm_map_size_t copy_sz = copy->size;
11365 	vm_map_size_t sz = *size;
11366 	switch (copy->type) {
11367 	case VM_MAP_COPY_KERNEL_BUFFER:
11368 		if (sz == copy_sz) {
11369 			return TRUE;
11370 		}
11371 		break;
11372 	case VM_MAP_COPY_ENTRY_LIST:
11373 		/*
11374 		 * potential page-size rounding prevents us from exactly
11375 		 * validating this flavor of vm_map_copy, but we can at least
11376 		 * assert that it's within a range.
11377 		 */
11378 		if (copy_sz >= sz &&
11379 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11380 			*size = copy_sz;
11381 			return TRUE;
11382 		}
11383 		break;
11384 	default:
11385 		break;
11386 	}
11387 	return FALSE;
11388 }
11389 
11390 static kern_return_t
11391 vm_map_copyout_internal(
11392 	vm_map_t                dst_map,
11393 	vm_map_address_t       *dst_addr,      /* OUT */
11394 	vm_map_copy_t           copy,
11395 	vm_map_size_ut          copy_size_u,
11396 	boolean_t               consume_on_success,
11397 	vm_prot_t               cur_protection,
11398 	vm_prot_t               max_protection,
11399 	vm_inherit_t            inheritance)
11400 {
11401 	vm_map_size_t           size, copy_size;
11402 	vm_map_size_t           adjustment;
11403 	vm_map_offset_t         start;
11404 	vm_object_offset_t      vm_copy_start;
11405 	vm_map_entry_t          last;
11406 	vm_map_entry_t          entry;
11407 	vm_map_copy_t           original_copy;
11408 	kern_return_t           kr;
11409 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11410 
11411 	/*
11412 	 *	Check for null copy object.
11413 	 */
11414 
11415 	if (copy == VM_MAP_COPY_NULL) {
11416 		*dst_addr = 0;
11417 		return KERN_SUCCESS;
11418 	}
11419 
11420 	/*
11421 	 * Assert that the vm_map_copy is coming from the right
11422 	 * zone and hasn't been forged
11423 	 */
11424 	vm_map_copy_require(copy);
11425 
11426 	if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11427 		*dst_addr = 0;
11428 		ktriage_record(thread_tid(current_thread()),
11429 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11430 		    KDBG_TRIAGE_RESERVED,
11431 		    KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11432 		    KERN_FAILURE /* arg */);
11433 		return KERN_FAILURE;
11434 	}
11435 	copy_size = copy->size;
11436 
11437 	/*
11438 	 *	Check for special kernel buffer allocated
11439 	 *	by new_ipc_kmsg_copyin.
11440 	 */
11441 
11442 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11443 		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11444 		    copy, copy_size, FALSE,
11445 		    consume_on_success);
11446 		if (kr) {
11447 			ktriage_record(thread_tid(current_thread()),
11448 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11449 			    KDBG_TRIAGE_RESERVED,
11450 			    KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11451 		}
11452 		return kr;
11453 	}
11454 
11455 	original_copy = copy;
11456 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11457 		vm_map_copy_t target_copy;
11458 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11459 
11460 		target_copy = VM_MAP_COPY_NULL;
11461 		DEBUG4K_ADJUST("adjusting...\n");
11462 		kr = vm_map_copy_adjust_to_target(
11463 			copy,
11464 			0, /* offset */
11465 			copy->size, /* size */
11466 			dst_map,
11467 			TRUE, /* copy */
11468 			&target_copy,
11469 			&overmap_start,
11470 			&overmap_end,
11471 			&trimmed_start);
11472 		if (kr != KERN_SUCCESS) {
11473 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11474 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11475 			return kr;
11476 		}
11477 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11478 		if (target_copy != copy) {
11479 			copy = target_copy;
11480 		}
11481 		copy_size = copy->size;
11482 	}
11483 
11484 	/*
11485 	 *	Find space for the data
11486 	 */
11487 
11488 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11489 	    VM_MAP_COPY_PAGE_MASK(copy));
11490 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11491 	    VM_MAP_COPY_PAGE_MASK(copy))
11492 	    - vm_copy_start;
11493 
11494 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11495 
11496 	vm_map_lock(dst_map);
11497 	kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11498 	    &start, &last);
11499 	if (kr != KERN_SUCCESS) {
11500 		vm_map_unlock(dst_map);
11501 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11502 		return kr;
11503 	}
11504 
11505 	adjustment = start - vm_copy_start;
11506 	if (!consume_on_success) {
11507 		/*
11508 		 * We're not allowed to consume "copy", so we'll have to
11509 		 * copy its map entries into the destination map below.
11510 		 * No need to re-allocate map entries from the correct
11511 		 * (pageable or not) zone, since we'll get new map entries
11512 		 * during the transfer.
11513 		 * We'll also adjust the map entries's "start" and "end"
11514 		 * during the transfer, to keep "copy"'s entries consistent
11515 		 * with its "offset".
11516 		 */
11517 		goto after_adjustments;
11518 	}
11519 
11520 	/*
11521 	 *	Since we're going to just drop the map
11522 	 *	entries from the copy into the destination
11523 	 *	map, they must come from the same pool.
11524 	 */
11525 
11526 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11527 		/*
11528 		 * Mismatches occur when dealing with the default
11529 		 * pager.
11530 		 */
11531 		vm_map_entry_t  next, new;
11532 
11533 		/*
11534 		 * Find the zone that the copies were allocated from
11535 		 */
11536 
11537 		entry = vm_map_copy_first_entry(copy);
11538 
11539 		/*
11540 		 * Reinitialize the copy so that vm_map_copy_entry_link
11541 		 * will work.
11542 		 */
11543 		vm_map_store_copy_reset(copy, entry);
11544 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11545 
11546 		/*
11547 		 * Copy each entry.
11548 		 */
11549 		while (entry != vm_map_copy_to_entry(copy)) {
11550 			new = vm_map_copy_entry_create(copy);
11551 			vm_map_entry_copy_full(new, entry);
11552 			new->vme_no_copy_on_read = FALSE;
11553 			assert(!new->iokit_acct);
11554 			if (new->is_sub_map) {
11555 				/* clr address space specifics */
11556 				new->use_pmap = FALSE;
11557 			}
11558 			vm_map_copy_entry_link(copy,
11559 			    vm_map_copy_last_entry(copy),
11560 			    new);
11561 			next = entry->vme_next;
11562 			vm_map_entry_dispose(entry);
11563 			entry = next;
11564 		}
11565 	}
11566 
11567 	/*
11568 	 *	Adjust the addresses in the copy chain, and
11569 	 *	reset the region attributes.
11570 	 */
11571 
11572 	for (entry = vm_map_copy_first_entry(copy);
11573 	    entry != vm_map_copy_to_entry(copy);
11574 	    entry = entry->vme_next) {
11575 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11576 			/*
11577 			 * We're injecting this copy entry into a map that
11578 			 * has the standard page alignment, so clear
11579 			 * "map_aligned" (which might have been inherited
11580 			 * from the original map entry).
11581 			 */
11582 			entry->map_aligned = FALSE;
11583 		}
11584 
11585 		entry->vme_start += adjustment;
11586 		entry->vme_end += adjustment;
11587 
11588 		if (entry->map_aligned) {
11589 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11590 			    VM_MAP_PAGE_MASK(dst_map)));
11591 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11592 			    VM_MAP_PAGE_MASK(dst_map)));
11593 		}
11594 
11595 		entry->inheritance = VM_INHERIT_DEFAULT;
11596 		entry->protection = VM_PROT_DEFAULT;
11597 		entry->max_protection = VM_PROT_ALL;
11598 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11599 
11600 		/*
11601 		 * If the entry is now wired,
11602 		 * map the pages into the destination map.
11603 		 */
11604 		if (entry->wired_count != 0) {
11605 			vm_map_offset_t va;
11606 			vm_object_offset_t       offset;
11607 			vm_object_t object;
11608 			vm_prot_t prot;
11609 			int     type_of_fault;
11610 			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11611 
11612 			/* TODO4K would need to use actual page size */
11613 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11614 
11615 			object = VME_OBJECT(entry);
11616 			offset = VME_OFFSET(entry);
11617 			va = entry->vme_start;
11618 
11619 			pmap_pageable(dst_map->pmap,
11620 			    entry->vme_start,
11621 			    entry->vme_end,
11622 			    TRUE);
11623 
11624 			while (va < entry->vme_end) {
11625 				vm_page_t       m;
11626 				struct vm_object_fault_info fault_info = {};
11627 
11628 				/*
11629 				 * Look up the page in the object.
11630 				 * Assert that the page will be found in the
11631 				 * top object:
11632 				 * either
11633 				 *	the object was newly created by
11634 				 *	vm_object_copy_slowly, and has
11635 				 *	copies of all of the pages from
11636 				 *	the source object
11637 				 * or
11638 				 *	the object was moved from the old
11639 				 *	map entry; because the old map
11640 				 *	entry was wired, all of the pages
11641 				 *	were in the top-level object.
11642 				 *	(XXX not true if we wire pages for
11643 				 *	 reading)
11644 				 */
11645 				vm_object_lock(object);
11646 
11647 				m = vm_page_lookup(object, offset);
11648 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11649 				    m->vmp_absent) {
11650 					panic("vm_map_copyout: wiring %p", m);
11651 				}
11652 
11653 				prot = entry->protection;
11654 
11655 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11656 				    prot) {
11657 					prot |= VM_PROT_EXECUTE;
11658 				}
11659 
11660 				type_of_fault = DBG_CACHE_HIT_FAULT;
11661 
11662 				fault_info.user_tag = VME_ALIAS(entry);
11663 				fault_info.pmap_options = 0;
11664 				if (entry->iokit_acct ||
11665 				    (!entry->is_sub_map && !entry->use_pmap)) {
11666 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11667 				}
11668 				if (entry->vme_xnu_user_debug &&
11669 				    !VM_PAGE_OBJECT(m)->code_signed) {
11670 					/*
11671 					 * Modified code-signed executable
11672 					 * region: this page does not belong
11673 					 * to a code-signed VM object, so it
11674 					 * must have been copied and should
11675 					 * therefore be typed XNU_USER_DEBUG
11676 					 * rather than XNU_USER_EXEC.
11677 					 */
11678 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11679 				}
11680 
11681 				vm_fault_enter(m,
11682 				    dst_map->pmap,
11683 				    va,
11684 				    PAGE_SIZE, 0,
11685 				    prot,
11686 				    prot,
11687 				    VM_PAGE_WIRED(m),
11688 				    FALSE,            /* change_wiring */
11689 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11690 				    &fault_info,
11691 				    NULL,             /* need_retry */
11692 				    &type_of_fault,
11693 				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11694 
11695 				vm_object_unlock(object);
11696 
11697 				offset += PAGE_SIZE_64;
11698 				va += PAGE_SIZE;
11699 			}
11700 		}
11701 	}
11702 
11703 after_adjustments:
11704 
11705 	/*
11706 	 *	Correct the page alignment for the result
11707 	 */
11708 
11709 	*dst_addr = start + (copy->offset - vm_copy_start);
11710 
11711 #if KASAN
11712 	kasan_notify_address(*dst_addr, size);
11713 #endif
11714 
11715 	/*
11716 	 *	Update the hints and the map size
11717 	 */
11718 
11719 	if (consume_on_success) {
11720 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11721 	} else {
11722 		SAVE_HINT_MAP_WRITE(dst_map, last);
11723 	}
11724 
11725 	dst_map->size += size;
11726 
11727 	/*
11728 	 *	Link in the copy
11729 	 */
11730 
11731 	if (consume_on_success) {
11732 		vm_map_copy_insert(dst_map, last, copy);
11733 		if (copy != original_copy) {
11734 			vm_map_copy_discard(original_copy);
11735 			original_copy = VM_MAP_COPY_NULL;
11736 		}
11737 	} else {
11738 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11739 		    cur_protection, max_protection,
11740 		    inheritance);
11741 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11742 			vm_map_copy_discard(copy);
11743 			copy = original_copy;
11744 		}
11745 	}
11746 
11747 
11748 	vm_map_unlock(dst_map);
11749 
11750 	/*
11751 	 * XXX	If wiring_required, call vm_map_pageable
11752 	 */
11753 
11754 	return KERN_SUCCESS;
11755 }
11756 
11757 /*
11758  *	Routine:	vm_map_copyout_size
11759  *
11760  *	Description:
11761  *		Copy out a copy chain ("copy") into newly-allocated
11762  *		space in the destination map. Uses a prevalidated
11763  *		size for the copy object (vm_map_copy_validate_size).
11764  *
11765  *		If successful, consumes the copy object.
11766  *		Otherwise, the caller is responsible for it.
11767  */
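/*
 * Hypothetical caller sketch, with "size" previously checked via
 * vm_map_copy_validate_size():
 *
 *	vm_map_address_t dst_addr = 0;
 *
 *	kr = vm_map_copyout_size(dst_map, &dst_addr, copy, size);
 *	(on success "copy" is consumed and dst_addr holds the new mapping)
 */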
11768 kern_return_t
11769 vm_map_copyout_size(
11770 	vm_map_t                dst_map,
11771 	vm_map_address_t       *dst_addr,      /* OUT */
11772 	vm_map_copy_t           copy,
11773 	vm_map_size_ut          copy_size)
11774 {
11775 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11776 	           TRUE,                     /* consume_on_success */
11777 	           VM_PROT_DEFAULT,
11778 	           VM_PROT_ALL,
11779 	           VM_INHERIT_DEFAULT);
11780 }
11781 
11782 /*
11783  *	Routine:	vm_map_copyout
11784  *
11785  *	Description:
11786  *		Copy out a copy chain ("copy") into newly-allocated
11787  *		space in the destination map.
11788  *
11789  *		If successful, consumes the copy object.
11790  *		Otherwise, the caller is responsible for it.
11791  */
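/*
 * Hypothetical caller sketch: map an incoming copy object at a
 * kernel-chosen address in the destination map:
 *
 *	vm_map_address_t dst_addr = 0;
 *
 *	kr = vm_map_copyout(dst_map, &dst_addr, copy);
 *	(on success "copy" is consumed; on failure the caller still owns it)
 */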
11792 kern_return_t
11793 vm_map_copyout(
11794 	vm_map_t                dst_map,
11795 	vm_map_address_t       *dst_addr,      /* OUT */
11796 	vm_map_copy_t           copy)
11797 {
11798 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11799 	           TRUE,                     /* consume_on_success */
11800 	           VM_PROT_DEFAULT,
11801 	           VM_PROT_ALL,
11802 	           VM_INHERIT_DEFAULT);
11803 }
11804 
11805 /*
11806  *	Routine:	vm_map_copyin
11807  *
11808  *	Description:
11809  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11810  *
11811  */
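/*
 * Hypothetical caller sketch pairing a copyin with a later copyout
 * (error handling elided); FALSE asks that the source range be preserved:
 *
 *	vm_map_copy_t copy;
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	...
 *	kr = vm_map_copyout(dst_map, &dst_addr, copy);
 */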
11812 kern_return_t
11813 vm_map_copyin(
11814 	vm_map_t                src_map,
11815 	vm_map_address_ut       src_addr,
11816 	vm_map_size_ut          len,
11817 	boolean_t               src_destroy,
11818 	vm_map_copy_t          *copy_result)   /* OUT */
11819 {
11820 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11821 	           FALSE, copy_result, FALSE);
11822 }
11823 
11824 /*
11825  *	Routine:	vm_map_copyin_common
11826  *
11827  *	Description:
11828  *		Copy the specified region (src_addr, len) from the
11829  *		source address space (src_map), possibly removing
11830  *		the region from the source address space (src_destroy).
11831  *
11832  *	Returns:
11833  *		A vm_map_copy_t object (copy_result), suitable for
11834  *		insertion into another address space (using vm_map_copyout),
11835  *		copying over another address space region (using
11836  *		vm_map_copy_overwrite).  If the copy is unused, it
11837  *		should be destroyed (using vm_map_copy_discard).
11838  *
11839  *	In/out conditions:
11840  *		The source map should not be locked on entry.
11841  */
11842 
11843 typedef struct submap_map {
11844 	vm_map_t        parent_map;
11845 	vm_map_offset_t base_start;
11846 	vm_map_offset_t base_end;
11847 	vm_map_size_t   base_len;
11848 	struct submap_map *next;
11849 } submap_map_t;
11850 
11851 kern_return_t
11852 vm_map_copyin_common(
11853 	vm_map_t                src_map,
11854 	vm_map_address_ut       src_addr,
11855 	vm_map_size_ut          len,
11856 	boolean_t               src_destroy,
11857 	__unused boolean_t      src_volatile,
11858 	vm_map_copy_t          *copy_result,   /* OUT */
11859 	boolean_t               use_maxprot)
11860 {
11861 	int flags;
11862 
11863 	flags = 0;
11864 	if (src_destroy) {
11865 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11866 	}
11867 	if (use_maxprot) {
11868 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11869 	}
11870 	return vm_map_copyin_internal(src_map,
11871 	           src_addr,
11872 	           len,
11873 	           flags,
11874 	           copy_result);
11875 }
11876 
11877 static __attribute__((always_inline, warn_unused_result))
11878 kern_return_t
11879 vm_map_copyin_sanitize(
11880 	vm_map_t                src_map,
11881 	vm_map_address_ut       src_addr_u,
11882 	vm_map_size_ut          len_u,
11883 	vm_map_offset_t        *src_start,
11884 	vm_map_offset_t        *src_end,
11885 	vm_map_size_t          *len,
11886 	vm_map_offset_t        *src_addr_unaligned)
11887 {
11888 	kern_return_t   kr;
11889 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
11890 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
11891 
11892 	if (src_map->pmap == kernel_pmap) {
11893 		flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
11894 	}
11895 
11896 
11897 	kr = vm_sanitize_addr_size(src_addr_u, len_u,
11898 	    VM_SANITIZE_CALLER_VM_MAP_COPYIN,
11899 	    src_map,
11900 	    flags,
11901 	    src_start, src_end, len);
11902 	if (__improbable(kr != KERN_SUCCESS)) {
11903 		return kr;
11904 	}
11905 
11906 	/*
11907 	 *	Compute (page aligned) start and end of region
11908 	 */
11909 	*src_addr_unaligned  = *src_start; /* remember unaligned value */
11910 	*src_start = vm_map_trunc_page(*src_addr_unaligned,
11911 	    VM_MAP_PAGE_MASK(src_map));
11912 	*src_end   = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
11913 	return KERN_SUCCESS;
11914 }
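
/*
 * Worked example for the alignment step above (hypothetical values):
 * on a 16K-page src_map (VM_MAP_PAGE_MASK(src_map) == 0x3fff), a
 * request of src_addr 0x1001000 and len 0x2000 sanitizes to:
 *
 *	src_addr_unaligned = 0x1001000
 *	src_start = vm_map_trunc_page(0x1001000, 0x3fff) = 0x1000000
 *	src_end   = vm_map_round_page(0x1003000, 0x3fff) = 0x1004000
 *
 * i.e. the copied range is widened to whole map pages while the
 * original, unaligned start is preserved for the copy header.
 */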
11915 
11916 kern_return_t
11917 vm_map_copyin_internal(
11918 	vm_map_t                src_map,
11919 	vm_map_address_ut       src_addr_u,
11920 	vm_map_size_ut          len_u,
11921 	int                     flags,
11922 	vm_map_copy_t          *copy_result)   /* OUT */
11923 {
11924 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11925 	                                 * in multi-level lookup, this
11926 	                                 * entry contains the actual
11927 	                                 * vm_object/offset.
11928 	                                 */
11929 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11930 
11931 	vm_map_offset_t src_start;      /* Start of current entry --
11932 	                                 * where copy is taking place now
11933 	                                 */
11934 	vm_map_offset_t src_end;        /* End of entire region to be
11935 	                                 * copied */
11936 	vm_map_offset_t src_addr_unaligned;
11937 	vm_map_offset_t src_base;
11938 	vm_map_size_t   len;
11939 	vm_map_t        base_map = src_map;
11940 	boolean_t       map_share = FALSE;
11941 	submap_map_t    *parent_maps = NULL;
11942 
11943 	vm_map_copy_t   copy;           /* Resulting copy */
11944 	vm_map_address_t copy_addr;
11945 	vm_map_size_t   copy_size;
11946 	boolean_t       src_destroy;
11947 	boolean_t       use_maxprot;
11948 	boolean_t       preserve_purgeable;
11949 	boolean_t       entry_was_shared;
11950 	vm_map_entry_t  saved_src_entry;
11951 	kern_return_t   kr;
11952 
11953 
11954 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11955 		return KERN_INVALID_ARGUMENT;
11956 	}
11957 
11958 	/*
11959 	 *	Check for copies of zero bytes.
11960 	 */
11961 	if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
11962 		*copy_result = VM_MAP_COPY_NULL;
11963 		return KERN_SUCCESS;
11964 	}
11965 
11966 	/*
11967 	 * Sanitize any input parameters that are addr/size/prot/inherit
11968 	 */
11969 	kr = vm_map_copyin_sanitize(
11970 		src_map,
11971 		src_addr_u,
11972 		len_u,
11973 		&src_start,
11974 		&src_end,
11975 		&len,
11976 		&src_addr_unaligned);
11977 	if (__improbable(kr != KERN_SUCCESS)) {
11978 		return vm_sanitize_get_kr(kr);
11979 	}
11980 
11981 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11982 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11983 	preserve_purgeable =
11984 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11985 
11986 	/*
11987 	 * If the copy is sufficiently small, use a kernel buffer instead
11988 	 * of making a virtual copy.  The theory being that the cost of
11989 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
11990 	 * for small regions.
11991 	 */
11992 	if ((len <= msg_ool_size_small) &&
11993 	    !use_maxprot &&
11994 	    !preserve_purgeable &&
11995 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11996 	    /*
11997 	     * Since the "msg_ool_size_small" threshold was increased and
11998 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11999 	     * address space limits, we revert to doing a virtual copy if the
12000 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
12001 	     * of the commpage would now fail when it used to work.
12002 	     */
12003 	    (src_start >= vm_map_min(src_map) &&
12004 	    src_start < vm_map_max(src_map) &&
12005 	    src_end >= vm_map_min(src_map) &&
12006 	    src_end < vm_map_max(src_map))) {
12007 		return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12008 		           src_destroy, copy_result);
12009 	}
12010 
12011 	/*
12012 	 *	Allocate a header element for the list.
12013 	 *
12014 	 *	Use the start and end in the header to
12015 	 *	remember the endpoints prior to rounding.
12016 	 */
12017 
12018 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12019 	copy->cpy_hdr.entries_pageable = TRUE;
12020 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12021 	copy->offset = src_addr_unaligned;
12022 	copy->size = len;
12023 
12024 	new_entry = vm_map_copy_entry_create(copy);
12025 
12026 #define RETURN(x)                                               \
12027 	MACRO_BEGIN                                             \
12028 	vm_map_unlock(src_map);                                 \
12029 	if(src_map != base_map)                                 \
12030 	        vm_map_deallocate(src_map);                     \
12031 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
12032 	        vm_map_copy_entry_dispose(new_entry);           \
12033 	vm_map_copy_discard(copy);                              \
12034 	{                                                       \
12035 	        submap_map_t	*_ptr;                          \
12036                                                                 \
12037 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12038 	                parent_maps=parent_maps->next;          \
12039 	                if (_ptr->parent_map != base_map)       \
12040 	                        vm_map_deallocate(_ptr->parent_map);    \
12041 	                kfree_type(submap_map_t, _ptr);         \
12042 	        }                                               \
12043 	}                                                       \
12044 	MACRO_RETURN(x);                                        \
12045 	MACRO_END
12046 
12047 	/*
12048 	 *	Find the beginning of the region.
12049 	 */
12050 
12051 	vm_map_lock(src_map);
12052 
12053 	/*
12054 	 * Look up the original "src_addr_unaligned" rather than the truncated
12055 	 * "src_start", in case "src_start" falls in a non-map-aligned
12056 	 * map entry *before* the map entry that contains "src_addr_unaligned"...
12057 	 */
12058 	if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12059 		RETURN(KERN_INVALID_ADDRESS);
12060 	}
12061 	if (!tmp_entry->is_sub_map) {
12062 		/*
12063 		 * ... but clip to the map-rounded "src_start" rather than
12064 		 * "src_addr_unaligned" to preserve map-alignment.  We'll adjust the
12065 		 * first copy entry at the end, if needed.
12066 		 */
12067 		vm_map_clip_start(src_map, tmp_entry, src_start);
12068 	}
12069 	if (src_start < tmp_entry->vme_start) {
12070 		/*
12071 		 * Move "src_start" up to the start of the
12072 		 * first map entry to copy.
12073 		 */
12074 		src_start = tmp_entry->vme_start;
12075 	}
12076 	/* set for later submap fix-up */
12077 	copy_addr = src_start;
12078 
12079 	/*
12080 	 *	Go through entries until we get to the end.
12081 	 */
12082 
12083 	while (TRUE) {
12084 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
12085 		vm_map_size_t   src_size;               /* Size of source
12086 		                                         * map entry (in both
12087 		                                         * maps)
12088 		                                         */
12089 
12090 		vm_object_t             src_object;     /* Object to copy */
12091 		vm_object_offset_t      src_offset;
12092 
12093 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
12094 
12095 		boolean_t       src_needs_copy;         /* Should source map
12096 		                                         * be made read-only
12097 		                                         * for copy-on-write?
12098 		                                         */
12099 
12100 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
12101 
12102 		boolean_t       was_wired;              /* Was source wired? */
12103 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
12104 		vm_map_version_t version;               /* Version before locks
12105 		                                         * dropped to make copy
12106 		                                         */
12107 		kern_return_t   result;                 /* Return value from
12108 		                                         * copy_strategically.
12109 		                                         */
12110 		while (tmp_entry->is_sub_map) {
12111 			vm_map_size_t submap_len;
12112 			submap_map_t *ptr;
12113 
12114 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
12115 			ptr->next = parent_maps;
12116 			parent_maps = ptr;
12117 			ptr->parent_map = src_map;
12118 			ptr->base_start = src_start;
12119 			ptr->base_end = src_end;
12120 			submap_len = tmp_entry->vme_end - src_start;
12121 			if (submap_len > (src_end - src_start)) {
12122 				submap_len = src_end - src_start;
12123 			}
12124 			ptr->base_len = submap_len;
12125 
12126 			src_start -= tmp_entry->vme_start;
12127 			src_start += VME_OFFSET(tmp_entry);
12128 			src_end = src_start + submap_len;
12129 			src_map = VME_SUBMAP(tmp_entry);
12130 			vm_map_lock(src_map);
12131 			/* keep an outstanding reference on all maps in */
12132 			/* the parent-map chain except the base map */
12133 			vm_map_reference(src_map);
12134 			vm_map_unlock(ptr->parent_map);
12135 			if (!vm_map_lookup_entry(
12136 				    src_map, src_start, &tmp_entry)) {
12137 				RETURN(KERN_INVALID_ADDRESS);
12138 			}
12139 			map_share = TRUE;
12140 			if (!tmp_entry->is_sub_map) {
12141 				vm_map_clip_start(src_map, tmp_entry, src_start);
12142 			}
12143 			src_entry = tmp_entry;
12144 		}
12145 		/* we are now in the lowest level submap... */
12146 
12147 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12148 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12149 			/* This is not supported for now.  In the future */
12150 			/* we will need to detect the phys_contiguous    */
12151 			/* condition and then upgrade copy_slowly to do  */
12152 			/* a physical copy from the device-memory-based  */
12153 			/* object.  We can piggy-back off of the         */
12154 			/* "was_wired" boolean to set up the proper      */
12155 			/* handling. */
12156 			RETURN(KERN_PROTECTION_FAILURE);
12157 		}
12158 		/*
12159 		 *	Create a new address map entry to hold the result.
12160 		 *	Fill in the fields from the appropriate source entries.
12161 		 *	We must unlock the source map to do this if we need
12162 		 *	to allocate a map entry.
12163 		 */
12164 		if (new_entry == VM_MAP_ENTRY_NULL) {
12165 			version.main_timestamp = src_map->timestamp;
12166 			vm_map_unlock(src_map);
12167 
12168 			new_entry = vm_map_copy_entry_create(copy);
12169 
12170 			vm_map_lock(src_map);
12171 			if ((version.main_timestamp + 1) != src_map->timestamp) {
12172 				if (!vm_map_lookup_entry(src_map, src_start,
12173 				    &tmp_entry)) {
12174 					RETURN(KERN_INVALID_ADDRESS);
12175 				}
12176 				if (!tmp_entry->is_sub_map) {
12177 					vm_map_clip_start(src_map, tmp_entry, src_start);
12178 				}
12179 				continue; /* restart w/ new tmp_entry */
12180 			}
12181 		}
12182 
12183 		/*
12184 		 *	Verify that the region can be read.
12185 		 */
12186 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12187 		    !use_maxprot) ||
12188 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
12189 			RETURN(KERN_PROTECTION_FAILURE);
12190 		}
12191 
12192 		/*
12193 		 *	Clip against the endpoints of the entire region.
12194 		 */
12195 
12196 		vm_map_clip_end(src_map, src_entry, src_end);
12197 
12198 		src_size = src_entry->vme_end - src_start;
12199 		src_object = VME_OBJECT(src_entry);
12200 		src_offset = VME_OFFSET(src_entry);
12201 		was_wired = (src_entry->wired_count != 0);
12202 
12203 		vm_map_entry_copy(src_map, new_entry, src_entry);
12204 		if (new_entry->is_sub_map) {
12205 			/* clr address space specifics */
12206 			/* clear address-space specifics */
12207 		} else {
12208 			/*
12209 			 * We're dealing with a copy-on-write operation,
12210 			 * so the resulting mapping should not inherit the
12211 			 * original mapping's accounting settings.
12212 			 * "iokit_acct" should have been cleared in
12213 			 * vm_map_entry_copy().
12214 			 * "use_pmap" should be reset to its default (TRUE)
12215 			 * so that the new mapping gets accounted for in
12216 			 * the task's memory footprint.
12217 			 */
12218 			assert(!new_entry->iokit_acct);
12219 			new_entry->use_pmap = TRUE;
12220 		}
12221 
12222 		/*
12223 		 *	Attempt non-blocking copy-on-write optimizations.
12224 		 */
12225 
12226 		/*
12227 		 * If we are destroying the source, and the object
12228 		 * is internal, we could move the object reference
12229 		 * from the source to the copy.  The copy is
12230 		 * copy-on-write only if the source is.
12231 		 * We make another reference to the object, because
12232 		 * destroying the source entry will deallocate it.
12233 		 *
12234 		 * This memory transfer has to be atomic, (to prevent
12235 		 * the VM object from being shared or copied while
12236 		 * it's being moved here), so we could only do this
12237 		 * if we won't have to unlock the VM map until the
12238 		 * original mapping has been fully removed.
12239 		 */
12240 
12241 RestartCopy:
12242 		if ((src_object == VM_OBJECT_NULL ||
12243 		    (!was_wired && !map_share && !tmp_entry->is_shared
12244 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12245 		    vm_object_copy_quickly(
12246 			    VME_OBJECT(new_entry),
12247 			    src_offset,
12248 			    src_size,
12249 			    &src_needs_copy,
12250 			    &new_entry_needs_copy)) {
12251 			new_entry->needs_copy = new_entry_needs_copy;
12252 
12253 			/*
12254 			 *	Handle copy-on-write obligations
12255 			 */
12256 
12257 			if (src_needs_copy && !tmp_entry->needs_copy) {
12258 				vm_prot_t prot;
12259 
12260 				prot = src_entry->protection & ~VM_PROT_WRITE;
12261 
12262 				if (override_nx(src_map, VME_ALIAS(src_entry))
12263 				    && prot) {
12264 					prot |= VM_PROT_EXECUTE;
12265 				}
12266 
12267 				vm_object_pmap_protect(
12268 					src_object,
12269 					src_offset,
12270 					src_size,
12271 					(src_entry->is_shared ?
12272 					PMAP_NULL
12273 					: src_map->pmap),
12274 					VM_MAP_PAGE_SIZE(src_map),
12275 					src_entry->vme_start,
12276 					prot);
12277 
12278 				assert(tmp_entry->wired_count == 0);
12279 				tmp_entry->needs_copy = TRUE;
12280 			}
12281 
12282 			/*
12283 			 *	The map has never been unlocked, so it's safe
12284 			 *	to move to the next entry rather than doing
12285 			 *	another lookup.
12286 			 */
12287 
12288 			goto CopySuccessful;
12289 		}
12290 
12291 		entry_was_shared = tmp_entry->is_shared;
12292 
12293 		/*
12294 		 *	Take an object reference, so that we may
12295 		 *	release the map lock(s).
12296 		 */
12297 
12298 		assert(src_object != VM_OBJECT_NULL);
12299 		vm_object_reference(src_object);
12300 
12301 		/*
12302 		 *	Record the timestamp for later verification.
12303 		 *	Unlock the map.
12304 		 */
12305 
12306 		version.main_timestamp = src_map->timestamp;
12307 		vm_map_unlock(src_map); /* Increments timestamp once! */
12308 		saved_src_entry = src_entry;
12309 		tmp_entry = VM_MAP_ENTRY_NULL;
12310 		src_entry = VM_MAP_ENTRY_NULL;
12311 
12312 		/*
12313 		 *	Perform the copy
12314 		 */
12315 
12316 		if (was_wired ||
12317 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12318 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12319 		    (debug4k_no_cow_copyin &&
12320 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12321 CopySlowly:
12322 			vm_object_lock(src_object);
12323 			result = vm_object_copy_slowly(
12324 				src_object,
12325 				src_offset,
12326 				src_size,
12327 				THREAD_UNINT,
12328 				&new_copy_object);
12329 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12330 			saved_used_for_jit = new_entry->used_for_jit;
12331 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12332 			new_entry->used_for_jit = saved_used_for_jit;
12333 			VME_OFFSET_SET(new_entry,
12334 			    src_offset - vm_object_trunc_page(src_offset));
12335 			new_entry->needs_copy = FALSE;
12336 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12337 		    (entry_was_shared || map_share)) {
12338 			vm_object_t new_object;
12339 
12340 			vm_object_lock_shared(src_object);
12341 			new_object = vm_object_copy_delayed(
12342 				src_object,
12343 				src_offset,
12344 				src_size,
12345 				TRUE);
12346 			if (new_object == VM_OBJECT_NULL) {
12347 				goto CopySlowly;
12348 			}
12349 
12350 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12351 			assert(new_entry->wired_count == 0);
12352 			new_entry->needs_copy = TRUE;
12353 			assert(!new_entry->iokit_acct);
12354 			assert(new_object->purgable == VM_PURGABLE_DENY);
12355 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12356 			result = KERN_SUCCESS;
12357 		} else {
12358 			vm_object_offset_t new_offset;
12359 			new_offset = VME_OFFSET(new_entry);
12360 			result = vm_object_copy_strategically(src_object,
12361 			    src_offset,
12362 			    src_size,
12363 			    (flags & VM_MAP_COPYIN_FORK),
12364 			    &new_copy_object,
12365 			    &new_offset,
12366 			    &new_entry_needs_copy);
12367 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12368 			saved_used_for_jit = new_entry->used_for_jit;
12369 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12370 			new_entry->used_for_jit = saved_used_for_jit;
12371 			if (new_offset != VME_OFFSET(new_entry)) {
12372 				VME_OFFSET_SET(new_entry, new_offset);
12373 			}
12374 
12375 			new_entry->needs_copy = new_entry_needs_copy;
12376 		}
12377 
12378 		if (result == KERN_SUCCESS &&
12379 		    ((preserve_purgeable &&
12380 		    src_object->purgable != VM_PURGABLE_DENY) ||
12381 		    new_entry->used_for_jit)) {
12382 			/*
12383 			 * Purgeable objects should be COPY_NONE, true share;
12384 			 * this should be propagated to the copy.
12385 			 *
12386 			 * Also force mappings the pmap specially protects to
12387 			 * be COPY_NONE; trying to COW these mappings would
12388 			 * change the effective protections, which could have
12389 			 * side effects if the pmap layer relies on the
12390 			 * specified protections.
12391 			 */
12392 
12393 			vm_object_t     new_object;
12394 
12395 			new_object = VME_OBJECT(new_entry);
12396 			assert(new_object != src_object);
12397 			vm_object_lock(new_object);
12398 			assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12399 			assert(new_object->shadow == VM_OBJECT_NULL);
12400 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12401 			assert(new_object->vo_owner == NULL);
12402 
12403 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12404 
12405 			if (preserve_purgeable &&
12406 			    src_object->purgable != VM_PURGABLE_DENY) {
12407 				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12408 
12409 				/* start as non-volatile with no owner... */
12410 				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12411 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12412 				/* ... and move to src_object's purgeable state */
12413 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12414 					int state;
12415 					state = src_object->purgable;
12416 					vm_object_purgable_control(
12417 						new_object,
12418 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12419 						&state);
12420 				}
12421 				/* no pmap accounting for purgeable objects */
12422 				new_entry->use_pmap = FALSE;
12423 			}
12424 
12425 			vm_object_unlock(new_object);
12426 			new_object = VM_OBJECT_NULL;
12427 		}
12428 
12429 		/*
12430 		 *	Throw away the extra reference
12431 		 */
12432 
12433 		vm_object_deallocate(src_object);
12434 
12435 		if (result != KERN_SUCCESS &&
12436 		    result != KERN_MEMORY_RESTART_COPY) {
12437 			vm_map_lock(src_map);
12438 			RETURN(result);
12439 		}
12440 
12441 		/*
12442 		 *	Verify that the map has not substantially
12443 		 *	changed while the copy was being made.
12444 		 */
12445 
12446 		vm_map_lock(src_map);
12447 
12448 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12449 			/* src_map hasn't changed: src_entry is still valid */
12450 			src_entry = saved_src_entry;
12451 			goto VerificationSuccessful;
12452 		}
12453 
12454 		/*
12455 		 *	Simple version comparison failed.
12456 		 *
12457 		 *	Retry the lookup and verify that the
12458 		 *	same object/offset are still present.
12459 		 *
12460 		 *	[Note: a memory manager that colludes with
12461 		 *	the calling task can detect that we have
12462 		 *	cheated.  While the map was unlocked, the
12463 		 *	mapping could have been changed and restored.]
12464 		 */
12465 
12466 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12467 			if (result != KERN_MEMORY_RESTART_COPY) {
12468 				vm_object_deallocate(VME_OBJECT(new_entry));
12469 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12470 				/* reset accounting state */
12471 				new_entry->iokit_acct = FALSE;
12472 				new_entry->use_pmap = TRUE;
12473 			}
12474 			RETURN(KERN_INVALID_ADDRESS);
12475 		}
12476 
12477 		src_entry = tmp_entry;
12478 		vm_map_clip_start(src_map, src_entry, src_start);
12479 
12480 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12481 		    !use_maxprot) ||
12482 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12483 			goto VerificationFailed;
12484 		}
12485 
12486 		if (src_entry->vme_end < new_entry->vme_end) {
12487 			/*
12488 			 * This entry might have been shortened
12489 			 * (vm_map_clip_end) or been replaced with
12490 			 * an entry that ends closer to "src_start"
12491 			 * than before.
12492 			 * Adjust "new_entry" accordingly; copying
12493 			 * less memory would be correct but we also
12494 			 * redo the copy (see below) if the new entry
12495 			 * no longer points at the same object/offset.
12496 			 */
12497 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12498 			    VM_MAP_COPY_PAGE_MASK(copy)));
12499 			new_entry->vme_end = src_entry->vme_end;
12500 			src_size = new_entry->vme_end - src_start;
12501 		} else if (src_entry->vme_end > new_entry->vme_end) {
12502 			/*
12503 			 * This entry might have been extended
12504 			 * (vm_map_entry_simplify() or coalesce)
12505 			 * or been replaced with an entry that ends farther
12506 			 * from "src_start" than before.
12507 			 *
12508 			 * We've called vm_object_copy_*() only on
12509 			 * the previous <start:end> range, so we can't
12510 			 * just extend new_entry.  We have to re-do
12511 			 * the copy based on the new entry as if it was
12512 			 * pointing at a different object/offset (see
12513 			 * "Verification failed" below).
12514 			 */
12515 		}
12516 
12517 		if ((VME_OBJECT(src_entry) != src_object) ||
12518 		    (VME_OFFSET(src_entry) != src_offset) ||
12519 		    (src_entry->vme_end > new_entry->vme_end)) {
12520 			/*
12521 			 *	Verification failed.
12522 			 *
12523 			 *	Start over with this top-level entry.
12524 			 */
12525 
12526 VerificationFailed:     ;
12527 
12528 			vm_object_deallocate(VME_OBJECT(new_entry));
12529 			tmp_entry = src_entry;
12530 			continue;
12531 		}
12532 
12533 		/*
12534 		 *	Verification succeeded.
12535 		 */
12536 
12537 VerificationSuccessful:;
12538 
12539 		if (result == KERN_MEMORY_RESTART_COPY) {
12540 			goto RestartCopy;
12541 		}
12542 
12543 		/*
12544 		 *	Copy succeeded.
12545 		 */
12546 
12547 CopySuccessful: ;
12548 
12549 		/*
12550 		 *	Link in the new copy entry.
12551 		 */
12552 
12553 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12554 		    new_entry);
12555 
12556 		/*
12557 		 *	Determine whether the entire region
12558 		 *	has been copied.
12559 		 */
12560 		src_base = src_start;
12561 		src_start = new_entry->vme_end;
12562 		new_entry = VM_MAP_ENTRY_NULL;
12563 		while ((src_start >= src_end) && (src_end != 0)) {
12564 			submap_map_t    *ptr;
12565 
12566 			if (src_map == base_map) {
12567 				/* back to the top */
12568 				break;
12569 			}
12570 
12571 			ptr = parent_maps;
12572 			assert(ptr != NULL);
12573 			parent_maps = parent_maps->next;
12574 
12575 			/* fix up the damage we did in that submap */
12576 			vm_map_simplify_range(src_map,
12577 			    src_base,
12578 			    src_end);
12579 
12580 			vm_map_unlock(src_map);
12581 			vm_map_deallocate(src_map);
12582 			vm_map_lock(ptr->parent_map);
12583 			src_map = ptr->parent_map;
12584 			src_base = ptr->base_start;
12585 			src_start = ptr->base_start + ptr->base_len;
12586 			src_end = ptr->base_end;
12587 			if (!vm_map_lookup_entry(src_map,
12588 			    src_start,
12589 			    &tmp_entry) &&
12590 			    (src_end > src_start)) {
12591 				RETURN(KERN_INVALID_ADDRESS);
12592 			}
12593 			kfree_type(submap_map_t, ptr);
12594 			if (parent_maps == NULL) {
12595 				map_share = FALSE;
12596 			}
12597 			src_entry = tmp_entry->vme_prev;
12598 		}
12599 
12600 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12601 		    (src_start >= src_addr_unaligned + len) &&
12602 		    (src_addr_unaligned + len != 0)) {
12603 			/*
12604 			 * Stop copying now, even though we haven't reached
12605 			 * "src_end".  We'll adjust the end of the last copy
12606 			 * entry at the end, if needed.
12607 			 *
12608 			 * If src_map's alignment is different from the
12609 			 * system's page-alignment, there could be
12610 			 * extra non-map-aligned map entries between
12611 			 * the original (non-rounded) "src_addr_unaligned + len"
12612 			 * and the rounded "src_end".
12613 			 * We do not want to copy those map entries since
12614 			 * they're not part of the copied range.
12615 			 */
12616 			break;
12617 		}
12618 
12619 		if ((src_start >= src_end) && (src_end != 0)) {
12620 			break;
12621 		}
12622 
12623 		/*
12624 		 *	Verify that there are no gaps in the region
12625 		 */
12626 
12627 		tmp_entry = src_entry->vme_next;
12628 		if ((tmp_entry->vme_start != src_start) ||
12629 		    (tmp_entry == vm_map_to_entry(src_map))) {
12630 			RETURN(KERN_INVALID_ADDRESS);
12631 		}
12632 	}
12633 
12634 	/*
12635 	 * If the source should be destroyed, do it now, since the
12636 	 * copy was successful.
12637 	 */
12638 	if (src_destroy) {
12639 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12640 
12641 		if (src_map == kernel_map) {
12642 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12643 		}
12644 		(void)vm_map_remove_and_unlock(src_map,
12645 		    vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12646 		    src_end,
12647 		    remove_flags,
12648 		    KMEM_GUARD_NONE);
12649 	} else {
12650 		/* fix up the damage we did in the base map */
12651 		vm_map_simplify_range(
12652 			src_map,
12653 			vm_map_trunc_page(src_addr_unaligned,
12654 			VM_MAP_PAGE_MASK(src_map)),
12655 			vm_map_round_page(src_end,
12656 			VM_MAP_PAGE_MASK(src_map)));
12657 		vm_map_unlock(src_map);
12658 	}
12659 
12660 	tmp_entry = VM_MAP_ENTRY_NULL;
12661 
12662 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12663 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12664 		vm_map_offset_t original_start, original_offset, original_end;
12665 
12666 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12667 
12668 		/* adjust alignment of first copy_entry's "vme_start" */
12669 		tmp_entry = vm_map_copy_first_entry(copy);
12670 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12671 			vm_map_offset_t adjustment;
12672 
12673 			original_start = tmp_entry->vme_start;
12674 			original_offset = VME_OFFSET(tmp_entry);
12675 
12676 			/* map-align the start of the first copy entry... */
12677 			adjustment = (tmp_entry->vme_start -
12678 			    vm_map_trunc_page(
12679 				    tmp_entry->vme_start,
12680 				    VM_MAP_PAGE_MASK(src_map)));
12681 			tmp_entry->vme_start -= adjustment;
12682 			VME_OFFSET_SET(tmp_entry,
12683 			    VME_OFFSET(tmp_entry) - adjustment);
12684 			copy_addr -= adjustment;
12685 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12686 			/* ... adjust for mis-aligned start of copy range */
12687 			adjustment =
12688 			    (vm_map_trunc_page(copy->offset,
12689 			    PAGE_MASK) -
12690 			    vm_map_trunc_page(copy->offset,
12691 			    VM_MAP_PAGE_MASK(src_map)));
12692 			if (adjustment) {
12693 				assert(page_aligned(adjustment));
12694 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12695 				tmp_entry->vme_start += adjustment;
12696 				VME_OFFSET_SET(tmp_entry,
12697 				    (VME_OFFSET(tmp_entry) +
12698 				    adjustment));
12699 				copy_addr += adjustment;
12700 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12701 			}
12702 
12703 			/*
12704 			 * Assert that the adjustments haven't exposed
12705 			 * more than was originally copied...
12706 			 */
12707 			assert(tmp_entry->vme_start >= original_start);
12708 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12709 			/*
12710 			 * ... and that it did not adjust outside of
12711 			 * a single 16K page.
12712 			 */
12713 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12714 			    VM_MAP_PAGE_MASK(src_map)) ==
12715 			    vm_map_trunc_page(original_start,
12716 			    VM_MAP_PAGE_MASK(src_map)));
12717 		}
12718 
12719 		/* adjust alignment of last copy_entry's "vme_end" */
12720 		tmp_entry = vm_map_copy_last_entry(copy);
12721 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12722 			vm_map_offset_t adjustment;
12723 
12724 			original_end = tmp_entry->vme_end;
12725 
12726 			/* map-align the end of the last copy entry... */
12727 			tmp_entry->vme_end =
12728 			    vm_map_round_page(tmp_entry->vme_end,
12729 			    VM_MAP_PAGE_MASK(src_map));
12730 			/* ... adjust for mis-aligned end of copy range */
12731 			adjustment =
12732 			    (vm_map_round_page((copy->offset +
12733 			    copy->size),
12734 			    VM_MAP_PAGE_MASK(src_map)) -
12735 			    vm_map_round_page((copy->offset +
12736 			    copy->size),
12737 			    PAGE_MASK));
12738 			if (adjustment) {
12739 				assert(page_aligned(adjustment));
12740 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12741 				tmp_entry->vme_end -= adjustment;
12742 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12743 			}
12744 
12745 			/*
12746 			 * Assert that the adjustments haven't exposed
12747 			 * more than was originally copied...
12748 			 */
12749 			assert(tmp_entry->vme_end <= original_end);
12750 			/*
12751 			 * ... and that it did not adjust outside of
12752 			 * a single 16K page.
12753 			 */
12754 			assert(vm_map_round_page(tmp_entry->vme_end,
12755 			    VM_MAP_PAGE_MASK(src_map)) ==
12756 			    vm_map_round_page(original_end,
12757 			    VM_MAP_PAGE_MASK(src_map)));
12758 		}
12759 	}
12760 
12761 	/* Fix-up start and end points in copy.  This is necessary */
12762 	/* when the various entries in the copy object were picked */
12763 	/* up from different sub-maps */
12764 
12765 	tmp_entry = vm_map_copy_first_entry(copy);
12766 	copy_size = 0; /* compute actual size */
12767 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12768 		assert(VM_MAP_PAGE_ALIGNED(
12769 			    copy_addr + (tmp_entry->vme_end -
12770 			    tmp_entry->vme_start),
12771 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12772 		assert(VM_MAP_PAGE_ALIGNED(
12773 			    copy_addr,
12774 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12775 
12776 		/*
12777 		 * The copy_entries will be injected directly into the
12778 		 * destination map and might not be "map aligned" there...
12779 		 */
12780 		tmp_entry->map_aligned = FALSE;
12781 
12782 		tmp_entry->vme_end = copy_addr +
12783 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12784 		tmp_entry->vme_start = copy_addr;
12785 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12786 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12787 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12788 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12789 	}
12790 
12791 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12792 	    copy_size < copy->size) {
12793 		/*
12794 		 * The actual size of the VM map copy is smaller than what
12795 		 * was requested by the caller.  This must be because some
12796 		 * PAGE_SIZE-sized pages are missing at the end of the last
12797 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12798 		 * The caller might not have been aware of those missing
12799 		 * pages and might not want to be aware of it, which is
12800 		 * fine as long as they don't try to access (and crash on)
12801 		 * those missing pages.
12802 		 * Let's adjust the size of the "copy", to avoid failing
12803 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12804 		 */
12805 		assert(vm_map_round_page(copy_size,
12806 		    VM_MAP_PAGE_MASK(src_map)) ==
12807 		    vm_map_round_page(copy->size,
12808 		    VM_MAP_PAGE_MASK(src_map)));
12809 		copy->size = copy_size;
12810 	}
12811 
12812 	*copy_result = copy;
12813 	return KERN_SUCCESS;
12814 
12815 #undef  RETURN
12816 }
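
/*
 * Illustrative sketch (not part of the build): using the internal
 * entry point directly for a destructive copyin, i.e. "moving" a
 * region out of a map.  VM_MAP_COPYIN_SRC_DESTROY is the same flag
 * vm_map_copyin_common() builds when src_destroy is TRUE; the helper
 * name and parameters are hypothetical.
 */
#if 0
static kern_return_t
example_move_out(
	vm_map_t                map,
	vm_map_address_ut       addr,
	vm_map_size_ut          len,
	vm_map_copy_t          *copy_result)   /* OUT */
{
	/* the source mapping is removed once the copy has been captured */
	return vm_map_copyin_internal(map, addr, len,
	           VM_MAP_COPYIN_SRC_DESTROY, copy_result);
}
#endif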
12817 
12818 kern_return_t
12819 vm_map_copy_extract(
12820 	vm_map_t                src_map,
12821 	vm_map_address_t        src_addr,
12822 	vm_map_size_t           len,
12823 	boolean_t               do_copy,
12824 	vm_map_copy_t           *copy_result,   /* OUT */
12825 	vm_prot_t               *cur_prot,      /* IN/OUT */
12826 	vm_prot_t               *max_prot,      /* IN/OUT */
12827 	vm_inherit_t            inheritance,
12828 	vm_map_kernel_flags_t   vmk_flags)
12829 {
12830 	vm_map_copy_t   copy;
12831 	kern_return_t   kr;
12832 	vm_prot_t required_cur_prot, required_max_prot;
12833 
12834 	/*
12835 	 *	Check for copies of zero bytes.
12836 	 */
12837 
12838 	if (len == 0) {
12839 		*copy_result = VM_MAP_COPY_NULL;
12840 		return KERN_SUCCESS;
12841 	}
12842 
12843 	/*
12844 	 *	Check that the end address doesn't overflow
12845 	 */
12846 	if (src_addr + len < src_addr) {
12847 		return KERN_INVALID_ADDRESS;
12848 	}
12849 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12850 		return KERN_INVALID_ADDRESS;
12851 	}
12852 
12853 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12854 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12855 	}
12856 
12857 	required_cur_prot = *cur_prot;
12858 	required_max_prot = *max_prot;
12859 
12860 	/*
12861 	 *	Allocate a header element for the list.
12862 	 *
12863 	 *	Use the start and end in the header to
12864 	 *	remember the endpoints prior to rounding.
12865 	 */
12866 
12867 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12868 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12869 	copy->offset = 0;
12870 	copy->size = len;
12871 
12872 	kr = vm_map_remap_extract(src_map,
12873 	    src_addr,
12874 	    len,
12875 	    do_copy,             /* copy */
12876 	    copy,
12877 	    cur_prot,            /* IN/OUT */
12878 	    max_prot,            /* IN/OUT */
12879 	    inheritance,
12880 	    vmk_flags);
12881 	if (kr != KERN_SUCCESS) {
12882 		vm_map_copy_discard(copy);
12883 		if ((kr == KERN_INVALID_ADDRESS ||
12884 		    kr == KERN_INVALID_ARGUMENT) &&
12885 		    src_map->terminated) {
12886 			/* tell the caller that this address space is gone */
12887 			kr = KERN_TERMINATED;
12888 		}
12889 		return kr;
12890 	}
12891 	if (required_cur_prot != VM_PROT_NONE) {
12892 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12893 		assert((*max_prot & required_max_prot) == required_max_prot);
12894 	}
12895 
12896 	*copy_result = copy;
12897 	return KERN_SUCCESS;
12898 }
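
/*
 * Illustrative sketch (not part of the build): extracting a range into
 * a copy object while requiring it to be at least readable.  On input
 * the protection arguments are the required minimums; on success they
 * are updated to the actual protections of the extracted range.  The
 * helper name and parameters are hypothetical.
 */
#if 0
static kern_return_t
example_extract_readable(
	vm_map_t                map,
	vm_map_address_t        addr,
	vm_map_size_t           len,
	vm_map_copy_t          *copy_result)   /* OUT */
{
	vm_prot_t cur = VM_PROT_READ;   /* required current protection */
	vm_prot_t max = VM_PROT_READ;   /* required maximum protection */

	return vm_map_copy_extract(map, addr, len,
	           FALSE,               /* do_copy: share, don't copy */
	           copy_result,
	           &cur,                /* IN/OUT */
	           &max,                /* IN/OUT */
	           VM_INHERIT_DEFAULT,
	           VM_MAP_KERNEL_FLAGS_NONE);
}
#endif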
12899 
12900 static void
12901 vm_map_fork_share(
12902 	vm_map_t        old_map,
12903 	vm_map_entry_t  old_entry,
12904 	vm_map_t        new_map)
12905 {
12906 	vm_object_t     object;
12907 	vm_map_entry_t  new_entry;
12908 
12909 	/*
12910 	 *	New sharing code.  New map entry
12911 	 *	references original object.  Internal
12912 	 *	objects use asynchronous copy algorithm for
12913 	 *	future copies.  First make sure we have
12914 	 *	the right object.  If we need a shadow,
12915 	 *	or someone else already has one, then
12916 	 *	make a new shadow and share it.
12917 	 */
12918 
12919 	if (!old_entry->is_sub_map) {
12920 		object = VME_OBJECT(old_entry);
12921 	}
12922 
12923 	if (old_entry->is_sub_map) {
12924 		assert(old_entry->wired_count == 0);
12925 #ifndef NO_NESTED_PMAP
12926 #if !PMAP_FORK_NEST
12927 		if (old_entry->use_pmap) {
12928 			kern_return_t   result;
12929 
12930 			result = pmap_nest(new_map->pmap,
12931 			    (VME_SUBMAP(old_entry))->pmap,
12932 			    (addr64_t)old_entry->vme_start,
12933 			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12934 			if (result) {
12935 				panic("vm_map_fork_share: pmap_nest failed!");
12936 			}
12937 		}
12938 #endif /* !PMAP_FORK_NEST */
12939 #endif  /* NO_NESTED_PMAP */
12940 	} else if (object == VM_OBJECT_NULL) {
12941 		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12942 		    old_entry->vme_start));
12943 		VME_OFFSET_SET(old_entry, 0);
12944 		VME_OBJECT_SET(old_entry, object, false, 0);
12945 		old_entry->use_pmap = TRUE;
12946 //		assert(!old_entry->needs_copy);
12947 	} else if (object->copy_strategy !=
12948 	    MEMORY_OBJECT_COPY_SYMMETRIC) {
12949 		/*
12950 		 *	We are already using an asymmetric
12951 		 *	copy, and therefore we already have
12952 		 *	the right object.
12953 		 */
12954 
12955 		assert(!old_entry->needs_copy);
12956 	} else if (old_entry->needs_copy ||       /* case 1 */
12957 	    object->shadowed ||                 /* case 2 */
12958 	    (!object->true_share &&             /* case 3 */
12959 	    !old_entry->is_shared &&
12960 	    (object->vo_size >
12961 	    (vm_map_size_t)(old_entry->vme_end -
12962 	    old_entry->vme_start)))) {
12963 		bool is_writable;
12964 
12965 		/*
12966 		 *	We need to create a shadow.
12967 		 *	There are three cases here.
12968 		 *	In the first case, we need to
12969 		 *	complete a deferred symmetrical
12970 		 *	copy that we participated in.
12971 		 *	In the second and third cases,
12972 		 *	we need to create the shadow so
12973 		 *	that changes that we make to the
12974 		 *	object do not interfere with
12975 		 *	any symmetrical copies which
		 *	have occurred (case 2) or which
12977 		 *	might occur (case 3).
12978 		 *
12979 		 *	The first case is when we had
12980 		 *	deferred shadow object creation
12981 		 *	via the entry->needs_copy mechanism.
12982 		 *	This mechanism only works when
12983 		 *	only one entry points to the source
12984 		 *	object, and we are about to create
12985 		 *	a second entry pointing to the
12986 		 *	same object. The problem is that
12987 		 *	there is no way of mapping from
12988 		 *	an object to the entries pointing
12989 		 *	to it. (Deferred shadow creation
		 *	works with one entry because it occurs
12991 		 *	at fault time, and we walk from the
12992 		 *	entry to the object when handling
12993 		 *	the fault.)
12994 		 *
12995 		 *	The second case is when the object
12996 		 *	to be shared has already been copied
12997 		 *	with a symmetric copy, but we point
12998 		 *	directly to the object without
12999 		 *	needs_copy set in our entry. (This
13000 		 *	can happen because different ranges
13001 		 *	of an object can be pointed to by
13002 		 *	different entries. In particular,
13003 		 *	a single entry pointing to an object
13004 		 *	can be split by a call to vm_inherit,
13005 		 *	which, combined with task_create, can
13006 		 *	result in the different entries
13007 		 *	having different needs_copy values.)
13008 		 *	The shadowed flag in the object allows
13009 		 *	us to detect this case. The problem
13010 		 *	with this case is that if this object
13011 		 *	has or will have shadows, then we
13012 		 *	must not perform an asymmetric copy
13013 		 *	of this object, since such a copy
13014 		 *	allows the object to be changed, which
13015 		 *	will break the previous symmetrical
13016 		 *	copies (which rely upon the object
13017 		 *	not changing). In a sense, the shadowed
13018 		 *	flag says "don't change this object".
13019 		 *	We fix this by creating a shadow
13020 		 *	object for this object, and sharing
13021 		 *	that. This works because we are free
13022 		 *	to change the shadow object (and thus
13023 		 *	to use an asymmetric copy strategy);
13024 		 *	this is also semantically correct,
13025 		 *	since this object is temporary, and
13026 		 *	therefore a copy of the object is
13027 		 *	as good as the object itself. (This
13028 		 *	is not true for permanent objects,
13029 		 *	since the pager needs to see changes,
13030 		 *	which won't happen if the changes
13031 		 *	are made to a copy.)
13032 		 *
13033 		 *	The third case is when the object
13034 		 *	to be shared has parts sticking
13035 		 *	outside of the entry we're working
13036 		 *	with, and thus may in the future
13037 		 *	be subject to a symmetrical copy.
13038 		 *	(This is a preemptive version of
13039 		 *	case 2.)
13040 		 */
13041 		VME_OBJECT_SHADOW(old_entry,
13042 		    (vm_map_size_t) (old_entry->vme_end -
13043 		    old_entry->vme_start),
13044 		    vm_map_always_shadow(old_map));
13045 
13046 		/*
13047 		 *	If we're making a shadow for other than
13048 		 *	copy on write reasons, then we have
13049 		 *	to remove write permission.
13050 		 */
13051 
13052 		is_writable = false;
13053 		if (old_entry->protection & VM_PROT_WRITE) {
13054 			is_writable = true;
13055 #if __arm64e__
13056 		} else if (old_entry->used_for_tpro) {
13057 			is_writable = true;
13058 #endif /* __arm64e__ */
13059 		}
13060 		if (!old_entry->needs_copy && is_writable) {
13061 			vm_prot_t prot;
13062 
13063 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13064 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13065 				    __FUNCTION__, old_map, old_map->pmap,
13066 				    old_entry,
13067 				    (uint64_t)old_entry->vme_start,
13068 				    (uint64_t)old_entry->vme_end,
13069 				    old_entry->protection);
13070 			}
13071 
13072 			prot = old_entry->protection & ~VM_PROT_WRITE;
13073 
13074 			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13075 				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13076 				    __FUNCTION__, old_map, old_map->pmap,
13077 				    old_entry,
13078 				    (uint64_t)old_entry->vme_start,
13079 				    (uint64_t)old_entry->vme_end,
13080 				    prot);
13081 			}
13082 
13083 			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13084 				prot |= VM_PROT_EXECUTE;
13085 			}
13086 
13087 
13088 			if (old_map->mapped_in_other_pmaps) {
13089 				vm_object_pmap_protect(
13090 					VME_OBJECT(old_entry),
13091 					VME_OFFSET(old_entry),
13092 					(old_entry->vme_end -
13093 					old_entry->vme_start),
13094 					PMAP_NULL,
13095 					PAGE_SIZE,
13096 					old_entry->vme_start,
13097 					prot);
13098 			} else {
13099 				pmap_protect(old_map->pmap,
13100 				    old_entry->vme_start,
13101 				    old_entry->vme_end,
13102 				    prot);
13103 			}
13104 		}
13105 
13106 		old_entry->needs_copy = FALSE;
13107 		object = VME_OBJECT(old_entry);
13108 	}
13109 
13110 
13111 	/*
13112 	 *	If object was using a symmetric copy strategy,
13113 	 *	change its copy strategy to the default
13114 	 *	asymmetric copy strategy, which is copy_delay
13115 	 *	in the non-norma case and copy_call in the
13116 	 *	norma case. Bump the reference count for the
13117 	 *	new entry.
13118 	 */
13119 
13120 	if (old_entry->is_sub_map) {
13121 		vm_map_reference(VME_SUBMAP(old_entry));
13122 	} else {
13123 		vm_object_lock(object);
13124 		vm_object_reference_locked(object);
13125 		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13126 			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13127 		}
13128 		vm_object_unlock(object);
13129 	}
13130 
13131 	/*
13132 	 *	Clone the entry, using object ref from above.
13133 	 *	Mark both entries as shared.
13134 	 */
13135 
13136 	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13137 	vm_map_entry_copy(old_map, new_entry, old_entry);
13138 	old_entry->is_shared = TRUE;
13139 	new_entry->is_shared = TRUE;
13140 
13141 	/*
13142 	 * We're dealing with a shared mapping, so the resulting mapping
13143 	 * should inherit some of the original mapping's accounting settings.
13144 	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13145 	 * "use_pmap" should stay the same as before (if it hasn't been reset
13146 	 * to TRUE when we cleared "iokit_acct").
13147 	 */
13148 	assert(!new_entry->iokit_acct);
13149 
13150 	/*
13151 	 *	If old entry's inheritence is VM_INHERIT_NONE,
13152 	 *	If the old entry's inheritance is VM_INHERIT_NONE,
13153 	 *	the new entry is for a corpse fork: remove the
13154 	 */
13155 	if (old_entry->inheritance == VM_INHERIT_NONE) {
13156 		new_entry->protection &= ~VM_PROT_WRITE;
13157 		new_entry->max_protection &= ~VM_PROT_WRITE;
13158 	}
13159 
13160 	/*
13161 	 *	Insert the entry into the new map -- we
13162 	 *	know we're inserting at the end of the new
13163 	 *	map.
13164 	 */
13165 
13166 	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13167 	    VM_MAP_KERNEL_FLAGS_NONE);
13168 
13169 	/*
13170 	 *	Update the physical map
13171 	 */
13172 
13173 	if (old_entry->is_sub_map) {
13174 		/* Bill Angell pmap support goes here */
13175 	} else {
13176 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13177 		    old_entry->vme_end - old_entry->vme_start,
13178 		    old_entry->vme_start);
13179 	}
13180 }
13181 
13182 static boolean_t
13183 vm_map_fork_copy(
13184 	vm_map_t        old_map,
13185 	vm_map_entry_t  *old_entry_p,
13186 	vm_map_t        new_map,
13187 	int             vm_map_copyin_flags)
13188 {
13189 	vm_map_entry_t old_entry = *old_entry_p;
13190 	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13191 	vm_map_offset_t start = old_entry->vme_start;
13192 	vm_map_copy_t copy;
13193 	vm_map_entry_t last = vm_map_last_entry(new_map);
13194 
13195 	vm_map_unlock(old_map);
13196 	/*
13197 	 *	Use maxprot version of copyin because we
13198 	 *	care about whether this memory can ever
13199 	 *	be accessed, not just whether it's accessible
13200 	 *	right now.
13201 	 */
13202 	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13203 	if (vm_map_copyin_internal(old_map, start, entry_size,
13204 	    vm_map_copyin_flags, &copy)
13205 	    != KERN_SUCCESS) {
13206 		/*
13207 		 *	The map might have changed while it
13208 		 *	was unlocked, check it again.  Skip
13209 		 *	any blank space or permanently
13210 		 *	unreadable region.
13211 		 */
13212 		vm_map_lock(old_map);
13213 		if (!vm_map_lookup_entry(old_map, start, &last) ||
13214 		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13215 			last = last->vme_next;
13216 		}
13217 		*old_entry_p = last;
13218 
13219 		/*
13220 		 * XXX	For some error returns, want to
13221 		 * XXX	skip to the next element.  Note
13222 		 *	that INVALID_ADDRESS and
13223 		 *	PROTECTION_FAILURE are handled above.
13224 		 */
13225 
13226 		return FALSE;
13227 	}
13228 
13229 	/*
13230 	 * Assert that the vm_map_copy is coming from the right
13231 	 * zone and hasn't been forged
13232 	 */
13233 	vm_map_copy_require(copy);
13234 
13235 	/*
13236 	 *	Insert the copy into the new map
13237 	 */
13238 	vm_map_copy_insert(new_map, last, copy);
13239 
13240 	/*
13241 	 *	Pick up the traversal at the end of
13242 	 *	the copied region.
13243 	 */
13244 
13245 	vm_map_lock(old_map);
13246 	start += entry_size;
13247 	if (!vm_map_lookup_entry(old_map, start, &last)) {
13248 		last = last->vme_next;
13249 	} else {
13250 		if (last->vme_start == start) {
13251 			/*
13252 			 * No need to clip here and we don't
13253 			 * want to cause any unnecessary
13254 			 * unnesting...
13255 			 */
13256 		} else {
13257 			vm_map_clip_start(old_map, last, start);
13258 		}
13259 	}
13260 	*old_entry_p = last;
13261 
13262 	return TRUE;
13263 }
13264 
13265 #if PMAP_FORK_NEST
13266 #define PMAP_FORK_NEST_DEBUG 0
13267 static inline void
13268 vm_map_fork_unnest(
13269 	pmap_t new_pmap,
13270 	vm_map_offset_t pre_nested_start,
13271 	vm_map_offset_t pre_nested_end,
13272 	vm_map_offset_t start,
13273 	vm_map_offset_t end)
13274 {
13275 	kern_return_t kr;
13276 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13277 
13278 	assertf(pre_nested_start <= pre_nested_end,
13279 	    "pre_nested start 0x%llx end 0x%llx",
13280 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13281 	assertf(start <= end,
13282 	    "start 0x%llx end 0x%llx",
13283 	    (uint64_t) start, (uint64_t)end);
13284 
13285 	if (pre_nested_start == pre_nested_end) {
13286 		/* nothing was pre-nested: done */
13287 		return;
13288 	}
13289 	if (end <= pre_nested_start) {
13290 		/* fully before pre-nested range: done */
13291 		return;
13292 	}
13293 	if (start >= pre_nested_end) {
13294 		/* fully after pre-nested range: done */
13295 		return;
13296 	}
13297 	/* ignore parts of range outside of pre_nested range */
13298 	if (start < pre_nested_start) {
13299 		start = pre_nested_start;
13300 	}
13301 	if (end > pre_nested_end) {
13302 		end = pre_nested_end;
13303 	}
13304 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13305 	start_unnest = start & ~nesting_mask;
13306 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13307 	kr = pmap_unnest(new_pmap,
13308 	    (addr64_t)start_unnest,
13309 	    (uint64_t)(end_unnest - start_unnest));
13310 #if PMAP_FORK_NEST_DEBUG
13311 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13312 #endif /* PMAP_FORK_NEST_DEBUG */
13313 	assertf(kr == KERN_SUCCESS,
13314 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13315 	    (uint64_t)start, (uint64_t)end, new_pmap,
13316 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13317 	    kr);
13318 }
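
/*
 * Worked example (hypothetical values): with a pre-nested range of
 * [0x180000000, 0x200000000) and a 32MB nesting granule, a hole of
 * [0x190001000, 0x190003000) lies inside the pre-nested range, so it
 * is widened to granule boundaries before pmap_unnest():
 *	start_unnest = 0x190000000, end_unnest = 0x192000000.
 * Ranges entirely outside [pre_nested_start, pre_nested_end) are left
 * alone.
 */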
13319 #endif /* PMAP_FORK_NEST */
13320 
13321 void
13322 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13323 {
13324 	new_map->size_limit = old_map->size_limit;
13325 	new_map->data_limit = old_map->data_limit;
13326 	new_map->user_wire_limit = old_map->user_wire_limit;
13327 	new_map->reserved_regions = old_map->reserved_regions;
13328 }
13329 
13330 /*
13331  *	vm_map_fork:
13332  *
13333  *	Create and return a new map based on the old
13334  *	map, according to the inheritance values on the
13335  *	regions in that map and the options.
13336  *
13337  *	The source map must not be locked.
13338  */
13339 vm_map_t
13340 vm_map_fork(
13341 	ledger_t        ledger,
13342 	vm_map_t        old_map,
13343 	int             options)
13344 {
13345 	pmap_t          new_pmap;
13346 	vm_map_t        new_map;
13347 	vm_map_entry_t  old_entry;
13348 	vm_map_size_t   new_size = 0, entry_size;
13349 	vm_map_entry_t  new_entry;
13350 	boolean_t       src_needs_copy;
13351 	boolean_t       new_entry_needs_copy;
13352 	boolean_t       pmap_is64bit;
13353 	int             vm_map_copyin_flags;
13354 	vm_inherit_t    old_entry_inheritance;
13355 	int             map_create_options;
13356 	kern_return_t   footprint_collect_kr;
13357 
13358 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13359 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13360 	    VM_MAP_FORK_CORPSE_FOOTPRINT |
13361 	    VM_MAP_FORK_SHARE_IF_OWNED)) {
13362 		/* unsupported option */
13363 		return VM_MAP_NULL;
13364 	}
13365 
13366 	pmap_is64bit =
13367 #if defined(__i386__) || defined(__x86_64__)
13368 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13369 #elif defined(__arm64__)
13370 	    old_map->pmap->is_64bit;
13371 #else
13372 #error Unknown architecture.
13373 #endif
13374 
13375 	unsigned int pmap_flags = 0;
13376 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13377 #if defined(HAS_APPLE_PAC)
13378 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13379 #endif
13380 #if CONFIG_ROSETTA
13381 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13382 #endif
13383 #if PMAP_CREATE_FORCE_4K_PAGES
13384 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13385 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13386 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13387 	}
13388 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13389 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13390 	if (new_pmap == NULL) {
13391 		return VM_MAP_NULL;
13392 	}
13393 
13394 	vm_map_reference(old_map);
13395 	vm_map_lock(old_map);
13396 
13397 	map_create_options = 0;
13398 	if (old_map->hdr.entries_pageable) {
13399 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13400 	}
13401 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13402 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13403 		footprint_collect_kr = KERN_SUCCESS;
13404 	}
13405 	new_map = vm_map_create_options(new_pmap,
13406 	    old_map->min_offset,
13407 	    old_map->max_offset,
13408 	    map_create_options);
13409 
13410 	/* inherit cs_enforcement */
13411 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13412 
13413 	vm_map_lock(new_map);
13414 	vm_commit_pagezero_status(new_map);
13415 	/* inherit the parent map's page size */
13416 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13417 
13418 	/* inherit the parent rlimits */
13419 	vm_map_inherit_limits(new_map, old_map);
13420 
13421 #if CONFIG_MAP_RANGES
13422 	/* inherit the parent map's VM ranges */
13423 	vm_map_range_fork(new_map, old_map);
13424 #endif
13425 
13426 #if CODE_SIGNING_MONITOR
13427 	/* Prepare the monitor for the fork */
13428 	csm_fork_prepare(old_map->pmap, new_pmap);
13429 #endif
13430 
13431 #if PMAP_FORK_NEST
13432 	/*
13433 	 * Pre-nest the shared region's pmap.
13434 	 */
13435 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13436 	pmap_fork_nest(old_map->pmap, new_pmap,
13437 	    &pre_nested_start, &pre_nested_end);
13438 #if PMAP_FORK_NEST_DEBUG
13439 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13440 #endif /* PMAP_FORK_NEST_DEBUG */
13441 #endif /* PMAP_FORK_NEST */
13442 
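	/*
	 * When PMAP_FORK_NEST is configured, the child pmap now shares the
	 * nested shared-region translation tables for the range
	 * [pre_nested_start, pre_nested_end).  The loop below keeps that
	 * nesting only for entries that remain nested submaps in the child,
	 * unnests any holes or non-nested ranges it walks over, and the
	 * trailing range is unnested after the loop.
	 */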
13443 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13444 		/*
13445 		 * Abort any corpse collection if the system is shutting down.
13446 		 */
13447 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13448 		    get_system_inshutdown()) {
13449 #if PMAP_FORK_NEST
13450 			new_entry = vm_map_last_entry(new_map);
13451 			if (new_entry == vm_map_to_entry(new_map)) {
13452 				/* unnest all that was pre-nested */
13453 				vm_map_fork_unnest(new_pmap,
13454 				    pre_nested_start, pre_nested_end,
13455 				    vm_map_min(new_map), vm_map_max(new_map));
13456 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13457 				/* unnest hole at the end, if pre-nested */
13458 				vm_map_fork_unnest(new_pmap,
13459 				    pre_nested_start, pre_nested_end,
13460 				    new_entry->vme_end, vm_map_max(new_map));
13461 			}
13462 #endif /* PMAP_FORK_NEST */
13463 			vm_map_corpse_footprint_collect_done(new_map);
13464 			vm_map_unlock(new_map);
13465 			vm_map_unlock(old_map);
13466 			vm_map_deallocate(new_map);
13467 			vm_map_deallocate(old_map);
13468 			printf("Aborting corpse map due to system shutdown\n");
13469 			return VM_MAP_NULL;
13470 		}
13471 
13472 		entry_size = old_entry->vme_end - old_entry->vme_start;
13473 
13474 #if PMAP_FORK_NEST
13475 		/*
13476 		 * Undo any unnecessary pre-nesting.
13477 		 */
13478 		vm_map_offset_t prev_end;
13479 		if (old_entry == vm_map_first_entry(old_map)) {
13480 			prev_end = vm_map_min(old_map);
13481 		} else {
13482 			prev_end = old_entry->vme_prev->vme_end;
13483 		}
13484 		if (prev_end < old_entry->vme_start) {
13485 			/* unnest hole before this entry, if pre-nested */
13486 			vm_map_fork_unnest(new_pmap,
13487 			    pre_nested_start, pre_nested_end,
13488 			    prev_end, old_entry->vme_start);
13489 		}
13490 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13491 			/* keep this entry nested in the child */
13492 #if PMAP_FORK_NEST_DEBUG
13493 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13494 #endif /* PMAP_FORK_NEST_DEBUG */
13495 		} else {
13496 			/* undo nesting for this entry, if pre-nested */
13497 			vm_map_fork_unnest(new_pmap,
13498 			    pre_nested_start, pre_nested_end,
13499 			    old_entry->vme_start, old_entry->vme_end);
13500 		}
13501 #endif /* PMAP_FORK_NEST */
13502 
13503 		old_entry_inheritance = old_entry->inheritance;
13504 		/*
13505 		 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13506 		 * share readable VM_INHERIT_NONE entries that are not backed by a
13507 		 * device pager.
13508 		 */
13509 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13510 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13511 		    (old_entry->protection & VM_PROT_READ) &&
13512 		    !(!old_entry->is_sub_map &&
13513 		    VME_OBJECT(old_entry) != NULL &&
13514 		    VME_OBJECT(old_entry)->pager != NULL &&
13515 		    is_device_pager_ops(
13516 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13517 			old_entry_inheritance = VM_INHERIT_SHARE;
13518 		}
13519 		if (old_entry_inheritance == VM_INHERIT_COPY &&
13520 		    (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13521 		    !old_entry->is_sub_map &&
13522 		    VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13523 			vm_object_t object;
13524 			task_t owner;
13525 			object = VME_OBJECT(old_entry);
13526 			owner = VM_OBJECT_OWNER(object);
13527 			if (owner != TASK_NULL &&
13528 			    owner->map == old_map) {
13529 				/*
13530 				 * This mapping points at a VM object owned
13531 				 * by the task being forked.
13532 				 * Some tools reporting memory accounting
13533 				 * info rely on the object ID, so share this
13534 				 * mapping instead of copying, to make the
13535 				 * corpse look exactly like the original
13536 				 * task in that respect.
13537 				 */
13538 				assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13539 				old_entry_inheritance = VM_INHERIT_SHARE;
13540 			}
13541 		}
13542 
13543 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13544 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13545 		    footprint_collect_kr == KERN_SUCCESS) {
13546 			/*
13547 			 * The corpse won't have old_map->pmap to query
13548 			 * footprint information, so collect that data now
13549 			 * and store it in new_map->vmmap_corpse_footprint
13550 			 * for later autopsy.
13551 			 */
13552 			footprint_collect_kr =
13553 			    vm_map_corpse_footprint_collect(old_map,
13554 			    old_entry,
13555 			    new_map);
13556 		}
13557 
13558 		switch (old_entry_inheritance) {
13559 		case VM_INHERIT_NONE:
13560 			break;
13561 
13562 		case VM_INHERIT_SHARE:
13563 			vm_map_fork_share(old_map, old_entry, new_map);
13564 			new_size += entry_size;
13565 			break;
13566 
13567 		case VM_INHERIT_COPY:
13568 
13569 			/*
13570 			 *	Inline the copy_quickly case;
13571 			 *	upon failure, fall back on call
13572 			 *	to vm_map_fork_copy.
13573 			 */
13574 
13575 			if (old_entry->is_sub_map) {
13576 				break;
13577 			}
13578 			if ((old_entry->wired_count != 0) ||
13579 			    ((VME_OBJECT(old_entry) != NULL) &&
13580 			    (VME_OBJECT(old_entry)->true_share))) {
13581 				goto slow_vm_map_fork_copy;
13582 			}
13583 
13584 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13585 			vm_map_entry_copy(old_map, new_entry, old_entry);
13586 			if (old_entry->vme_permanent) {
13587 				/* inherit "permanent" on fork() */
13588 				new_entry->vme_permanent = TRUE;
13589 			}
13590 
13591 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13592 				new_map->jit_entry_exists = TRUE;
13593 			}
13594 
13595 			if (new_entry->is_sub_map) {
13596 				/* clear address space specifics */
13597 				new_entry->use_pmap = FALSE;
13598 			} else {
13599 				/*
13600 				 * We're dealing with a copy-on-write operation,
13601 				 * so the resulting mapping should not inherit
13602 				 * the original mapping's accounting settings.
13603 				 * "iokit_acct" should have been cleared in
13604 				 * vm_map_entry_copy().
13605 				 * "use_pmap" should be reset to its default
13606 				 * (TRUE) so that the new mapping gets
13607 				 * accounted for in the task's memory footprint.
13608 				 */
13609 				assert(!new_entry->iokit_acct);
13610 				new_entry->use_pmap = TRUE;
13611 			}
13612 
13613 			if (!vm_object_copy_quickly(
13614 				    VME_OBJECT(new_entry),
13615 				    VME_OFFSET(old_entry),
13616 				    (old_entry->vme_end -
13617 				    old_entry->vme_start),
13618 				    &src_needs_copy,
13619 				    &new_entry_needs_copy)) {
13620 				vm_map_entry_dispose(new_entry);
13621 				goto slow_vm_map_fork_copy;
13622 			}
13623 
13624 			/*
13625 			 *	Handle copy-on-write obligations
13626 			 */
13627 
13628 			if (src_needs_copy && !old_entry->needs_copy) {
13629 				vm_prot_t prot;
13630 
13631 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13632 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13633 					    __FUNCTION__,
13634 					    old_map, old_map->pmap, old_entry,
13635 					    (uint64_t)old_entry->vme_start,
13636 					    (uint64_t)old_entry->vme_end,
13637 					    old_entry->protection);
13638 				}
13639 
13640 				prot = old_entry->protection & ~VM_PROT_WRITE;
13641 
13642 				if (override_nx(old_map, VME_ALIAS(old_entry))
13643 				    && prot) {
13644 					prot |= VM_PROT_EXECUTE;
13645 				}
13646 
13647 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13648 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13649 					    __FUNCTION__,
13650 					    old_map, old_map->pmap, old_entry,
13651 					    (uint64_t)old_entry->vme_start,
13652 					    (uint64_t)old_entry->vme_end,
13653 					    prot);
13654 				}
13655 
13656 				vm_object_pmap_protect(
13657 					VME_OBJECT(old_entry),
13658 					VME_OFFSET(old_entry),
13659 					(old_entry->vme_end -
13660 					old_entry->vme_start),
13661 					((old_entry->is_shared
13662 					|| old_map->mapped_in_other_pmaps)
13663 					? PMAP_NULL :
13664 					old_map->pmap),
13665 					VM_MAP_PAGE_SIZE(old_map),
13666 					old_entry->vme_start,
13667 					prot);
13668 
13669 				assert(old_entry->wired_count == 0);
13670 				old_entry->needs_copy = TRUE;
13671 			}
13672 			new_entry->needs_copy = new_entry_needs_copy;
13673 
13674 			/*
13675 			 *	Insert the entry at the end
13676 			 *	of the map.
13677 			 */
13678 
13679 			vm_map_store_entry_link(new_map,
13680 			    vm_map_last_entry(new_map),
13681 			    new_entry,
13682 			    VM_MAP_KERNEL_FLAGS_NONE);
13683 			new_size += entry_size;
13684 			break;
13685 
13686 slow_vm_map_fork_copy:
13687 			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13688 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13689 				vm_map_copyin_flags |=
13690 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13691 			}
13692 			if (vm_map_fork_copy(old_map,
13693 			    &old_entry,
13694 			    new_map,
13695 			    vm_map_copyin_flags)) {
13696 				new_size += entry_size;
13697 			}
13698 			continue;
13699 		}
13700 		old_entry = old_entry->vme_next;
13701 	}
13702 
13703 #if PMAP_FORK_NEST
13704 	new_entry = vm_map_last_entry(new_map);
13705 	if (new_entry == vm_map_to_entry(new_map)) {
13706 		/* unnest all that was pre-nested */
13707 		vm_map_fork_unnest(new_pmap,
13708 		    pre_nested_start, pre_nested_end,
13709 		    vm_map_min(new_map), vm_map_max(new_map));
13710 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13711 		/* unnest hole at the end, if pre-nested */
13712 		vm_map_fork_unnest(new_pmap,
13713 		    pre_nested_start, pre_nested_end,
13714 		    new_entry->vme_end, vm_map_max(new_map));
13715 	}
13716 #endif /* PMAP_FORK_NEST */
13717 
13718 #if defined(__arm64__)
13719 	pmap_insert_commpage(new_map->pmap);
13720 #endif /* __arm64__ */
13721 
13722 	new_map->size = new_size;
13723 
13724 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13725 		vm_map_corpse_footprint_collect_done(new_map);
13726 	}
13727 
13728 	/* Propagate JIT entitlement for the pmap layer. */
13729 	if (pmap_get_jit_entitled(old_map->pmap)) {
13730 		/* Tell the pmap that it supports JIT. */
13731 		pmap_set_jit_entitled(new_map->pmap);
13732 	}
13733 
13734 	/* Propagate TPRO settings for the pmap layer */
13735 	if (pmap_get_tpro(old_map->pmap)) {
13736 		/* Tell the pmap that it supports TPRO */
13737 		pmap_set_tpro(new_map->pmap);
13738 	}
13739 
13740 
13741 	vm_map_unlock(new_map);
13742 	vm_map_unlock(old_map);
13743 	vm_map_deallocate(old_map);
13744 
13745 	return new_map;
13746 }
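/*
 * Usage sketch (illustrative only; "child_ledger" and "parent_map" are
 * placeholder names, not the actual callers' variables):
 *
 *	vm_map_t child_map;
 *
 *	child_map = vm_map_fork(child_ledger, parent_map, 0);
 *	if (child_map == VM_MAP_NULL) {
 *		// unsupported option bits or pmap creation failure
 *	}
 *
 * A caller would then install the returned map as the new task's address
 * space.
 */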
13747 
13748 /*
13749  * vm_map_exec:
13750  *
13751  *      Setup the "new_map" with the proper execution environment according
13752  *	to the type of executable (platform, 64bit, chroot environment).
13753  *	Map the comm page and shared region, etc...
13754  */
13755 kern_return_t
13756 vm_map_exec(
13757 	vm_map_t        new_map,
13758 	task_t          task,
13759 	boolean_t       is64bit,
13760 	void            *fsroot,
13761 	cpu_type_t      cpu,
13762 	cpu_subtype_t   cpu_subtype,
13763 	boolean_t       reslide,
13764 	boolean_t       is_driverkit,
13765 	uint32_t        rsr_version)
13766 {
13767 	SHARED_REGION_TRACE_DEBUG(
13768 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13769 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13770 		(void *)VM_KERNEL_ADDRPERM(new_map),
13771 		(void *)VM_KERNEL_ADDRPERM(task),
13772 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13773 		cpu,
13774 		cpu_subtype));
13775 	(void) vm_commpage_enter(new_map, task, is64bit);
13776 
13777 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13778 
13779 	SHARED_REGION_TRACE_DEBUG(
13780 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13781 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13782 		(void *)VM_KERNEL_ADDRPERM(new_map),
13783 		(void *)VM_KERNEL_ADDRPERM(task),
13784 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13785 		cpu,
13786 		cpu_subtype));
13787 
13788 	/*
13789 	 * Some devices have region(s) of memory that shouldn't get allocated by
13790 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13791 	 * of the regions that need to be reserved to prevent any allocations in
13792 	 * those regions.
13793 	 */
13794 	kern_return_t kr = KERN_FAILURE;
13795 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13796 	vmk_flags.vmkf_beyond_max = true;
13797 
13798 	const struct vm_reserved_region *regions = NULL;
13799 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13800 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13801 
13802 	for (size_t i = 0; i < num_regions; ++i) {
13803 		vm_map_offset_t address = regions[i].vmrr_addr;
13804 
13805 		kr = vm_map_enter(
13806 			new_map,
13807 			&address,
13808 			regions[i].vmrr_size,
13809 			(vm_map_offset_t)0,
13810 			vmk_flags,
13811 			VM_OBJECT_NULL,
13812 			(vm_object_offset_t)0,
13813 			FALSE,
13814 			VM_PROT_NONE,
13815 			VM_PROT_NONE,
13816 			VM_INHERIT_COPY);
13817 
13818 		if (kr != KERN_SUCCESS) {
13819 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13820 		}
13821 	}
13822 
13823 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13824 
13825 	return KERN_SUCCESS;
13826 }
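/*
 * Note that vm_map_exec() itself only establishes the comm page, the shared
 * region and any machine-reserved regions in "new_map"; mapping the
 * executable itself is expected to happen elsewhere in the exec path.
 */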
13827 
13828 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13829 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13830 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13831 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13832 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13833 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13834 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13835 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13836 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13837 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13838 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13839 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13840 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13841 /*
13842  *	vm_map_lookup_and_lock_object:
13843  *
13844  *	Finds the VM object, offset, and
13845  *	protection for a given virtual address in the
13846  *	specified map, assuming a page fault of the
13847  *	type specified.
13848  *
13849  *	Returns the (object, offset, protection) for
13850  *	this address, whether it is wired down, and whether
13851  *	this map has the only reference to the data in question.
13852  *	In order to later verify this lookup, a "version"
13853  *	is returned.
13854  *	If contended != NULL, *contended will be set to
13855  *	true iff the thread had to spin or block to acquire
13856  *	an exclusive lock.
13857  *
13858  *	The map MUST be locked by the caller and WILL be
13859  *	locked on exit.  In order to guarantee the
13860  *	existence of the returned object, it is returned
13861  *	locked.
13862  *
13863  *	If a lookup is requested with "write protection"
13864  *	specified, the map may be changed to perform virtual
13865  *	copying operations, although the data referenced will
13866  *	remain the same.
13867  */
13868 kern_return_t
13869 vm_map_lookup_and_lock_object(
13870 	vm_map_t                *var_map,       /* IN/OUT */
13871 	vm_map_offset_t         vaddr,
13872 	vm_prot_t               fault_type,
13873 	int                     object_lock_type,
13874 	vm_map_version_t        *out_version,   /* OUT */
13875 	vm_object_t             *object,        /* OUT */
13876 	vm_object_offset_t      *offset,        /* OUT */
13877 	vm_prot_t               *out_prot,      /* OUT */
13878 	boolean_t               *wired,         /* OUT */
13879 	vm_object_fault_info_t  fault_info,     /* OUT */
13880 	vm_map_t                *real_map,      /* OUT */
13881 	bool                    *contended)     /* OUT */
13882 {
13883 	vm_map_entry_t                  entry;
13884 	vm_map_t                        map = *var_map;
13885 	vm_map_t                        old_map = *var_map;
13886 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13887 	vm_map_offset_t                 cow_parent_vaddr = 0;
13888 	vm_map_offset_t                 old_start = 0;
13889 	vm_map_offset_t                 old_end = 0;
13890 	vm_prot_t                       prot;
13891 	boolean_t                       mask_protections;
13892 	boolean_t                       force_copy;
13893 	boolean_t                       no_force_copy_if_executable;
13894 	boolean_t                       submap_needed_copy;
13895 	vm_prot_t                       original_fault_type;
13896 	vm_map_size_t                   fault_page_mask;
13897 
13898 	/*
13899 	 * VM_PROT_IS_MASK means that the caller wants us to use "fault_type"
13900 	 * as a mask against the mapping's actual protections, not as an
13901 	 * absolute value.
13902 	 */
13903 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13904 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13905 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13906 	fault_type &= VM_PROT_ALL;
13907 	original_fault_type = fault_type;
13908 	if (contended) {
13909 		*contended = false;
13910 	}
13911 
13912 	*real_map = map;
13913 
13914 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13915 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
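	/*
	 * For example, with a 4K user map (VM_MAP_PAGE_MASK(map) == 0xFFF)
	 * on a 16K kernel (PAGE_MASK == 0x3FFF), fault_page_mask is 0xFFF
	 * and a fault at 0x100003abc is looked up at 0x100003000: lookups
	 * happen at the smaller of the two page granularities.
	 */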
13916 
13917 RetryLookup:
13918 	fault_type = original_fault_type;
13919 
13920 	/*
13921 	 *	If the map has an interesting hint, try it before calling
13922 	 *	full blown lookup routine.
13923 	 */
13924 	entry = map->hint;
13925 
13926 	if ((entry == vm_map_to_entry(map)) ||
13927 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13928 		vm_map_entry_t  tmp_entry;
13929 
13930 		/*
13931 		 *	Entry was either not a valid hint, or the vaddr
13932 		 *	was not contained in the entry, so do a full lookup.
13933 		 */
13934 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13935 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13936 				vm_map_unlock(cow_sub_map_parent);
13937 			}
13938 			if ((*real_map != map)
13939 			    && (*real_map != cow_sub_map_parent)) {
13940 				vm_map_unlock(*real_map);
13941 			}
13942 			return KERN_INVALID_ADDRESS;
13943 		}
13944 
13945 		entry = tmp_entry;
13946 	}
13947 	if (map == old_map) {
13948 		old_start = entry->vme_start;
13949 		old_end = entry->vme_end;
13950 	}
13951 
13952 	/*
13953 	 *	Handle submaps.  Drop lock on upper map, submap is
13954 	 *	returned locked.
13955 	 */
13956 
13957 	submap_needed_copy = FALSE;
13958 submap_recurse:
13959 	if (entry->is_sub_map) {
13960 		vm_map_offset_t         local_vaddr;
13961 		vm_map_offset_t         end_delta;
13962 		vm_map_offset_t         start_delta;
13963 		vm_map_offset_t         top_entry_saved_start;
13964 		vm_object_offset_t      top_entry_saved_offset;
13965 		vm_map_entry_t          submap_entry, saved_submap_entry;
13966 		vm_object_offset_t      submap_entry_offset;
13967 		vm_object_size_t        submap_entry_size;
13968 		vm_prot_t               subentry_protection;
13969 		vm_prot_t               subentry_max_protection;
13970 		boolean_t               subentry_no_copy_on_read;
13971 		boolean_t               subentry_permanent;
13972 		boolean_t               subentry_csm_associated;
13973 #if __arm64e__
13974 		boolean_t               subentry_used_for_tpro;
13975 #endif /* __arm64e__ */
13976 		boolean_t               mapped_needs_copy = FALSE;
13977 		vm_map_version_t        version;
13978 
13979 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13980 		    "map %p (%d) entry %p submap %p (%d)\n",
13981 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13982 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13983 
13984 		local_vaddr = vaddr;
13985 		top_entry_saved_start = entry->vme_start;
13986 		top_entry_saved_offset = VME_OFFSET(entry);
13987 
13988 		if ((entry->use_pmap &&
13989 		    !((fault_type & VM_PROT_WRITE) ||
13990 		    force_copy))) {
13991 			/* if real_map equals map we unlock below */
13992 			if ((*real_map != map) &&
13993 			    (*real_map != cow_sub_map_parent)) {
13994 				vm_map_unlock(*real_map);
13995 			}
13996 			*real_map = VME_SUBMAP(entry);
13997 		}
13998 
13999 		if (entry->needs_copy &&
14000 		    ((fault_type & VM_PROT_WRITE) ||
14001 		    force_copy)) {
14002 			if (!mapped_needs_copy) {
14003 				if (vm_map_lock_read_to_write(map)) {
14004 					vm_map_lock_read(map);
14005 					*real_map = map;
14006 					goto RetryLookup;
14007 				}
14008 				vm_map_lock_read(VME_SUBMAP(entry));
14009 				*var_map = VME_SUBMAP(entry);
14010 				cow_sub_map_parent = map;
14011 				/* reset base to map before cow object */
14012 				/* this is the map which will accept   */
14013 				/* the new cow object */
14014 				old_start = entry->vme_start;
14015 				old_end = entry->vme_end;
14016 				cow_parent_vaddr = vaddr;
14017 				mapped_needs_copy = TRUE;
14018 			} else {
14019 				vm_map_lock_read(VME_SUBMAP(entry));
14020 				*var_map = VME_SUBMAP(entry);
14021 				if ((cow_sub_map_parent != map) &&
14022 				    (*real_map != map)) {
14023 					vm_map_unlock(map);
14024 				}
14025 			}
14026 		} else {
14027 			if (entry->needs_copy) {
14028 				submap_needed_copy = TRUE;
14029 			}
14030 			vm_map_lock_read(VME_SUBMAP(entry));
14031 			*var_map = VME_SUBMAP(entry);
14032 			/* Leave the map locked if it is a target */
14033 			/* cow sub_map above; otherwise, just     */
14034 			/* follow the maps down to the object.    */
14035 			/* Here we unlock, knowing we are not     */
14036 			/* revisiting the map.                    */
14037 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
14038 				vm_map_unlock_read(map);
14039 			}
14040 		}
14041 
14042 		entry = NULL;
14043 		map = *var_map;
14044 
14045 		/* calculate the offset in the submap for vaddr */
14046 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14047 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14048 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14049 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
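		/*
		 * Example: if the parent entry mapping this submap starts at
		 * 0x180000000 with VME_OFFSET 0x20000000, a fault at parent
		 * address 0x180004000 becomes local_vaddr 0x20004000 within
		 * the submap's own address space.
		 */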
14050 
14051 RetrySubMap:
14052 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14053 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14054 				vm_map_unlock(cow_sub_map_parent);
14055 			}
14056 			if ((*real_map != map)
14057 			    && (*real_map != cow_sub_map_parent)) {
14058 				vm_map_unlock(*real_map);
14059 			}
14060 			*real_map = map;
14061 			return KERN_INVALID_ADDRESS;
14062 		}
14063 
14064 		/* find the attenuated shadow of the underlying object */
14065 		/* on our target map */
14066 
14067 		/* In other words, the submap object may extend beyond the */
14068 		/* region mapped by the entry, or may fill only a portion  */
14069 		/* of it.  For our purposes, we only care if the object    */
14070 		/* doesn't fill it.  In that case the area which will      */
14071 		/* ultimately be clipped in the top map only needs         */
14072 		/* to be as big as the portion of the underlying entry     */
14073 		/* which is mapped. */
14074 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14075 		    submap_entry->vme_start - top_entry_saved_offset : 0;
14076 
14077 		end_delta =
14078 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14079 		    submap_entry->vme_end ?
14080 		    0 : (top_entry_saved_offset +
14081 		    (old_end - old_start))
14082 		    - submap_entry->vme_end;
14083 
14084 		old_start += start_delta;
14085 		old_end -= end_delta;
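		/* i.e. trim the parent range [old_start, old_end) so that it */
		/* only covers what this submap entry actually backs:         */
		/* start_delta trims the front, end_delta trims the back.     */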
14086 
14087 		if (submap_entry->is_sub_map) {
14088 			entry = submap_entry;
14089 			vaddr = local_vaddr;
14090 			goto submap_recurse;
14091 		}
14092 
14093 		if (((fault_type & VM_PROT_WRITE) ||
14094 		    force_copy)
14095 		    && cow_sub_map_parent) {
14096 			vm_object_t     sub_object, copy_object;
14097 			vm_object_offset_t copy_offset;
14098 			vm_map_offset_t local_start;
14099 			vm_map_offset_t local_end;
14100 			boolean_t       object_copied = FALSE;
14101 			vm_object_offset_t object_copied_offset = 0;
14102 			boolean_t       object_copied_needs_copy = FALSE;
14103 			kern_return_t   kr = KERN_SUCCESS;
14104 
14105 			if (vm_map_lock_read_to_write(map)) {
14106 				vm_map_lock_read(map);
14107 				old_start -= start_delta;
14108 				old_end += end_delta;
14109 				goto RetrySubMap;
14110 			}
14111 
14112 
14113 			sub_object = VME_OBJECT(submap_entry);
14114 			if (sub_object == VM_OBJECT_NULL) {
14115 				sub_object =
14116 				    vm_object_allocate(
14117 					(vm_map_size_t)
14118 					(submap_entry->vme_end -
14119 					submap_entry->vme_start));
14120 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14121 				VME_OFFSET_SET(submap_entry, 0);
14122 				assert(!submap_entry->is_sub_map);
14123 				assert(submap_entry->use_pmap);
14124 			}
14125 			local_start =  local_vaddr -
14126 			    (cow_parent_vaddr - old_start);
14127 			local_end = local_vaddr +
14128 			    (old_end - cow_parent_vaddr);
14129 			vm_map_clip_start(map, submap_entry, local_start);
14130 			vm_map_clip_end(map, submap_entry, local_end);
14131 			if (submap_entry->is_sub_map) {
14132 				/* unnesting was done when clipping */
14133 				assert(!submap_entry->use_pmap);
14134 			}
14135 
14136 			/* This is the COW case: let's connect */
14137 			/* an entry in our space to the underlying */
14138 			/* object in the submap, bypassing the  */
14139 			/* submap. */
14140 			submap_entry_offset = VME_OFFSET(submap_entry);
14141 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14142 
14143 			if ((submap_entry->wired_count != 0 ||
14144 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14145 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
14146 			    no_force_copy_if_executable) {
14147 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14148 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14149 					vm_map_unlock(cow_sub_map_parent);
14150 				}
14151 				if ((*real_map != map)
14152 				    && (*real_map != cow_sub_map_parent)) {
14153 					vm_map_unlock(*real_map);
14154 				}
14155 				*real_map = map;
14156 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14157 				vm_map_lock_write_to_read(map);
14158 				kr = KERN_PROTECTION_FAILURE;
14159 				DTRACE_VM4(submap_no_copy_executable,
14160 				    vm_map_t, map,
14161 				    vm_object_offset_t, submap_entry_offset,
14162 				    vm_object_size_t, submap_entry_size,
14163 				    int, kr);
14164 				return kr;
14165 			}
14166 
14167 			if (submap_entry->wired_count != 0) {
14168 				vm_object_reference(sub_object);
14169 
14170 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14171 				    "submap_entry %p offset 0x%llx\n",
14172 				    submap_entry, VME_OFFSET(submap_entry));
14173 
14174 				DTRACE_VM6(submap_copy_slowly,
14175 				    vm_map_t, cow_sub_map_parent,
14176 				    vm_map_offset_t, vaddr,
14177 				    vm_map_t, map,
14178 				    vm_object_size_t, submap_entry_size,
14179 				    int, submap_entry->wired_count,
14180 				    int, sub_object->copy_strategy);
14181 
14182 				saved_submap_entry = submap_entry;
14183 				version.main_timestamp = map->timestamp;
14184 				vm_map_unlock(map); /* Increments timestamp by 1 */
14185 				submap_entry = VM_MAP_ENTRY_NULL;
14186 
14187 				vm_object_lock(sub_object);
14188 				kr = vm_object_copy_slowly(sub_object,
14189 				    submap_entry_offset,
14190 				    submap_entry_size,
14191 				    FALSE,
14192 				    &copy_object);
14193 				object_copied = TRUE;
14194 				object_copied_offset = 0;
14195 				/* 4k: account for extra offset in physical page */
14196 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14197 				object_copied_needs_copy = FALSE;
14198 				vm_object_deallocate(sub_object);
14199 
14200 				vm_map_lock(map);
14201 
14202 				if (kr != KERN_SUCCESS &&
14203 				    kr != KERN_MEMORY_RESTART_COPY) {
14204 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14205 						vm_map_unlock(cow_sub_map_parent);
14206 					}
14207 					if ((*real_map != map)
14208 					    && (*real_map != cow_sub_map_parent)) {
14209 						vm_map_unlock(*real_map);
14210 					}
14211 					*real_map = map;
14212 					vm_object_deallocate(copy_object);
14213 					copy_object = VM_OBJECT_NULL;
14214 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14215 					vm_map_lock_write_to_read(map);
14216 					DTRACE_VM4(submap_copy_error_slowly,
14217 					    vm_object_t, sub_object,
14218 					    vm_object_offset_t, submap_entry_offset,
14219 					    vm_object_size_t, submap_entry_size,
14220 					    int, kr);
14221 					vm_map_lookup_and_lock_object_copy_slowly_error++;
14222 					return kr;
14223 				}
14224 
14225 				if ((kr == KERN_SUCCESS) &&
14226 				    (version.main_timestamp + 1) == map->timestamp) {
14227 					submap_entry = saved_submap_entry;
14228 				} else {
14229 					saved_submap_entry = NULL;
14230 					old_start -= start_delta;
14231 					old_end += end_delta;
14232 					vm_object_deallocate(copy_object);
14233 					copy_object = VM_OBJECT_NULL;
14234 					vm_map_lock_write_to_read(map);
14235 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
14236 					goto RetrySubMap;
14237 				}
14238 				vm_map_lookup_and_lock_object_copy_slowly_count++;
14239 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14240 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14241 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14242 				}
14243 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14244 				submap_entry_offset = VME_OFFSET(submap_entry);
14245 				copy_object = VM_OBJECT_NULL;
14246 				object_copied_offset = submap_entry_offset;
14247 				object_copied_needs_copy = FALSE;
14248 				DTRACE_VM6(submap_copy_strategically,
14249 				    vm_map_t, cow_sub_map_parent,
14250 				    vm_map_offset_t, vaddr,
14251 				    vm_map_t, map,
14252 				    vm_object_size_t, submap_entry_size,
14253 				    int, submap_entry->wired_count,
14254 				    int, sub_object->copy_strategy);
14255 				kr = vm_object_copy_strategically(
14256 					sub_object,
14257 					submap_entry_offset,
14258 					submap_entry->vme_end - submap_entry->vme_start,
14259 					false, /* forking */
14260 					&copy_object,
14261 					&object_copied_offset,
14262 					&object_copied_needs_copy);
14263 				if (kr == KERN_MEMORY_RESTART_COPY) {
14264 					old_start -= start_delta;
14265 					old_end += end_delta;
14266 					vm_object_deallocate(copy_object);
14267 					copy_object = VM_OBJECT_NULL;
14268 					vm_map_lock_write_to_read(map);
14269 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14270 					goto RetrySubMap;
14271 				}
14272 				if (kr != KERN_SUCCESS) {
14273 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14274 						vm_map_unlock(cow_sub_map_parent);
14275 					}
14276 					if ((*real_map != map)
14277 					    && (*real_map != cow_sub_map_parent)) {
14278 						vm_map_unlock(*real_map);
14279 					}
14280 					*real_map = map;
14281 					vm_object_deallocate(copy_object);
14282 					copy_object = VM_OBJECT_NULL;
14283 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14284 					vm_map_lock_write_to_read(map);
14285 					DTRACE_VM4(submap_copy_error_strategically,
14286 					    vm_object_t, sub_object,
14287 					    vm_object_offset_t, submap_entry_offset,
14288 					    vm_object_size_t, submap_entry_size,
14289 					    int, kr);
14290 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14291 					return kr;
14292 				}
14293 				assert(copy_object != VM_OBJECT_NULL);
14294 				assert(copy_object != sub_object);
14295 				object_copied = TRUE;
14296 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14297 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14298 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14299 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14300 				}
14301 			} else {
14302 				/* set up shadow object */
14303 				object_copied = FALSE;
14304 				copy_object = sub_object;
14305 				vm_object_lock(sub_object);
14306 				vm_object_reference_locked(sub_object);
14307 				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14308 				vm_object_unlock(sub_object);
14309 
14310 				assert(submap_entry->wired_count == 0);
14311 				submap_entry->needs_copy = TRUE;
14312 
14313 				prot = submap_entry->protection;
14314 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14315 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14316 					    __FUNCTION__,
14317 					    map, map->pmap, submap_entry,
14318 					    (uint64_t)submap_entry->vme_start,
14319 					    (uint64_t)submap_entry->vme_end,
14320 					    prot);
14321 				}
14322 				prot = prot & ~VM_PROT_WRITE;
14323 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14324 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14325 					    __FUNCTION__,
14326 					    map, map->pmap, submap_entry,
14327 					    (uint64_t)submap_entry->vme_start,
14328 					    (uint64_t)submap_entry->vme_end,
14329 					    prot);
14330 				}
14331 
14332 				if (override_nx(old_map,
14333 				    VME_ALIAS(submap_entry))
14334 				    && prot) {
14335 					prot |= VM_PROT_EXECUTE;
14336 				}
14337 
14338 				vm_object_pmap_protect(
14339 					sub_object,
14340 					VME_OFFSET(submap_entry),
14341 					submap_entry->vme_end -
14342 					submap_entry->vme_start,
14343 					(submap_entry->is_shared
14344 					|| map->mapped_in_other_pmaps) ?
14345 					PMAP_NULL : map->pmap,
14346 					VM_MAP_PAGE_SIZE(map),
14347 					submap_entry->vme_start,
14348 					prot);
14349 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14350 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14351 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14352 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14353 				}
14354 			}
14355 
14356 			/*
14357 			 * Adjust the fault offset to the submap entry.
14358 			 */
14359 			copy_offset = (local_vaddr -
14360 			    submap_entry->vme_start +
14361 			    VME_OFFSET(submap_entry));
14362 
14363 			/* This works differently than the */
14364 			/* normal submap case.  We go back */
14365 			/* to the parent of the cow map and*/
14366 			/* clip out the target portion of  */
14367 			/* the sub_map, substituting the   */
14368 			/* new copy object.                */
14369 
14370 			subentry_protection = submap_entry->protection;
14371 			subentry_max_protection = submap_entry->max_protection;
14372 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14373 			subentry_permanent = submap_entry->vme_permanent;
14374 			subentry_csm_associated = submap_entry->csm_associated;
14375 #if __arm64e__
14376 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14377 #endif // __arm64e__
14378 			vm_map_unlock(map);
14379 			submap_entry = NULL; /* not valid after map unlock */
14380 
14381 			local_start = old_start;
14382 			local_end = old_end;
14383 			map = cow_sub_map_parent;
14384 			*var_map = cow_sub_map_parent;
14385 			vaddr = cow_parent_vaddr;
14386 			cow_sub_map_parent = NULL;
14387 
14388 			if (!vm_map_lookup_entry(map,
14389 			    vaddr, &entry)) {
14390 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14391 					vm_map_unlock(cow_sub_map_parent);
14392 				}
14393 				if ((*real_map != map)
14394 				    && (*real_map != cow_sub_map_parent)) {
14395 					vm_map_unlock(*real_map);
14396 				}
14397 				*real_map = map;
14398 				vm_object_deallocate(
14399 					copy_object);
14400 				copy_object = VM_OBJECT_NULL;
14401 				vm_map_lock_write_to_read(map);
14402 				DTRACE_VM4(submap_lookup_post_unlock,
14403 				    uint64_t, (uint64_t)entry->vme_start,
14404 				    uint64_t, (uint64_t)entry->vme_end,
14405 				    vm_map_offset_t, vaddr,
14406 				    int, object_copied);
14407 				return KERN_INVALID_ADDRESS;
14408 			}
14409 
14410 			/* clip out the portion of space */
14411 			/* mapped by the sub map which   */
14412 			/* corresponds to the underlying */
14413 			/* object */
14414 
14415 			/*
14416 			 * Clip (and unnest) the smallest nested chunk
14417 			 * possible around the faulting address...
14418 			 */
14419 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14420 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14421 			/*
14422 			 * ... but don't go beyond the "old_start" to "old_end"
14423 			 * range, to avoid spanning over another VM region
14424 			 * with a possibly different VM object and/or offset.
14425 			 */
14426 			if (local_start < old_start) {
14427 				local_start = old_start;
14428 			}
14429 			if (local_end > old_end) {
14430 				local_end = old_end;
14431 			}
14432 			/*
14433 			 * Adjust copy_offset to the start of the range.
14434 			 */
14435 			copy_offset -= (vaddr - local_start);
14436 
14437 			vm_map_clip_start(map, entry, local_start);
14438 			vm_map_clip_end(map, entry, local_end);
14439 			if (entry->is_sub_map) {
14440 				/* unnesting was done when clipping */
14441 				assert(!entry->use_pmap);
14442 			}
14443 
14444 			/* substitute copy object for */
14445 			/* shared map entry           */
14446 			vm_map_deallocate(VME_SUBMAP(entry));
14447 			assert(!entry->iokit_acct);
14448 			entry->use_pmap = TRUE;
14449 			VME_OBJECT_SET(entry, copy_object, false, 0);
14450 
14451 			/* propagate the submap entry's protections */
14452 			if (entry->protection != VM_PROT_READ) {
14453 				/*
14454 				 * Someone has already altered the top entry's
14455 				 * protections via vm_protect(VM_PROT_COPY).
14456 				 * Respect these new values and ignore the
14457 				 * submap entry's protections.
14458 				 */
14459 			} else {
14460 				/*
14461 				 * Regular copy-on-write: propagate the submap
14462 				 * entry's protections to the top map entry.
14463 				 */
14464 				entry->protection |= subentry_protection;
14465 			}
14466 			entry->max_protection |= subentry_max_protection;
14467 			/* propagate some attributes from subentry */
14468 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14469 			entry->vme_permanent = subentry_permanent;
14470 			entry->csm_associated = subentry_csm_associated;
14471 #if __arm64e__
14472 			/* propagate TPRO iff the destination map has TPRO enabled */
14473 			if (subentry_used_for_tpro) {
14474 				if (vm_map_tpro(map)) {
14475 					entry->used_for_tpro = subentry_used_for_tpro;
14476 				} else {
14477 					/* "permanent" came from being TPRO */
14478 					entry->vme_permanent = FALSE;
14479 				}
14480 			}
14481 #endif /* __arm64e */
14482 			if ((entry->protection & VM_PROT_WRITE) &&
14483 			    (entry->protection & VM_PROT_EXECUTE) &&
14484 #if XNU_TARGET_OS_OSX
14485 			    map->pmap != kernel_pmap &&
14486 			    (vm_map_cs_enforcement(map)
14487 #if __arm64__
14488 			    || !VM_MAP_IS_EXOTIC(map)
14489 #endif /* __arm64__ */
14490 			    ) &&
14491 #endif /* XNU_TARGET_OS_OSX */
14492 #if CODE_SIGNING_MONITOR
14493 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14494 #endif
14495 			    !(entry->used_for_jit) &&
14496 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14497 				DTRACE_VM3(cs_wx,
14498 				    uint64_t, (uint64_t)entry->vme_start,
14499 				    uint64_t, (uint64_t)entry->vme_end,
14500 				    vm_prot_t, entry->protection);
14501 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14502 				    proc_selfpid(),
14503 				    (get_bsdtask_info(current_task())
14504 				    ? proc_name_address(get_bsdtask_info(current_task()))
14505 				    : "?"),
14506 				    __FUNCTION__, __LINE__,
14507 #if DEVELOPMENT || DEBUG
14508 				    (uint64_t)entry->vme_start,
14509 				    (uint64_t)entry->vme_end,
14510 #else /* DEVELOPMENT || DEBUG */
14511 				    (uint64_t)0,
14512 				    (uint64_t)0,
14513 #endif /* DEVELOPMENT || DEBUG */
14514 				    entry->protection);
14515 				entry->protection &= ~VM_PROT_EXECUTE;
14516 			}
14517 
14518 			if (object_copied) {
14519 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14520 				entry->needs_copy = object_copied_needs_copy;
14521 				entry->is_shared = FALSE;
14522 			} else {
14523 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14524 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14525 				assert(entry->wired_count == 0);
14526 				VME_OFFSET_SET(entry, copy_offset);
14527 				entry->needs_copy = TRUE;
14528 				if (map != old_map) {
14529 					entry->is_shared = TRUE;
14530 				}
14531 			}
14532 			if (entry->inheritance == VM_INHERIT_SHARE) {
14533 				entry->inheritance = VM_INHERIT_COPY;
14534 			}
14535 
14536 			vm_map_lock_write_to_read(map);
14537 		} else {
14538 			if ((cow_sub_map_parent)
14539 			    && (cow_sub_map_parent != *real_map)
14540 			    && (cow_sub_map_parent != map)) {
14541 				vm_map_unlock(cow_sub_map_parent);
14542 			}
14543 			entry = submap_entry;
14544 			vaddr = local_vaddr;
14545 		}
14546 	}
14547 
14548 	/*
14549 	 *	Check whether this task is allowed to have
14550 	 *	this page.
14551 	 */
14552 
14553 	prot = entry->protection;
14554 
14555 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14556 		/*
14557 		 * HACK -- if not a stack, then allow execution
14558 		 */
14559 		prot |= VM_PROT_EXECUTE;
14560 	}
14561 
14562 #if __arm64e__
14563 	/*
14564 	 * If the entry we're dealing with is TPRO and we have a write
14565 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14566 	 * to maintain RO permissions when not marked as TPRO.
14567 	 */
14568 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14569 		prot |= VM_PROT_WRITE;
14570 	}
14571 #endif /* __arm64e__ */
14572 	if (mask_protections) {
14573 		fault_type &= prot;
14574 		if (fault_type == VM_PROT_NONE) {
14575 			goto protection_failure;
14576 		}
14577 	}
14578 	if (((fault_type & prot) != fault_type)
14579 #if __arm64__
14580 	    /* prefetch abort in execute-only page */
14581 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14582 #elif defined(__x86_64__)
14583 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14584 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14585 #endif
14586 	    ) {
14587 protection_failure:
14588 		if (*real_map != map) {
14589 			vm_map_unlock(*real_map);
14590 		}
14591 		*real_map = map;
14592 
14593 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14594 			log_stack_execution_failure((addr64_t)vaddr, prot);
14595 		}
14596 
14597 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14598 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14599 		/*
14600 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14601 		 *
14602 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14603 		 */
14604 		return KERN_PROTECTION_FAILURE;
14605 	}
14606 
14607 	/*
14608 	 *	If this page is not pageable, we have to get
14609 	 *	it for all possible accesses.
14610 	 */
14611 
14612 	*wired = (entry->wired_count != 0);
14613 	if (*wired) {
14614 		fault_type = prot;
14615 	}
14616 
14617 	/*
14618 	 *	If the entry was copy-on-write, we either shadow it now (for a write) or demote the allowed protections (for a read).
14619 	 */
14620 
14621 	if (entry->needs_copy) {
14622 		/*
14623 		 *	If we want to write the page, we may as well
14624 		 *	handle that now since we've got the map locked.
14625 		 *
14626 		 *	If we don't need to write the page, we just
14627 		 *	demote the permissions allowed.
14628 		 */
14629 
14630 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14631 			/*
14632 			 *	Make a new object, and place it in the
14633 			 *	object chain.  Note that no new references
14634 			 *	have appeared -- one just moved from the
14635 			 *	map to the new object.
14636 			 */
14637 
14638 			if (vm_map_lock_read_to_write(map)) {
14639 				vm_map_lock_read(map);
14640 				goto RetryLookup;
14641 			}
14642 
14643 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14644 				vm_object_lock(VME_OBJECT(entry));
14645 				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14646 				vm_object_unlock(VME_OBJECT(entry));
14647 			}
14648 			VME_OBJECT_SHADOW(entry,
14649 			    (vm_map_size_t) (entry->vme_end -
14650 			    entry->vme_start),
14651 			    vm_map_always_shadow(map));
14652 			entry->needs_copy = FALSE;
14653 
14654 			vm_map_lock_write_to_read(map);
14655 		}
14656 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14657 			/*
14658 			 *	We're attempting to read a copy-on-write
14659 			 *	page -- don't allow writes.
14660 			 */
14661 
14662 			prot &= (~VM_PROT_WRITE);
14663 		}
14664 	}
14665 
14666 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14667 		/*
14668 		 * We went through a "needs_copy" submap without triggering
14669 		 * a copy, so granting write access to the page would bypass
14670 		 * that submap's "needs_copy".
14671 		 */
14672 		assert(!(fault_type & VM_PROT_WRITE));
14673 		assert(!*wired);
14674 		assert(!force_copy);
14675 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14676 		prot &= ~VM_PROT_WRITE;
14677 	}
14678 
14679 	/*
14680 	 *	Create an object if necessary.
14681 	 */
14682 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14683 		if (vm_map_lock_read_to_write(map)) {
14684 			vm_map_lock_read(map);
14685 			goto RetryLookup;
14686 		}
14687 
14688 		VME_OBJECT_SET(entry,
14689 		    vm_object_allocate(
14690 			    (vm_map_size_t)(entry->vme_end -
14691 			    entry->vme_start)), false, 0);
14692 		VME_OFFSET_SET(entry, 0);
14693 		assert(entry->use_pmap);
14694 		vm_map_lock_write_to_read(map);
14695 	}
14696 
14697 	/*
14698 	 *	Return the object/offset from this entry.  If the entry
14699 	 *	was copy-on-write or empty, it has been fixed up.  Also
14700 	 *	return the protection.
14701 	 */
14702 
14703 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14704 	*object = VME_OBJECT(entry);
14705 	*out_prot = prot;
14706 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14707 
14708 	if (fault_info) {
14709 		/* ... the caller will change "interruptible" if needed */
14710 		fault_info->user_tag = VME_ALIAS(entry);
14711 		fault_info->pmap_options = 0;
14712 		if (entry->iokit_acct ||
14713 		    (!entry->is_sub_map && !entry->use_pmap)) {
14714 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14715 		}
14716 		if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14717 			fault_info->behavior = entry->behavior;
14718 		}
14719 		fault_info->lo_offset = VME_OFFSET(entry);
14720 		fault_info->hi_offset =
14721 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14722 		fault_info->no_cache  = entry->no_cache;
14723 		fault_info->stealth = FALSE;
14724 		fault_info->io_sync = FALSE;
14725 		if (entry->used_for_jit ||
14726 #if CODE_SIGNING_MONITOR
14727 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14728 #endif
14729 		    entry->vme_resilient_codesign) {
14730 			fault_info->cs_bypass = TRUE;
14731 		} else {
14732 			fault_info->cs_bypass = FALSE;
14733 		}
14734 		fault_info->csm_associated = FALSE;
14735 #if CODE_SIGNING_MONITOR
14736 		if (entry->csm_associated) {
14737 			/*
14738 			 * The pmap layer will validate this page
14739 			 * before allowing it to be executed from.
14740 			 */
14741 			fault_info->csm_associated = TRUE;
14742 		}
14743 #endif
14744 		fault_info->mark_zf_absent = FALSE;
14745 		fault_info->batch_pmap_op = FALSE;
14746 		fault_info->resilient_media = entry->vme_resilient_media;
14747 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14748 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14749 #if __arm64e__
14750 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14751 #else /* __arm64e__ */
14752 		fault_info->fi_used_for_tpro = FALSE;
14753 #endif
14754 		if (entry->translated_allow_execute) {
14755 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14756 		}
14757 	}
14758 
14759 	/*
14760 	 *	Lock the object to prevent it from disappearing
14761 	 */
14762 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14763 		if (contended == NULL) {
14764 			vm_object_lock(*object);
14765 		} else {
14766 			*contended = vm_object_lock_check_contended(*object);
14767 		}
14768 	} else {
14769 		vm_object_lock_shared(*object);
14770 	}
14771 
14772 	/*
14773 	 *	Save the version number
14774 	 */
14775 
14776 	out_version->main_timestamp = map->timestamp;
14777 
14778 	return KERN_SUCCESS;
14779 }
14780 
14781 
14782 /*
14783  *	vm_map_verify:
14784  *
14785  *	Verifies that the map in question has not changed
14786  *	since the given version. The map has to be locked
14787  *	("shared" mode is fine) before calling this function
14788  *	and it will be returned locked too.
14789  */
14790 boolean_t
14791 vm_map_verify(
14792 	vm_map_t                map,
14793 	vm_map_version_t        *version)       /* REF */
14794 {
14795 	boolean_t       result;
14796 
14797 	vm_map_lock_assert_held(map);
14798 	result = (map->timestamp == version->main_timestamp);
14799 
14800 	return result;
14801 }
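/*
 * Typical lookup/verify pattern (hedged sketch; the real fault handler is
 * more involved, and OBJECT_LOCK_EXCLUSIVE is assumed to be the caller's
 * desired object lock type):
 *
 *	vm_map_lock_read(map);
 *	kr = vm_map_lookup_and_lock_object(&map, vaddr, fault_type,
 *	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot,
 *	    &wired, &fault_info, &real_map, NULL);
 *	vm_map_unlock_read(map);
 *	... long-running work against "object" ...
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		// the map changed while unlocked: redo the lookup
 *	}
 */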
14802 
14803 /*
14804  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14805  *	Goes away after regular vm_region_recurse function migrates to
14806  *	64 bits
14807  *	vm_region_recurse: A form of vm_region which follows the
14808  *	submaps in a target map
14809  *
14810  */
14811 
14812 kern_return_t
14813 vm_map_region_recurse_64(
14814 	vm_map_t                map,
14815 	vm_map_offset_ut       *address_u,      /* IN/OUT */
14816 	vm_map_size_ut         *size_u,         /* OUT */
14817 	natural_t              *nesting_depth,  /* IN/OUT */
14818 	vm_region_submap_info_64_t submap_info, /* IN/OUT */
14819 	mach_msg_type_number_t *count)          /* IN/OUT */
14820 {
14821 	mach_msg_type_number_t  original_count;
14822 	vm_region_extended_info_data_t  extended;
14823 	vm_map_entry_t                  tmp_entry;
14824 	vm_map_offset_t                 user_address;
14825 	unsigned int                    user_max_depth;
14826 
14827 	/*
14828 	 * "curr_entry" is the VM map entry preceding or including the
14829 	 * address we're looking for.
14830 	 * "curr_map" is the map or sub-map containing "curr_entry".
14831 	 * "curr_address" is the equivalent of the top map's "user_address"
14832 	 * in the current map.
14833 	 * "curr_offset" is the cumulative offset of "curr_map" in the
14834 	 * target task's address space.
14835 	 * "curr_depth" is the depth of "curr_map" in the chain of
14836 	 * sub-maps.
14837 	 *
14838 	 * "curr_max_below" and "curr_max_above" limit the range (around
14839 	 * "curr_address") we should take into account in the current (sub)map.
14840 	 * They limit the range to what's visible through the map entries
14841 	 * we've traversed from the top map to the current map.
14842 	 *
14843 	 */
14844 	vm_map_entry_t                  curr_entry;
14845 	vm_map_address_t                curr_address;
14846 	vm_map_offset_t                 curr_offset;
14847 	vm_map_t                        curr_map;
14848 	unsigned int                    curr_depth;
14849 	vm_map_offset_t                 curr_max_below, curr_max_above;
14850 	vm_map_offset_t                 curr_skip;
14851 
14852 	/*
14853 	 * "next_" is the same as "curr_" but for the VM region immediately
14854 	 * after the address we're looking for.  We need to keep track of this
14855 	 * too because we want to return info about that region if the
14856 	 * address we're looking for is not mapped.
14857 	 */
14858 	vm_map_entry_t                  next_entry;
14859 	vm_map_offset_t                 next_offset;
14860 	vm_map_offset_t                 next_address;
14861 	vm_map_t                        next_map;
14862 	unsigned int                    next_depth;
14863 	vm_map_offset_t                 next_max_below, next_max_above;
14864 	vm_map_offset_t                 next_skip;
14865 
14866 	boolean_t                       look_for_pages;
14867 	vm_region_submap_short_info_64_t short_info;
14868 	boolean_t                       do_region_footprint;
14869 	int                             effective_page_size, effective_page_shift;
14870 	boolean_t                       submap_needed_copy;
14871 
14872 	if (map == VM_MAP_NULL) {
14873 		/* no address space to work on */
14874 		return KERN_INVALID_ARGUMENT;
14875 	}
14876 
14877 	user_address = vm_sanitize_addr(map, *address_u);
14878 
14879 	effective_page_shift = vm_self_region_page_shift(map);
14880 	effective_page_size = (1 << effective_page_shift);
14881 
14882 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14883 		/*
14884 		 * "info" structure is not big enough and
14885 		 * would overflow
14886 		 */
14887 		return KERN_INVALID_ARGUMENT;
14888 	}
14889 
14890 	do_region_footprint = task_self_region_footprint();
14891 	original_count = *count;
14892 
14893 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14894 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14895 		look_for_pages = FALSE;
14896 		short_info = (vm_region_submap_short_info_64_t) submap_info;
14897 		submap_info = NULL;
14898 	} else {
14899 		look_for_pages = TRUE;
14900 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14901 		short_info = NULL;
14902 
14903 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14904 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14905 		}
14906 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14907 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14908 		}
14909 	}
14910 
14911 	user_max_depth = *nesting_depth;
14912 	submap_needed_copy = FALSE;
14913 
14914 	if (not_in_kdp) {
14915 		vm_map_lock_read(map);
14916 	}
14917 
14918 recurse_again:
14919 	curr_entry = NULL;
14920 	curr_map = map;
14921 	curr_address = user_address;
14922 	curr_offset = 0;
14923 	curr_skip = 0;
14924 	curr_depth = 0;
14925 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14926 	curr_max_below = curr_address;
14927 
14928 	next_entry = NULL;
14929 	next_map = NULL;
14930 	next_address = 0;
14931 	next_offset = 0;
14932 	next_skip = 0;
14933 	next_depth = 0;
14934 	next_max_above = (vm_map_offset_t) -1;
14935 	next_max_below = (vm_map_offset_t) -1;
14936 
14937 	for (;;) {
14938 		if (vm_map_lookup_entry(curr_map,
14939 		    curr_address,
14940 		    &tmp_entry)) {
14941 			/* tmp_entry contains the address we're looking for */
14942 			curr_entry = tmp_entry;
14943 		} else {
14944 			vm_map_offset_t skip;
14945 			/*
14946 			 * The address is not mapped.  "tmp_entry" is the
14947 			 * map entry preceding the address.  We want the next
14948 			 * one, if it exists.
14949 			 */
14950 			curr_entry = tmp_entry->vme_next;
14951 
14952 			if (curr_entry == vm_map_to_entry(curr_map) ||
14953 			    (curr_entry->vme_start >=
14954 			    curr_address + curr_max_above)) {
14955 				/* no next entry at this level: stop looking */
14956 				if (not_in_kdp) {
14957 					vm_map_unlock_read(curr_map);
14958 				}
14959 				curr_entry = NULL;
14960 				curr_map = NULL;
14961 				curr_skip = 0;
14962 				curr_offset = 0;
14963 				curr_depth = 0;
14964 				curr_max_above = 0;
14965 				curr_max_below = 0;
14966 				break;
14967 			}
14968 
14969 			/* adjust current address and offset */
14970 			skip = curr_entry->vme_start - curr_address;
14971 			curr_address = curr_entry->vme_start;
14972 			curr_skip += skip;
14973 			curr_offset += skip;
14974 			curr_max_above -= skip;
14975 			curr_max_below = 0;
14976 		}
14977 
14978 		/*
14979 		 * Is the next entry at this level closer to the address (or
14980 		 * deeper in the submap chain) than the one we had
14981 		 * so far?
14982 		 */
14983 		tmp_entry = curr_entry->vme_next;
14984 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14985 			/* no next entry at this level */
14986 		} else if (tmp_entry->vme_start >=
14987 		    curr_address + curr_max_above) {
14988 			/*
14989 			 * tmp_entry is beyond the scope of what we mapped of
14990 			 * this submap in the upper level: ignore it.
14991 			 */
14992 		} else if ((next_entry == NULL) ||
14993 		    (tmp_entry->vme_start + curr_offset <=
14994 		    next_entry->vme_start + next_offset)) {
14995 			/*
14996 			 * We didn't have a "next_entry" or this one is
14997 			 * closer to the address we're looking for:
14998 			 * use this "tmp_entry" as the new "next_entry".
14999 			 */
15000 			if (next_entry != NULL) {
15001 				/* unlock the last "next_map" */
15002 				if (next_map != curr_map && not_in_kdp) {
15003 					vm_map_unlock_read(next_map);
15004 				}
15005 			}
15006 			next_entry = tmp_entry;
15007 			next_map = curr_map;
15008 			next_depth = curr_depth;
15009 			next_address = next_entry->vme_start;
15010 			next_skip = curr_skip;
15011 			next_skip += (next_address - curr_address);
15012 			next_offset = curr_offset;
15013 			next_offset += (next_address - curr_address);
15014 			next_max_above = MIN(next_max_above, curr_max_above);
15015 			next_max_above = MIN(next_max_above,
15016 			    next_entry->vme_end - next_address);
15017 			next_max_below = MIN(next_max_below, curr_max_below);
15018 			next_max_below = MIN(next_max_below,
15019 			    next_address - next_entry->vme_start);
15020 		}
15021 
15022 		/*
15023 		 * "curr_max_{above,below}" allow us to keep track of the
15024 		 * portion of the submap that is actually mapped at this level:
15025 		 * the rest of that submap is irrelevant to us, since it's not
15026 		 * mapped here.
15027 		 * The relevant portion of the map starts at
15028 		 * "VME_OFFSET(curr_entry)" and spans the size of "curr_entry".
15029 		 */
15030 		curr_max_above = MIN(curr_max_above,
15031 		    curr_entry->vme_end - curr_address);
15032 		curr_max_below = MIN(curr_max_below,
15033 		    curr_address - curr_entry->vme_start);
15034 
15035 		if (!curr_entry->is_sub_map ||
15036 		    curr_depth >= user_max_depth) {
15037 			/*
15038 			 * We hit a leaf map or we reached the maximum depth
15039 			 * we could, so stop looking.  Keep the current map
15040 			 * locked.
15041 			 */
15042 			break;
15043 		}
15044 
15045 		/*
15046 		 * Get down to the next submap level.
15047 		 */
15048 
15049 		if (curr_entry->needs_copy) {
15050 			/* everything below this is effectively copy-on-write */
15051 			submap_needed_copy = TRUE;
15052 		}
15053 
15054 		/*
15055 		 * Lock the next level and unlock the current level,
15056 		 * unless we need to keep it locked to access the "next_entry"
15057 		 * later.
15058 		 */
15059 		if (not_in_kdp) {
15060 			vm_map_lock_read(VME_SUBMAP(curr_entry));
15061 		}
15062 		if (curr_map == next_map) {
15063 			/* keep "next_map" locked in case we need it */
15064 		} else {
15065 			/* release this map */
15066 			if (not_in_kdp) {
15067 				vm_map_unlock_read(curr_map);
15068 			}
15069 		}
15070 
15071 		/*
15072 		 * Adjust the offset.  "curr_entry" maps the submap
15073 		 * at relative address "curr_entry->vme_start" in the
15074 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
15075 		 * bytes of the submap.
15076 		 * "curr_offset" always represents the offset of a virtual
15077 		 * address in the curr_map relative to the absolute address
15078 		 * space (i.e. the top-level VM map).
15079 		 */
15080 		curr_offset +=
15081 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
15082 		curr_address = user_address + curr_offset;
15083 		/* switch to the submap */
15084 		curr_map = VME_SUBMAP(curr_entry);
15085 		curr_depth++;
15086 		curr_entry = NULL;
15087 	}
15088 
15089 // LP64todo: all the current tools are 32-bit, so this obviously never worked for 64-bit;
15090 // it should probably be a real 32-bit ID rather than a pointer.
15091 // Current users just check for equality.
15092 
15093 	if (curr_entry == NULL) {
15094 		/* no VM region contains the address... */
15095 
15096 		if (do_region_footprint && /* we want footprint numbers */
15097 		    next_entry == NULL && /* & there are no more regions */
15098 		    /* & we haven't already provided our fake region: */
15099 		    user_address <= vm_map_last_entry(map)->vme_end) {
15100 			ledger_amount_t ledger_resident, ledger_compressed;
15101 
15102 			/*
15103 			 * Add a fake memory region to account for
15104 			 * purgeable and/or ledger-tagged memory that
15105 			 * counts towards this task's memory footprint,
15106 			 * i.e. the resident/compressed pages of non-volatile
15107 			 * objects owned by that task.
15108 			 */
15109 			task_ledgers_footprint(map->pmap->ledger,
15110 			    &ledger_resident,
15111 			    &ledger_compressed);
15112 			if (ledger_resident + ledger_compressed == 0) {
15113 				/* no purgeable memory usage to report */
15114 				return KERN_INVALID_ADDRESS;
15115 			}
15116 			/* fake region to show nonvolatile footprint */
15117 			if (look_for_pages) {
15118 				submap_info->protection = VM_PROT_DEFAULT;
15119 				submap_info->max_protection = VM_PROT_DEFAULT;
15120 				submap_info->inheritance = VM_INHERIT_DEFAULT;
15121 				submap_info->offset = 0;
15122 				submap_info->user_tag = -1;
15123 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15124 				submap_info->pages_shared_now_private = 0;
15125 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15126 				submap_info->pages_dirtied = submap_info->pages_resident;
15127 				submap_info->ref_count = 1;
15128 				submap_info->shadow_depth = 0;
15129 				submap_info->external_pager = 0;
15130 				submap_info->share_mode = SM_PRIVATE;
15131 				if (submap_needed_copy) {
15132 					submap_info->share_mode = SM_COW;
15133 				}
15134 				submap_info->is_submap = 0;
15135 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15136 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15137 				submap_info->user_wired_count = 0;
15138 				submap_info->pages_reusable = 0;
15139 			} else {
15140 				short_info->user_tag = -1;
15141 				short_info->offset = 0;
15142 				short_info->protection = VM_PROT_DEFAULT;
15143 				short_info->inheritance = VM_INHERIT_DEFAULT;
15144 				short_info->max_protection = VM_PROT_DEFAULT;
15145 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
15146 				short_info->user_wired_count = 0;
15147 				short_info->is_submap = 0;
15148 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15149 				short_info->external_pager = 0;
15150 				short_info->shadow_depth = 0;
15151 				short_info->share_mode = SM_PRIVATE;
15152 				if (submap_needed_copy) {
15153 					short_info->share_mode = SM_COW;
15154 				}
15155 				short_info->ref_count = 1;
15156 			}
15157 			*nesting_depth = 0;
15158 			*address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15159 			*size_u    = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15160 			return KERN_SUCCESS;
15161 		}
15162 
15163 		if (next_entry == NULL) {
15164 			/* ... and no VM region follows it either */
15165 			return KERN_INVALID_ADDRESS;
15166 		}
15167 		/* ... gather info about the next VM region */
15168 		curr_entry = next_entry;
15169 		curr_map = next_map;    /* still locked ... */
15170 		curr_address = next_address;
15171 		curr_skip = next_skip;
15172 		curr_offset = next_offset;
15173 		curr_depth = next_depth;
15174 		curr_max_above = next_max_above;
15175 		curr_max_below = next_max_below;
15176 	} else {
15177 		/* we won't need "next_entry" after all */
15178 		if (next_entry != NULL) {
15179 			/* release "next_map" */
15180 			if (next_map != curr_map && not_in_kdp) {
15181 				vm_map_unlock_read(next_map);
15182 			}
15183 		}
15184 	}
15185 	next_entry = NULL;
15186 	next_map = NULL;
15187 	next_offset = 0;
15188 	next_skip = 0;
15189 	next_depth = 0;
15190 	next_max_below = -1;
15191 	next_max_above = -1;
15192 
15193 	if (curr_entry->is_sub_map &&
15194 	    curr_depth < user_max_depth) {
15195 		/*
15196 		 * We're not as deep as we could be:  we must have
15197 		 * gone back up after not finding anything mapped
15198 		 * below the original top-level map entry's range.
15199 		 * Let's move "curr_address" forward and recurse again.
15200 		 */
15201 		user_address = curr_address;
15202 		goto recurse_again;
15203 	}
15204 
15205 	*nesting_depth = curr_depth;
15206 	*address_u = vm_sanitize_wrap_addr(
15207 		user_address + curr_skip - curr_max_below);
15208 	*size_u    = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15209 
15210 	if (look_for_pages) {
15211 		submap_info->user_tag = VME_ALIAS(curr_entry);
15212 		submap_info->offset = VME_OFFSET(curr_entry);
15213 		submap_info->protection = curr_entry->protection;
15214 		submap_info->inheritance = curr_entry->inheritance;
15215 		submap_info->max_protection = curr_entry->max_protection;
15216 		submap_info->behavior = curr_entry->behavior;
15217 		submap_info->user_wired_count = curr_entry->user_wired_count;
15218 		submap_info->is_submap = curr_entry->is_sub_map;
15219 		if (curr_entry->is_sub_map) {
15220 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15221 		} else {
15222 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15223 		}
15224 	} else {
15225 		short_info->user_tag = VME_ALIAS(curr_entry);
15226 		short_info->offset = VME_OFFSET(curr_entry);
15227 		short_info->protection = curr_entry->protection;
15228 		short_info->inheritance = curr_entry->inheritance;
15229 		short_info->max_protection = curr_entry->max_protection;
15230 		short_info->behavior = curr_entry->behavior;
15231 		short_info->user_wired_count = curr_entry->user_wired_count;
15232 		short_info->is_submap = curr_entry->is_sub_map;
15233 		if (curr_entry->is_sub_map) {
15234 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15235 		} else {
15236 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15237 		}
15238 	}
15239 
15240 	extended.pages_resident = 0;
15241 	extended.pages_swapped_out = 0;
15242 	extended.pages_shared_now_private = 0;
15243 	extended.pages_dirtied = 0;
15244 	extended.pages_reusable = 0;
15245 	extended.external_pager = 0;
15246 	extended.shadow_depth = 0;
15247 	extended.share_mode = SM_EMPTY;
15248 	extended.ref_count = 0;
15249 
15250 	if (not_in_kdp) {
15251 		if (!curr_entry->is_sub_map) {
15252 			vm_map_offset_t range_start, range_end;
15253 			range_start = MAX((curr_address - curr_max_below),
15254 			    curr_entry->vme_start);
15255 			range_end = MIN((curr_address + curr_max_above),
15256 			    curr_entry->vme_end);
15257 			vm_map_region_walk(curr_map,
15258 			    range_start,
15259 			    curr_entry,
15260 			    (VME_OFFSET(curr_entry) +
15261 			    (range_start -
15262 			    curr_entry->vme_start)),
15263 			    range_end - range_start,
15264 			    &extended,
15265 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15266 			if (submap_needed_copy) {
15267 				extended.share_mode = SM_COW;
15268 			}
15269 		} else {
15270 			if (curr_entry->use_pmap) {
15271 				extended.share_mode = SM_TRUESHARED;
15272 			} else {
15273 				extended.share_mode = SM_PRIVATE;
15274 			}
15275 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15276 		}
15277 	}
15278 
15279 	if (look_for_pages) {
15280 		submap_info->pages_resident = extended.pages_resident;
15281 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15282 		submap_info->pages_shared_now_private =
15283 		    extended.pages_shared_now_private;
15284 		submap_info->pages_dirtied = extended.pages_dirtied;
15285 		submap_info->external_pager = extended.external_pager;
15286 		submap_info->shadow_depth = extended.shadow_depth;
15287 		submap_info->share_mode = extended.share_mode;
15288 		submap_info->ref_count = extended.ref_count;
15289 
15290 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15291 			submap_info->pages_reusable = extended.pages_reusable;
15292 		}
15293 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15294 			if (curr_entry->is_sub_map) {
15295 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15296 			} else if (VME_OBJECT(curr_entry)) {
15297 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15298 			} else {
15299 				submap_info->object_id_full = 0ull;
15300 			}
15301 		}
15302 	} else {
15303 		short_info->external_pager = extended.external_pager;
15304 		short_info->shadow_depth = extended.shadow_depth;
15305 		short_info->share_mode = extended.share_mode;
15306 		short_info->ref_count = extended.ref_count;
15307 	}
15308 
15309 	if (not_in_kdp) {
15310 		vm_map_unlock_read(curr_map);
15311 	}
15312 
15313 	return KERN_SUCCESS;
15314 }
15315 
15316 /*
15317  *	vm_region:
15318  *
15319  *	User call to obtain information about a region in
15320  *	a task's address map. Currently, only one flavor is
15321  *	supported.
15322  *
15323  *	XXX The reserved and behavior fields cannot be filled
15324  *	    in until the vm merge from the IK is completed, and
15325  *	    vm_reserve is implemented.
15326  */
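/*
 * Illustrative usage (not part of this file): the VM_REGION_BASIC_INFO_64
 * flavor handled below is typically queried from userspace via
 * mach_vm_region().  A minimal sketch, with error handling elided:
 */
#if 0   /* example only, not compiled */
#include <mach/mach.h>
#include <mach/mach_vm.h>

static kern_return_t
query_region(mach_port_t task, mach_vm_address_t addr)
{
	mach_vm_address_t address = addr;
	mach_vm_size_t size = 0;
	vm_region_basic_info_data_64_t info;
	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
	mach_port_t object_name = MACH_PORT_NULL;       /* always returned as IP_NULL */

	/* On success, "address" and "size" describe the region at or after "addr". */
	return mach_vm_region(task, &address, &size, VM_REGION_BASIC_INFO_64,
	           (vm_region_info_t)&info, &count, &object_name);
}
#endif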
15327 
15328 kern_return_t
15329 vm_map_region(
15330 	vm_map_t                map,
15331 	vm_map_offset_ut       *address_u,      /* IN/OUT */
15332 	vm_map_size_ut         *size_u,         /* OUT */
15333 	vm_region_flavor_t      flavor,         /* IN */
15334 	vm_region_info_t        info,           /* OUT */
15335 	mach_msg_type_number_t *count,          /* IN/OUT */
15336 	mach_port_t            *object_name)    /* OUT */
15337 {
15338 	vm_map_entry_t          tmp_entry;
15339 	vm_map_entry_t          entry;
15340 	vm_map_offset_t         start;
15341 
15342 	if (map == VM_MAP_NULL) {
15343 		return KERN_INVALID_ARGUMENT;
15344 	}
15345 
15346 	start = vm_sanitize_addr(map, *address_u);
15347 
15348 	switch (flavor) {
15349 	case VM_REGION_BASIC_INFO:
15350 		/* legacy for old 32-bit objects info */
15351 	{
15352 		vm_region_basic_info_t  basic;
15353 
15354 		if (*count < VM_REGION_BASIC_INFO_COUNT) {
15355 			return KERN_INVALID_ARGUMENT;
15356 		}
15357 
15358 		basic = (vm_region_basic_info_t) info;
15359 		*count = VM_REGION_BASIC_INFO_COUNT;
15360 
15361 		vm_map_lock_read(map);
15362 
15363 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15364 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15365 				vm_map_unlock_read(map);
15366 				return KERN_INVALID_ADDRESS;
15367 			}
15368 		} else {
15369 			entry = tmp_entry;
15370 		}
15371 
15372 		start = entry->vme_start;
15373 
15374 		basic->offset = (uint32_t)VME_OFFSET(entry);
15375 		basic->protection = entry->protection;
15376 		basic->inheritance = entry->inheritance;
15377 		basic->max_protection = entry->max_protection;
15378 		basic->behavior = entry->behavior;
15379 		basic->user_wired_count = entry->user_wired_count;
15380 		basic->reserved = entry->is_sub_map;
15381 
15382 		*address_u = vm_sanitize_wrap_addr(start);
15383 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15384 
15385 		if (object_name) {
15386 			*object_name = IP_NULL;
15387 		}
15388 		if (entry->is_sub_map) {
15389 			basic->shared = FALSE;
15390 		} else {
15391 			basic->shared = entry->is_shared;
15392 		}
15393 
15394 		vm_map_unlock_read(map);
15395 		return KERN_SUCCESS;
15396 	}
15397 
15398 	case VM_REGION_BASIC_INFO_64:
15399 	{
15400 		vm_region_basic_info_64_t       basic;
15401 
15402 		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15403 			return KERN_INVALID_ARGUMENT;
15404 		}
15405 
15406 		basic = (vm_region_basic_info_64_t) info;
15407 		*count = VM_REGION_BASIC_INFO_COUNT_64;
15408 
15409 		vm_map_lock_read(map);
15410 
15411 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15412 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15413 				vm_map_unlock_read(map);
15414 				return KERN_INVALID_ADDRESS;
15415 			}
15416 		} else {
15417 			entry = tmp_entry;
15418 		}
15419 
15420 		start = entry->vme_start;
15421 
15422 		basic->offset = VME_OFFSET(entry);
15423 		basic->protection = entry->protection;
15424 		basic->inheritance = entry->inheritance;
15425 		basic->max_protection = entry->max_protection;
15426 		basic->behavior = entry->behavior;
15427 		basic->user_wired_count = entry->user_wired_count;
15428 		basic->reserved = entry->is_sub_map;
15429 
15430 		*address_u = vm_sanitize_wrap_addr(start);
15431 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15432 
15433 		if (object_name) {
15434 			*object_name = IP_NULL;
15435 		}
15436 		if (entry->is_sub_map) {
15437 			basic->shared = FALSE;
15438 		} else {
15439 			basic->shared = entry->is_shared;
15440 		}
15441 
15442 		vm_map_unlock_read(map);
15443 		return KERN_SUCCESS;
15444 	}
15445 	case VM_REGION_EXTENDED_INFO:
15446 		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15447 			return KERN_INVALID_ARGUMENT;
15448 		}
15449 		OS_FALLTHROUGH;
15450 	case VM_REGION_EXTENDED_INFO__legacy:
15451 	{
15452 		vm_region_extended_info_t       extended;
15453 		mach_msg_type_number_t original_count;
15454 		int effective_page_size, effective_page_shift;
15455 
15456 		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15457 			return KERN_INVALID_ARGUMENT;
15458 		}
15459 
15460 		extended = (vm_region_extended_info_t) info;
15461 
15462 		effective_page_shift = vm_self_region_page_shift(map);
15463 		effective_page_size = (1 << effective_page_shift);
15464 
15465 		vm_map_lock_read(map);
15466 
15467 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15468 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15469 				vm_map_unlock_read(map);
15470 				return KERN_INVALID_ADDRESS;
15471 			}
15472 		} else {
15473 			entry = tmp_entry;
15474 		}
15475 		start = entry->vme_start;
15476 
15477 		extended->protection = entry->protection;
15478 		extended->user_tag = VME_ALIAS(entry);
15479 		extended->pages_resident = 0;
15480 		extended->pages_swapped_out = 0;
15481 		extended->pages_shared_now_private = 0;
15482 		extended->pages_dirtied = 0;
15483 		extended->external_pager = 0;
15484 		extended->shadow_depth = 0;
15485 
15486 		original_count = *count;
15487 		if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15488 			*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15489 		} else {
15490 			extended->pages_reusable = 0;
15491 			*count = VM_REGION_EXTENDED_INFO_COUNT;
15492 		}
15493 
15494 		vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15495 
15496 		if (object_name) {
15497 			*object_name = IP_NULL;
15498 		}
15499 
15500 		*address_u = vm_sanitize_wrap_addr(start);
15501 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15502 
15503 		vm_map_unlock_read(map);
15504 		return KERN_SUCCESS;
15505 	}
15506 	case VM_REGION_TOP_INFO:
15507 	{
15508 		vm_region_top_info_t    top;
15509 
15510 		if (*count < VM_REGION_TOP_INFO_COUNT) {
15511 			return KERN_INVALID_ARGUMENT;
15512 		}
15513 
15514 		top = (vm_region_top_info_t) info;
15515 		*count = VM_REGION_TOP_INFO_COUNT;
15516 
15517 		vm_map_lock_read(map);
15518 
15519 		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15520 			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15521 				vm_map_unlock_read(map);
15522 				return KERN_INVALID_ADDRESS;
15523 			}
15524 		} else {
15525 			entry = tmp_entry;
15526 		}
15527 		start = entry->vme_start;
15528 
15529 		top->private_pages_resident = 0;
15530 		top->shared_pages_resident = 0;
15531 
15532 		vm_map_region_top_walk(entry, top);
15533 
15534 		if (object_name) {
15535 			*object_name = IP_NULL;
15536 		}
15537 
15538 		*address_u = vm_sanitize_wrap_addr(start);
15539 		*size_u    = vm_sanitize_wrap_size(entry->vme_end - start);
15540 
15541 		vm_map_unlock_read(map);
15542 		return KERN_SUCCESS;
15543 	}
15544 	default:
15545 		return KERN_INVALID_ARGUMENT;
15546 	}
15547 }
15548 
15549 #define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
15550 	MIN((entry_size),                                               \
15551 	    ((obj)->all_reusable ?                                      \
15552 	     (obj)->wired_page_count :                                  \
15553 	     (obj)->resident_page_count - (obj)->reusable_page_count))
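/*
 * Worked example (illustrative numbers only): for a map entry spanning 16
 * pages of an object with 10 resident pages of which 3 are reusable, the
 * macro reports MIN(16, 10 - 3) = 7 pages; if the object is all_reusable,
 * only its wired pages are counted instead.
 */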
15554 
15555 void
15556 vm_map_region_top_walk(
15557 	vm_map_entry_t             entry,
15558 	vm_region_top_info_t       top)
15559 {
15560 	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15561 		top->share_mode = SM_EMPTY;
15562 		top->ref_count = 0;
15563 		top->obj_id = 0;
15564 		return;
15565 	}
15566 
15567 	{
15568 		struct  vm_object *obj, *tmp_obj;
15569 		int             ref_count;
15570 		uint32_t        entry_size;
15571 
15572 		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15573 
15574 		obj = VME_OBJECT(entry);
15575 
15576 		vm_object_lock(obj);
15577 
15578 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15579 		    obj->paging_in_progress) {
15580 			ref_count--;
15581 		}
15582 
15583 		assert(obj->reusable_page_count <= obj->resident_page_count);
15584 		if (obj->shadow) {
15585 			if (ref_count == 1) {
15586 				top->private_pages_resident =
15587 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15588 			} else {
15589 				top->shared_pages_resident =
15590 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15591 			}
15592 			top->ref_count  = ref_count;
15593 			top->share_mode = SM_COW;
15594 
15595 			while ((tmp_obj = obj->shadow)) {
15596 				vm_object_lock(tmp_obj);
15597 				vm_object_unlock(obj);
15598 				obj = tmp_obj;
15599 
15600 				if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15601 				    obj->paging_in_progress) {
15602 					ref_count--;
15603 				}
15604 
15605 				assert(obj->reusable_page_count <= obj->resident_page_count);
15606 				top->shared_pages_resident +=
15607 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15608 				top->ref_count += ref_count - 1;
15609 			}
15610 		} else {
15611 			if (entry->superpage_size) {
15612 				top->share_mode = SM_LARGE_PAGE;
15613 				top->shared_pages_resident = 0;
15614 				top->private_pages_resident = entry_size;
15615 			} else if (entry->needs_copy) {
15616 				top->share_mode = SM_COW;
15617 				top->shared_pages_resident =
15618 				    OBJ_RESIDENT_COUNT(obj, entry_size);
15619 			} else {
15620 				if (ref_count == 1 ||
15621 				    (ref_count == 2 && obj->named)) {
15622 					top->share_mode = SM_PRIVATE;
15623 					top->private_pages_resident =
15624 					    OBJ_RESIDENT_COUNT(obj,
15625 					    entry_size);
15626 				} else {
15627 					top->share_mode = SM_SHARED;
15628 					top->shared_pages_resident =
15629 					    OBJ_RESIDENT_COUNT(obj,
15630 					    entry_size);
15631 				}
15632 			}
15633 			top->ref_count = ref_count;
15634 		}
15635 
15636 		vm_object_unlock(obj);
15637 
15638 		/* XXX K64: obj_id will be truncated */
15639 		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15640 	}
15641 }
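/*
 * Illustrative usage (not part of this file): the structure filled in above
 * is requested from userspace with the VM_REGION_TOP_INFO flavor of
 * mach_vm_region().  A minimal sketch, with error handling elided:
 */
#if 0   /* example only, not compiled */
#include <mach/mach.h>
#include <mach/mach_vm.h>

static kern_return_t
query_top_info(mach_port_t task, mach_vm_address_t addr,
    vm_region_top_info_data_t *out)
{
	mach_vm_address_t address = addr;
	mach_vm_size_t size = 0;
	mach_msg_type_number_t count = VM_REGION_TOP_INFO_COUNT;
	mach_port_t object_name = MACH_PORT_NULL;

	return mach_vm_region(task, &address, &size, VM_REGION_TOP_INFO,
	           (vm_region_info_t)out, &count, &object_name);
}
#endif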
15642 
15643 void
15644 vm_map_region_walk(
15645 	vm_map_t                        map,
15646 	vm_map_offset_t                 va,
15647 	vm_map_entry_t                  entry,
15648 	vm_object_offset_t              offset,
15649 	vm_object_size_t                range,
15650 	vm_region_extended_info_t       extended,
15651 	boolean_t                       look_for_pages,
15652 	mach_msg_type_number_t count)
15653 {
15654 	struct vm_object *obj, *tmp_obj;
15655 	vm_map_offset_t       last_offset;
15656 	int               i;
15657 	int               ref_count;
15658 	struct vm_object        *shadow_object;
15659 	unsigned short          shadow_depth;
15660 	boolean_t         do_region_footprint;
15661 	int                     effective_page_size, effective_page_shift;
15662 	vm_map_offset_t         effective_page_mask;
15663 
15664 	do_region_footprint = task_self_region_footprint();
15665 
15666 	if ((entry->is_sub_map) ||
15667 	    (VME_OBJECT(entry) == 0) ||
15668 	    (VME_OBJECT(entry)->phys_contiguous &&
15669 	    !entry->superpage_size)) {
15670 		extended->share_mode = SM_EMPTY;
15671 		extended->ref_count = 0;
15672 		return;
15673 	}
15674 
15675 	if (entry->superpage_size) {
15676 		extended->shadow_depth = 0;
15677 		extended->share_mode = SM_LARGE_PAGE;
15678 		extended->ref_count = 1;
15679 		extended->external_pager = 0;
15680 
15681 		/* TODO4K: Superpage in 4k mode? */
15682 		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15683 		extended->shadow_depth = 0;
15684 		return;
15685 	}
15686 
15687 	effective_page_shift = vm_self_region_page_shift(map);
15688 	effective_page_size = (1 << effective_page_shift);
15689 	effective_page_mask = effective_page_size - 1;
15690 
15691 	offset = vm_map_trunc_page(offset, effective_page_mask);
15692 
15693 	obj = VME_OBJECT(entry);
15694 
15695 	vm_object_lock(obj);
15696 
15697 	if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15698 	    obj->paging_in_progress) {
15699 		ref_count--;
15700 	}
15701 
15702 	if (look_for_pages) {
15703 		for (last_offset = offset + range;
15704 		    offset < last_offset;
15705 		    offset += effective_page_size, va += effective_page_size) {
15706 			if (do_region_footprint) {
15707 				int disp;
15708 
15709 				disp = 0;
15710 				if (map->has_corpse_footprint) {
15711 					/*
15712 					 * Query the page info data we saved
15713 					 * while forking the corpse.
15714 					 */
15715 					vm_map_corpse_footprint_query_page_info(
15716 						map,
15717 						va,
15718 						&disp);
15719 				} else {
15720 					/*
15721 					 * Query the pmap.
15722 					 */
15723 					vm_map_footprint_query_page_info(
15724 						map,
15725 						entry,
15726 						va,
15727 						&disp);
15728 				}
15729 				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15730 					extended->pages_resident++;
15731 				}
15732 				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15733 					extended->pages_reusable++;
15734 				}
15735 				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15736 					extended->pages_dirtied++;
15737 				}
15738 				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15739 					extended->pages_swapped_out++;
15740 				}
15741 				continue;
15742 			}
15743 
15744 			vm_map_region_look_for_page(map, va, obj,
15745 			    vm_object_trunc_page(offset), ref_count,
15746 			    0, extended, count);
15747 		}
15748 
15749 		if (do_region_footprint) {
15750 			goto collect_object_info;
15751 		}
15752 	} else {
15753 collect_object_info:
15754 		shadow_object = obj->shadow;
15755 		shadow_depth = 0;
15756 
15757 		if (!(obj->internal)) {
15758 			extended->external_pager = 1;
15759 		}
15760 
15761 		if (shadow_object != VM_OBJECT_NULL) {
15762 			vm_object_lock(shadow_object);
15763 			for (;
15764 			    shadow_object != VM_OBJECT_NULL;
15765 			    shadow_depth++) {
15766 				vm_object_t     next_shadow;
15767 
15768 				if (!(shadow_object->internal)) {
15769 					extended->external_pager = 1;
15770 				}
15771 
15772 				next_shadow = shadow_object->shadow;
15773 				if (next_shadow) {
15774 					vm_object_lock(next_shadow);
15775 				}
15776 				vm_object_unlock(shadow_object);
15777 				shadow_object = next_shadow;
15778 			}
15779 		}
15780 		extended->shadow_depth = shadow_depth;
15781 	}
15782 
15783 	if (extended->shadow_depth || entry->needs_copy) {
15784 		extended->share_mode = SM_COW;
15785 	} else {
15786 		if (ref_count == 1) {
15787 			extended->share_mode = SM_PRIVATE;
15788 		} else {
15789 			if (obj->true_share) {
15790 				extended->share_mode = SM_TRUESHARED;
15791 			} else {
15792 				extended->share_mode = SM_SHARED;
15793 			}
15794 		}
15795 	}
15796 	extended->ref_count = ref_count - extended->shadow_depth;
15797 
15798 	for (i = 0; i < extended->shadow_depth; i++) {
15799 		if ((tmp_obj = obj->shadow) == 0) {
15800 			break;
15801 		}
15802 		vm_object_lock(tmp_obj);
15803 		vm_object_unlock(obj);
15804 
15805 		if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
15806 		    tmp_obj->paging_in_progress) {
15807 			ref_count--;
15808 		}
15809 
15810 		extended->ref_count += ref_count;
15811 		obj = tmp_obj;
15812 	}
15813 	vm_object_unlock(obj);
15814 
15815 	if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15816 		extended->share_mode = SM_PRIVATE;
15817 	} else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
15818 		vm_map_entry_t       cur;
15819 		vm_map_entry_t       last;
15820 		int      my_refs;
15821 
15822 		obj = VME_OBJECT(entry);
15823 		last = vm_map_to_entry(map);
15824 		my_refs = 0;
15825 
15826 		if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15827 		    obj->paging_in_progress) {
15828 			ref_count--;
15829 		}
15830 		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15831 			if (vm_map_region_has_obj_ref(cur, obj)) {
15832 				my_refs++;
15833 			}
15834 		}
15835 
15836 		if (my_refs == ref_count) {
15837 			extended->share_mode = SM_PRIVATE_ALIASED;
15838 		} else if (my_refs > 1) {
15839 			extended->share_mode = SM_SHARED_ALIASED;
15840 		}
15841 	}
15842 }
15843 
15844 
15845 /* object is locked on entry and locked on return */
15846 
15847 
15848 static void
15849 vm_map_region_look_for_page(
15850 	__unused vm_map_t               map,
15851 	__unused vm_map_offset_t        va,
15852 	vm_object_t                     object,
15853 	vm_object_offset_t              offset,
15854 	int                             max_refcnt,
15855 	unsigned short                  depth,
15856 	vm_region_extended_info_t       extended,
15857 	mach_msg_type_number_t count)
15858 {
15859 	vm_page_t       p;
15860 	vm_object_t     shadow;
15861 	int             ref_count;
15862 	vm_object_t     caller_object;
15863 
15864 	shadow = object->shadow;
15865 	caller_object = object;
15866 
15867 
15868 	while (TRUE) {
15869 		if (!(object->internal)) {
15870 			extended->external_pager = 1;
15871 		}
15872 
15873 		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
15874 			if (shadow && (max_refcnt == 1)) {
15875 				extended->pages_shared_now_private++;
15876 			}
15877 
15878 			if (!p->vmp_fictitious &&
15879 			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
15880 				extended->pages_dirtied++;
15881 			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
15882 				if (p->vmp_reusable || object->all_reusable) {
15883 					extended->pages_reusable++;
15884 				}
15885 			}
15886 
15887 			extended->pages_resident++;
15888 
15889 			if (object != caller_object) {
15890 				vm_object_unlock(object);
15891 			}
15892 
15893 			return;
15894 		}
15895 		if (object->internal &&
15896 		    object->alive &&
15897 		    !object->terminating &&
15898 		    object->pager_ready) {
15899 			if (vm_object_compressor_pager_state_get(object, offset)
15900 			    == VM_EXTERNAL_STATE_EXISTS) {
15901 				/* the pager has that page */
15902 				extended->pages_swapped_out++;
15903 				if (object != caller_object) {
15904 					vm_object_unlock(object);
15905 				}
15906 				return;
15907 			}
15908 		}
15909 
15910 		if (shadow) {
15911 			vm_object_lock(shadow);
15912 			if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
15913 			    shadow->paging_in_progress) {
15914 				ref_count--;
15915 			}
15916 
15917 			if (++depth > extended->shadow_depth) {
15918 				extended->shadow_depth = depth;
15919 			}
15920 
15921 			if (ref_count > max_refcnt) {
15922 				max_refcnt = ref_count;
15923 			}
15924 
15925 			if (object != caller_object) {
15926 				vm_object_unlock(object);
15927 			}
15928 
15929 			offset = offset + object->vo_shadow_offset;
15930 			object = shadow;
15931 			shadow = object->shadow;
15932 			continue;
15933 		}
15934 		if (object != caller_object) {
15935 			vm_object_unlock(object);
15936 		}
15937 		break;
15938 	}
15939 }
15940 
15941 static inline boolean_t
15942 vm_map_region_has_obj_ref(
15943 	vm_map_entry_t    entry,
15944 	vm_object_t       object)
15945 {
15946 	vm_object_t cur_obj;
15947 	vm_object_t shadow_obj;
15948 
15949 	if (entry->is_sub_map) {
15950 		return FALSE;
15951 	}
15952 
15953 	cur_obj = VME_OBJECT(entry);
15954 	if (cur_obj == VM_OBJECT_NULL) {
15955 		return FALSE;
15956 	} else if (cur_obj == object) {
15957 		return TRUE;
15958 	}
15959 
15960 	/*
15961 	 * Avoid taking locks for the first shadow check; otherwise diagnostic tools
15962 	 * would spend most of their time obtaining locks in this function when
15963 	 * analyzing processes with many VM entries, which commonly have no shadow chain.
15964 	 *
15965 	 * This is acceptable because:
15966 	 *  - Shadow's fields are not accessed outside of its lock
15967 	 *  - Objects are unlikely to be modified due to:
15968 	 *	  - Many diagnostic tools suspend the task
15969 	 *	  - VM map is locked
15970 	 *	- The rare incorrect return from this function turns a guess into a
15971 	 *	  slightly worse guess
15972 	 *	- Entire shadow chain is not locked as a whole, so can still change
15973 	 *	  while traversing, resulting in incorrect guess even with locking
15974 	 */
15975 	shadow_obj = cur_obj->shadow;
15976 	if (shadow_obj == VM_OBJECT_NULL) {
15977 		return FALSE;
15978 	} else if (shadow_obj == object) {
15979 		return TRUE;
15980 	}
15981 
15982 	vm_object_lock(cur_obj);
15983 
15984 	while ((shadow_obj = cur_obj->shadow)) {
15985 		/* check if object was found before grabbing a lock */
15986 		if (shadow_obj == object) {
15987 			vm_object_unlock(cur_obj);
15988 			return TRUE;
15989 		}
15990 
15991 		vm_object_lock(shadow_obj);
15992 		vm_object_unlock(cur_obj);
15993 		cur_obj = shadow_obj;
15994 	}
15995 
15996 	/* exhausted the shadow chain */
15997 	vm_object_unlock(cur_obj);
15998 	return FALSE;
15999 }
16000 
16001 
16002 /*
16003  *	Routine:	vm_map_simplify
16004  *
16005  *	Description:
16006  *		Attempt to simplify the map representation in
16007  *		the vicinity of the given starting address.
16008  *	Note:
16009  *		This routine is intended primarily to keep the
16010  *		kernel maps more compact -- they generally don't
16011  *		benefit from the "expand a map entry" technology
16012  *		at allocation time because the adjacent entry
16013  *		is often wired down.
16014  */
16015 void
16016 vm_map_simplify_entry(
16017 	vm_map_t        map,
16018 	vm_map_entry_t  this_entry)
16019 {
16020 	vm_map_entry_t  prev_entry;
16021 
16022 	prev_entry = this_entry->vme_prev;
16023 
16024 	if ((this_entry != vm_map_to_entry(map)) &&
16025 	    (prev_entry != vm_map_to_entry(map)) &&
16026 
16027 	    (prev_entry->vme_end == this_entry->vme_start) &&
16028 
16029 	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16030 	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16031 	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16032 	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16033 	    prev_entry->vme_start))
16034 	    == VME_OFFSET(this_entry)) &&
16035 
16036 	    (prev_entry->behavior == this_entry->behavior) &&
16037 	    (prev_entry->needs_copy == this_entry->needs_copy) &&
16038 	    (prev_entry->protection == this_entry->protection) &&
16039 	    (prev_entry->max_protection == this_entry->max_protection) &&
16040 	    (prev_entry->inheritance == this_entry->inheritance) &&
16041 	    (prev_entry->use_pmap == this_entry->use_pmap) &&
16042 	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16043 	    (prev_entry->no_cache == this_entry->no_cache) &&
16044 	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16045 	    (prev_entry->map_aligned == this_entry->map_aligned) &&
16046 	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16047 	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16048 #if __arm64e__
16049 	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16050 #endif
16051 	    (prev_entry->csm_associated == this_entry->csm_associated) &&
16052 	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16053 	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16054 	    (prev_entry->vme_resilient_codesign ==
16055 	    this_entry->vme_resilient_codesign) &&
16056 	    (prev_entry->vme_resilient_media ==
16057 	    this_entry->vme_resilient_media) &&
16058 	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16059 	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16060 
16061 	    (prev_entry->wired_count == this_entry->wired_count) &&
16062 	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16063 
16064 	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16065 	    (prev_entry->in_transition == FALSE) &&
16066 	    (this_entry->in_transition == FALSE) &&
16067 	    (prev_entry->needs_wakeup == FALSE) &&
16068 	    (this_entry->needs_wakeup == FALSE) &&
16069 	    (prev_entry->is_shared == this_entry->is_shared) &&
16070 	    (prev_entry->superpage_size == FALSE) &&
16071 	    (this_entry->superpage_size == FALSE)
16072 	    ) {
16073 		if (prev_entry->vme_permanent) {
16074 			assert(this_entry->vme_permanent);
16075 			prev_entry->vme_permanent = false;
16076 		}
16077 		vm_map_store_entry_unlink(map, prev_entry, true);
16078 		assert(prev_entry->vme_start < this_entry->vme_end);
16079 		if (prev_entry->map_aligned) {
16080 			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16081 			    VM_MAP_PAGE_MASK(map)));
16082 		}
16083 		this_entry->vme_start = prev_entry->vme_start;
16084 		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16085 
16086 		if (map->holelistenabled) {
16087 			vm_map_store_update_first_free(map, this_entry, TRUE);
16088 		}
16089 
16090 		if (prev_entry->is_sub_map) {
16091 			vm_map_deallocate(VME_SUBMAP(prev_entry));
16092 		} else {
16093 			vm_object_deallocate(VME_OBJECT(prev_entry));
16094 		}
16095 		vm_map_entry_dispose(prev_entry);
16096 		SAVE_HINT_MAP_WRITE(map, this_entry);
16097 	}
16098 }
16099 
16100 void
16101 vm_map_simplify(
16102 	vm_map_t        map,
16103 	vm_map_offset_t start)
16104 {
16105 	vm_map_entry_t  this_entry;
16106 
16107 	vm_map_lock(map);
16108 	if (vm_map_lookup_entry(map, start, &this_entry)) {
16109 		vm_map_simplify_entry(map, this_entry);
16110 		vm_map_simplify_entry(map, this_entry->vme_next);
16111 	}
16112 	vm_map_unlock(map);
16113 }
16114 
16115 static void
16116 vm_map_simplify_range(
16117 	vm_map_t        map,
16118 	vm_map_offset_t start,
16119 	vm_map_offset_t end)
16120 {
16121 	vm_map_entry_t  entry;
16122 
16123 	/*
16124 	 * The map should be locked (for "write") by the caller.
16125 	 */
16126 
16127 	if (start >= end) {
16128 		/* invalid address range */
16129 		return;
16130 	}
16131 
16132 	start = vm_map_trunc_page(start,
16133 	    VM_MAP_PAGE_MASK(map));
16134 	end = vm_map_round_page(end,
16135 	    VM_MAP_PAGE_MASK(map));
16136 
16137 	if (!vm_map_lookup_entry(map, start, &entry)) {
16138 		/* "start" is not mapped and "entry" ends before "start" */
16139 		if (entry == vm_map_to_entry(map)) {
16140 			/* start with first entry in the map */
16141 			entry = vm_map_first_entry(map);
16142 		} else {
16143 			/* start with next entry */
16144 			entry = entry->vme_next;
16145 		}
16146 	}
16147 
16148 	while (entry != vm_map_to_entry(map) &&
16149 	    entry->vme_start <= end) {
16150 		/* try and coalesce "entry" with its previous entry */
16151 		vm_map_simplify_entry(map, entry);
16152 		entry = entry->vme_next;
16153 	}
16154 }
16155 
16156 static __attribute__((always_inline, warn_unused_result))
16157 kern_return_t
16158 vm_map_machine_attribute_sanitize(
16159 	vm_map_t                map,
16160 	vm_map_offset_ut        start_u,
16161 	vm_map_offset_ut        end_u,
16162 	mach_vm_offset_t       *start,
16163 	mach_vm_offset_t       *end,
16164 	vm_map_size_t          *size)
16165 {
16166 	return vm_sanitize_addr_end(start_u, end_u,
16167 	           VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16168 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
16169 	           size);
16170 }
16171 
16172 
16173 /*
16174  *	Routine:	vm_map_machine_attribute
16175  *	Purpose:
16176  *		Provide machine-specific attributes to mappings,
16177  *		such as cacheability, etc., for machines that provide
16178  *		them.  NUMA architectures and machines with big/strange
16179  *		caches will use this.
16180  *	Note:
16181  *		Responsibilities for locking and checking are handled here,
16182  *		everything else in the pmap module. If any non-volatile
16183  *		information must be kept, the pmap module should handle
16184  *		it itself. [This assumes that attributes do not
16185  *		need to be inherited, which seems ok to me]
16186  */
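/*
 * Illustrative usage (not part of this file): userspace reaches this routine
 * through the vm_machine_attribute() MIG call.  A minimal sketch that asks
 * for a cache flush over a buffer in the caller's own task; on most systems
 * the supported interface for freshly written code is sys_icache_invalidate(),
 * so treat this purely as an example of the Mach call:
 */
#if 0   /* example only, not compiled */
#include <mach/mach.h>
#include <mach/vm_map.h>

static kern_return_t
flush_buffer_caches(vm_address_t buf, vm_size_t len)
{
	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;

	return vm_machine_attribute(mach_task_self(), buf, len,
	           MATTR_CACHE, &value);
}
#endif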
16187 kern_return_t
16188 vm_map_machine_attribute(
16189 	vm_map_t                map,
16190 	vm_map_offset_ut        start_u,
16191 	vm_map_offset_ut        end_u,
16192 	vm_machine_attribute_t  attribute,
16193 	vm_machine_attribute_val_t *value) /* IN/OUT */
16194 {
16195 	mach_vm_offset_t start, end;
16196 	vm_map_size_t    sync_size;
16197 	kern_return_t    ret;
16198 	vm_map_entry_t   entry;
16199 
16200 	ret = vm_map_machine_attribute_sanitize(map,
16201 	    start_u,
16202 	    end_u,
16203 	    &start,
16204 	    &end,
16205 	    &sync_size);
16206 	if (__improbable(ret != KERN_SUCCESS)) {
16207 		return vm_sanitize_get_kr(ret);
16208 	}
16209 
16210 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
16211 		return KERN_INVALID_ADDRESS;
16212 	}
16213 
16214 	vm_map_lock(map);
16215 
16216 	if (attribute != MATTR_CACHE) {
16217 		/* If we don't have to find physical addresses, we */
16218 		/* don't have to do an explicit traversal here.    */
16219 		ret = pmap_attribute(map->pmap, start, end - start,
16220 		    attribute, value);
16221 		vm_map_unlock(map);
16222 		return ret;
16223 	}
16224 
16225 	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */
16226 
16227 	while (sync_size) {
16228 		if (vm_map_lookup_entry(map, start, &entry)) {
16229 			vm_map_size_t   sub_size;
16230 			if ((entry->vme_end - start) > sync_size) {
16231 				sub_size = sync_size;
16232 				sync_size = 0;
16233 			} else {
16234 				sub_size = entry->vme_end - start;
16235 				sync_size -= sub_size;
16236 			}
16237 			if (entry->is_sub_map) {
16238 				vm_map_offset_t sub_start;
16239 				vm_map_offset_t sub_end;
16240 
16241 				sub_start = (start - entry->vme_start)
16242 				    + VME_OFFSET(entry);
16243 				sub_end = sub_start + sub_size;
16244 				vm_map_machine_attribute(
16245 					VME_SUBMAP(entry),
16246 					sub_start,
16247 					sub_end,
16248 					attribute, value);
16249 			} else if (VME_OBJECT(entry)) {
16250 				vm_page_t               m;
16251 				vm_object_t             object;
16252 				vm_object_t             base_object;
16253 				vm_object_t             last_object;
16254 				vm_object_offset_t      offset;
16255 				vm_object_offset_t      base_offset;
16256 				vm_map_size_t           range;
16257 				range = sub_size;
16258 				offset = (start - entry->vme_start)
16259 				    + VME_OFFSET(entry);
16260 				offset = vm_object_trunc_page(offset);
16261 				base_offset = offset;
16262 				object = VME_OBJECT(entry);
16263 				base_object = object;
16264 				last_object = NULL;
16265 
16266 				vm_object_lock(object);
16267 
16268 				while (range) {
16269 					m = vm_page_lookup(
16270 						object, offset);
16271 
16272 					if (m && !m->vmp_fictitious) {
16273 						ret =
16274 						    pmap_attribute_cache_sync(
16275 							VM_PAGE_GET_PHYS_PAGE(m),
16276 							PAGE_SIZE,
16277 							attribute, value);
16278 					} else if (object->shadow) {
16279 						offset = offset + object->vo_shadow_offset;
16280 						last_object = object;
16281 						object = object->shadow;
16282 						vm_object_lock(last_object->shadow);
16283 						vm_object_unlock(last_object);
16284 						continue;
16285 					}
16286 					if (range < PAGE_SIZE) {
16287 						range = 0;
16288 					} else {
16289 						range -= PAGE_SIZE;
16290 					}
16291 
16292 					if (base_object != object) {
16293 						vm_object_unlock(object);
16294 						vm_object_lock(base_object);
16295 						object = base_object;
16296 					}
16297 					/* Bump to the next page */
16298 					base_offset += PAGE_SIZE;
16299 					offset = base_offset;
16300 				}
16301 				vm_object_unlock(object);
16302 			}
16303 			start += sub_size;
16304 		} else {
16305 			vm_map_unlock(map);
16306 			return KERN_FAILURE;
16307 		}
16308 	}
16309 
16310 	vm_map_unlock(map);
16311 
16312 	return ret;
16313 }
16314 
16315 /*
16316  *	vm_map_behavior_set:
16317  *
16318  *	Sets the paging reference behavior of the specified address
16319  *	range in the target map.  Paging reference behavior affects
16320  *	how pagein operations resulting from faults on the map will be
16321  *	clustered.
16322  */
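/*
 * Illustrative usage (not part of this file): these behaviors usually arrive
 * from userspace via madvise(2), whose hints are translated into the
 * VM_BEHAVIOR_* values handled below.  A minimal sketch, assuming a Darwin
 * userland where MADV_FREE_REUSABLE is available:
 */
#if 0   /* example only, not compiled */
#include <sys/mman.h>

static int
advise_buffer(void *addr, size_t len)
{
	/* sequential access hint -> VM_BEHAVIOR_SEQUENTIAL */
	if (madvise(addr, len, MADV_SEQUENTIAL) != 0) {
		return -1;
	}
	/* mark the pages reusable -> VM_BEHAVIOR_REUSABLE */
	return madvise(addr, len, MADV_FREE_REUSABLE);
}
#endif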
16323 kern_return_t
16324 vm_map_behavior_set(
16325 	vm_map_t        map,
16326 	vm_map_offset_t start,
16327 	vm_map_offset_t end,
16328 	vm_behavior_t   new_behavior)
16329 {
16330 	vm_map_entry_t  entry;
16331 	vm_map_entry_t  temp_entry;
16332 
16333 	if (start > end ||
16334 	    start < vm_map_min(map) ||
16335 	    end > vm_map_max(map)) {
16336 		return KERN_NO_SPACE;
16337 	}
16338 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16339 		return KERN_INVALID_ADDRESS;
16340 	}
16341 
16342 	switch (new_behavior) {
16343 	/*
16344 	 * This first block of behaviors all set a persistent state on the specified
16345 	 * memory range.  All we have to do here is to record the desired behavior
16346 	 * in the vm_map_entry_t's.
16347 	 */
16348 
16349 	case VM_BEHAVIOR_DEFAULT:
16350 	case VM_BEHAVIOR_RANDOM:
16351 	case VM_BEHAVIOR_SEQUENTIAL:
16352 	case VM_BEHAVIOR_RSEQNTL:
16353 	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16354 		vm_map_lock(map);
16355 
16356 		/*
16357 		 *	The entire address range must be valid for the map.
16358 		 *      Note that vm_map_range_check() does a
16359 		 *	vm_map_lookup_entry() internally and returns the
16360 		 *	entry containing the start of the address range if
16361 		 *	the entire range is valid.
16362 		 */
16363 		if (vm_map_range_check(map, start, end, &temp_entry)) {
16364 			entry = temp_entry;
16365 			vm_map_clip_start(map, entry, start);
16366 		} else {
16367 			vm_map_unlock(map);
16368 			return KERN_INVALID_ADDRESS;
16369 		}
16370 
16371 		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16372 			vm_map_clip_end(map, entry, end);
16373 			if (entry->is_sub_map) {
16374 				assert(!entry->use_pmap);
16375 			}
16376 
16377 			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16378 				entry->zero_wired_pages = TRUE;
16379 			} else {
16380 				entry->behavior = new_behavior;
16381 			}
16382 			entry = entry->vme_next;
16383 		}
16384 
16385 		vm_map_unlock(map);
16386 		break;
16387 
16388 	/*
16389 	 * The rest of these are different from the above in that they cause
16390 	 * an immediate action to take place as opposed to setting a behavior that
16391 	 * affects future actions.
16392 	 */
16393 
16394 	case VM_BEHAVIOR_WILLNEED:
16395 		return vm_map_willneed(map, start, end);
16396 
16397 	case VM_BEHAVIOR_DONTNEED:
16398 		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16399 
16400 	case VM_BEHAVIOR_FREE:
16401 		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16402 
16403 	case VM_BEHAVIOR_REUSABLE:
16404 		return vm_map_reusable_pages(map, start, end);
16405 
16406 	case VM_BEHAVIOR_REUSE:
16407 		return vm_map_reuse_pages(map, start, end);
16408 
16409 	case VM_BEHAVIOR_CAN_REUSE:
16410 		return vm_map_can_reuse(map, start, end);
16411 
16412 #if MACH_ASSERT
16413 	case VM_BEHAVIOR_PAGEOUT:
16414 		return vm_map_pageout(map, start, end);
16415 #endif /* MACH_ASSERT */
16416 
16417 	case VM_BEHAVIOR_ZERO:
16418 		return vm_map_zero(map, start, end);
16419 
16420 	default:
16421 		return KERN_INVALID_ARGUMENT;
16422 	}
16423 
16424 	return KERN_SUCCESS;
16425 }
16426 
16427 
16428 /*
16429  * Internals for madvise(MADV_WILLNEED) system call.
16430  *
16431  * The implementation is to:
16432  * a) read ahead if the mapping corresponds to a mapped regular file, or
16433  * b) fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping
16434  */
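/*
 * Illustrative usage (not part of this file): a minimal userspace sketch of
 * the madvise(MADV_WILLNEED) path described above, mapping a file and asking
 * for read-ahead; edge cases (very large files, signal handling) are elided:
 */
#if 0   /* example only, not compiled */
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stddef.h>

static void *
map_and_prefetch(const char *path, size_t *out_len)
{
	struct stat st;
	void *p;
	int fd = open(path, O_RDONLY);

	if (fd < 0 || fstat(fd, &st) != 0 || st.st_size == 0) {
		if (fd >= 0) {
			close(fd);
		}
		return NULL;
	}
	p = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	close(fd);
	if (p == MAP_FAILED) {
		return NULL;
	}
	/* advisory only: a failed madvise() does not invalidate the mapping */
	(void)madvise(p, (size_t)st.st_size, MADV_WILLNEED);
	*out_len = (size_t)st.st_size;
	return p;
}
#endif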
16435 
16436 
16437 static kern_return_t
16438 vm_map_willneed(
16439 	vm_map_t        map,
16440 	vm_map_offset_t start,
16441 	vm_map_offset_t end
16442 	)
16443 {
16444 	vm_map_entry_t                  entry;
16445 	vm_object_t                     object;
16446 	memory_object_t                 pager;
16447 	struct vm_object_fault_info     fault_info = {};
16448 	kern_return_t                   kr;
16449 	vm_object_size_t                len;
16450 	vm_object_offset_t              offset;
16451 
16452 	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16453 	    task_pid(current_task()), start, end);
16454 	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
16455 	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
16456 	fault_info.stealth       = TRUE;
16457 
16458 	/*
16459 	 * The MADV_WILLNEED operation doesn't require any changes to the
16460 	 * vm_map_entry_t's, so the read lock is sufficient.
16461 	 */
16462 
16463 	vm_map_lock_read(map);
16464 
16465 	/*
16466 	 * The madvise semantics require that the address range be fully
16467 	 * allocated with no holes.  Otherwise, we're required to return
16468 	 * an error.
16469 	 */
16470 
16471 	if (!vm_map_range_check(map, start, end, &entry)) {
16472 		vm_map_unlock_read(map);
16473 		KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16474 		    task_pid(current_task()), start, KERN_INVALID_ADDRESS);
16475 		return KERN_INVALID_ADDRESS;
16476 	}
16477 
16478 	/*
16479 	 * Examine each vm_map_entry_t in the range.
16480 	 */
16481 	for (; entry != vm_map_to_entry(map) && start < end;) {
16482 		/*
16483 		 * The first time through, the start address could be anywhere
16484 		 * within the vm_map_entry we found.  So adjust the offset to
16485 		 * correspond.  After that, the offset will always be zero to
16486 		 * correspond to the beginning of the current vm_map_entry.
16487 		 */
16488 		offset = (start - entry->vme_start) + VME_OFFSET(entry);
16489 
16490 		/*
16491 		 * Set the length so we don't go beyond the end of the
16492 		 * map_entry or beyond the end of the range we were given.
16493 		 * This range could also span multiple map entries, all of which
16494 		 * map different files, so make sure we only do the right amount
16495 		 * of I/O for each object.  Note that it's possible for there
16496 		 * to be multiple map entries all referring to the same object
16497 		 * but with different page permissions, but it's not worth
16498 		 * trying to optimize that case.
16499 		 */
16500 		len = MIN(entry->vme_end - start, end - start);
16501 
16502 		if ((vm_size_t) len != len) {
16503 			/* 32-bit overflow */
16504 			len = (vm_size_t) (0 - PAGE_SIZE);
16505 		}
16506 		fault_info.cluster_size = (vm_size_t) len;
16507 		fault_info.lo_offset    = offset;
16508 		fault_info.hi_offset    = offset + len;
16509 		fault_info.user_tag     = VME_ALIAS(entry);
16510 		fault_info.pmap_options = 0;
16511 		if (entry->iokit_acct ||
16512 		    (!entry->is_sub_map && !entry->use_pmap)) {
16513 			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16514 		}
16515 		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16516 
16517 		/*
16518 		 * If the entry is a submap OR there's no read permission
16519 		 * to this mapping, then just skip it.
16520 		 */
16521 		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16522 			entry = entry->vme_next;
16523 			start = entry->vme_start;
16524 			continue;
16525 		}
16526 
16527 		object = VME_OBJECT(entry);
16528 
16529 		if (object == NULL ||
16530 		    (object && object->internal)) {
16531 			/*
16532 			 * Memory range backed by anonymous memory.
16533 			 */
16534 			vm_size_t region_size = 0, effective_page_size = 0;
16535 			vm_map_offset_t addr = 0, effective_page_mask = 0;
16536 
16537 			region_size = len;
16538 			addr = start;
16539 
16540 			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16541 			effective_page_size = effective_page_mask + 1;
16542 
16543 			vm_map_unlock_read(map);
16544 
16545 			while (region_size) {
16546 				vm_pre_fault(
16547 					vm_map_trunc_page(addr, effective_page_mask),
16548 					VM_PROT_READ | VM_PROT_WRITE);
16549 
16550 				region_size -= effective_page_size;
16551 				addr += effective_page_size;
16552 			}
16553 		} else {
16554 			/*
16555 			 * Find the file object backing this map entry.  If there is
16556 			 * none, then we simply ignore the "will need" advice for this
16557 			 * entry and go on to the next one.
16558 			 */
16559 			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16560 				entry = entry->vme_next;
16561 				start = entry->vme_start;
16562 				continue;
16563 			}
16564 
16565 			vm_object_paging_begin(object);
16566 			pager = object->pager;
16567 			vm_object_unlock(object);
16568 
16569 			/*
16570 			 * The data_request() could take a long time, so let's
16571 			 * release the map lock to avoid blocking other threads.
16572 			 */
16573 			vm_map_unlock_read(map);
16574 
16575 			/*
16576 			 * Get the data from the object asynchronously.
16577 			 *
16578 			 * Note that memory_object_data_request() places limits on the
16579 			 * amount of I/O it will do.  Regardless of the len we
16580 			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16581 			 * silently truncates the len to that size.  This isn't
16582 			 * necessarily bad since madvise shouldn't really be used to
16583 			 * page in unlimited amounts of data.  Other Unix variants
16584 			 * limit the willneed case as well.  If this turns out to be an
16585 			 * issue for developers, then we can always adjust the policy
16586 			 * here and still be backwards compatible since this is all
16587 			 * just "advice".
16588 			 */
16589 			kr = memory_object_data_request(
16590 				pager,
16591 				vm_object_trunc_page(offset) + object->paging_offset,
16592 				0,      /* ignored */
16593 				VM_PROT_READ,
16594 				(memory_object_fault_info_t)&fault_info);
16595 
16596 			vm_object_lock(object);
16597 			vm_object_paging_end(object);
16598 			vm_object_unlock(object);
16599 
16600 			/*
16601 			 * If we couldn't do the I/O for some reason, just give up on
16602 			 * the madvise.  We still return success to the user since
16603 			 * madvise isn't supposed to fail when the advice can't be
16604 			 * taken.
16605 			 */
16606 
16607 			if (kr != KERN_SUCCESS) {
16608 				KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16609 				    task_pid(current_task()), start, kr);
16610 				return KERN_SUCCESS;
16611 			}
16612 		}
16613 
16614 		start += len;
16615 		if (start >= end) {
16616 			/* done */
16617 			KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16618 			    task_pid(current_task()), start, KERN_SUCCESS);
16619 			return KERN_SUCCESS;
16620 		}
16621 
16622 		/* look up next entry */
16623 		vm_map_lock_read(map);
16624 		if (!vm_map_lookup_entry(map, start, &entry)) {
16625 			/*
16626 			 * There's a new hole in the address range.
16627 			 */
16628 			vm_map_unlock_read(map);
16629 			KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16630 			    task_pid(current_task()), start, KERN_INVALID_ADDRESS);
16631 			return KERN_INVALID_ADDRESS;
16632 		}
16633 	}
16634 
16635 	vm_map_unlock_read(map);
16636 	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16637 	    task_pid(current_task()), start, KERN_SUCCESS);
16638 	return KERN_SUCCESS;
16639 }
16640 
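/*
 * Decide whether a user map entry can have its pages treated as
 * "reusable".  Submaps never qualify.  Regions with a non-malloc alias
 * are left to the caller's judgement.  Malloc-tagged regions qualify only
 * if they are still in their default state (default protection and
 * inheritance, not wired, not permanent, no superpages) and are backed by
 * a private, non-purgeable, non-code-signed internal object, or by no
 * object at all.
 */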
16641 static boolean_t
16642 vm_map_entry_is_reusable(
16643 	vm_map_entry_t entry)
16644 {
16645 	/* Only user map entries */
16646 
16647 	vm_object_t object;
16648 
16649 	if (entry->is_sub_map) {
16650 		return FALSE;
16651 	}
16652 
16653 	switch (VME_ALIAS(entry)) {
16654 	case VM_MEMORY_MALLOC:
16655 	case VM_MEMORY_MALLOC_SMALL:
16656 	case VM_MEMORY_MALLOC_LARGE:
16657 	case VM_MEMORY_REALLOC:
16658 	case VM_MEMORY_MALLOC_TINY:
16659 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16660 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16661 		/*
16662 		 * This is a malloc() memory region: check if it's still
16663 		 * in its original state and can be re-used for more
16664 		 * malloc() allocations.
16665 		 */
16666 		break;
16667 	default:
16668 		/*
16669 		 * Not a malloc() memory region: let the caller decide if
16670 		 * it's re-usable.
16671 		 */
16672 		return TRUE;
16673 	}
16674 
16675 	if (/*entry->is_shared ||*/
16676 		entry->is_sub_map ||
16677 		entry->in_transition ||
16678 		entry->protection != VM_PROT_DEFAULT ||
16679 		entry->max_protection != VM_PROT_ALL ||
16680 		entry->inheritance != VM_INHERIT_DEFAULT ||
16681 		entry->no_cache ||
16682 		entry->vme_permanent ||
16683 		entry->superpage_size != FALSE ||
16684 		entry->zero_wired_pages ||
16685 		entry->wired_count != 0 ||
16686 		entry->user_wired_count != 0) {
16687 		return FALSE;
16688 	}
16689 
16690 	object = VME_OBJECT(entry);
16691 	if (object == VM_OBJECT_NULL) {
16692 		return TRUE;
16693 	}
16694 	if (
16695 #if 0
16696 		/*
16697 		 * Let's proceed even if the VM object is potentially
16698 		 * shared.
16699 		 * We check for this later when processing the actual
16700 		 * VM pages, so the contents will be safe if shared.
16701 		 *
16702 		 * But we can still mark this memory region as "reusable" to
16703 		 * acknowledge that the caller did let us know that the memory
16704 		 * could be re-used and should not be penalized for holding
16705 		 * on to it.  This allows its "resident size" to not include
16706 		 * the reusable range.
16707 		 */
16708 		object->ref_count == 1 &&
16709 #endif
16710 		object->vo_copy == VM_OBJECT_NULL &&
16711 		object->shadow == VM_OBJECT_NULL &&
16712 		object->internal &&
16713 		object->purgable == VM_PURGABLE_DENY &&
16714 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16715 		!object->code_signed) {
16716 		return TRUE;
16717 	}
16718 	return FALSE;
16719 }
16720 
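/*
 * Handle the "reuse" advice for [start, end): tell each entry's backing
 * VM object that its pages in the range are being re-used, and retag
 * MALLOC_LARGE_REUSABLE entries as MALLOC_LARGE_REUSED.  The whole range
 * must be allocated and every entry must pass vm_map_entry_is_reusable().
 */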
16721 static kern_return_t
16722 vm_map_reuse_pages(
16723 	vm_map_t        map,
16724 	vm_map_offset_t start,
16725 	vm_map_offset_t end)
16726 {
16727 	vm_map_entry_t                  entry;
16728 	vm_object_t                     object;
16729 	vm_object_offset_t              start_offset, end_offset;
16730 
16731 	/*
16732 	 * The MADV_REUSE operation doesn't require any changes to the
16733 	 * vm_map_entry_t's, so the read lock is sufficient.
16734 	 */
16735 
16736 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16737 		/*
16738 		 * XXX TODO4K
16739 		 * need to figure out what reusable means for a
16740 		 * portion of a native page.
16741 		 */
16742 		return KERN_SUCCESS;
16743 	}
16744 
16745 	vm_map_lock_read(map);
16746 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16747 
16748 	/*
16749 	 * The madvise semantics require that the address range be fully
16750 	 * allocated with no holes.  Otherwise, we're required to return
16751 	 * an error.
16752 	 */
16753 
16754 	if (!vm_map_range_check(map, start, end, &entry)) {
16755 		vm_map_unlock_read(map);
16756 		vm_page_stats_reusable.reuse_pages_failure++;
16757 		return KERN_INVALID_ADDRESS;
16758 	}
16759 
16760 	/*
16761 	 * Examine each vm_map_entry_t in the range.
16762 	 */
16763 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16764 	    entry = entry->vme_next) {
16765 		/*
16766 		 * Sanity check on the VM map entry.
16767 		 */
16768 		if (!vm_map_entry_is_reusable(entry)) {
16769 			vm_map_unlock_read(map);
16770 			vm_page_stats_reusable.reuse_pages_failure++;
16771 			return KERN_INVALID_ADDRESS;
16772 		}
16773 
16774 		/*
16775 		 * The first time through, the start address could be anywhere
16776 		 * within the vm_map_entry we found.  So adjust the offset to
16777 		 * correspond.
16778 		 */
16779 		if (entry->vme_start < start) {
16780 			start_offset = start - entry->vme_start;
16781 		} else {
16782 			start_offset = 0;
16783 		}
16784 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16785 		start_offset += VME_OFFSET(entry);
16786 		end_offset += VME_OFFSET(entry);
16787 
16788 		object = VME_OBJECT(entry);
16789 		if (object != VM_OBJECT_NULL) {
16790 			vm_object_lock(object);
16791 			vm_object_reuse_pages(object, start_offset, end_offset,
16792 			    TRUE);
16793 			vm_object_unlock(object);
16794 		}
16795 
16796 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16797 			/*
16798 			 * XXX
16799 			 * We do not hold the VM map exclusively here.
16800 			 * The "alias" field is not that critical, so it's
16801 			 * safe to update it here, as long as it is the only
16802 			 * one that can be modified while holding the VM map
16803 			 * "shared".
16804 			 */
16805 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16806 		}
16807 	}
16808 
16809 	vm_map_unlock_read(map);
16810 	vm_page_stats_reusable.reuse_pages_success++;
16811 	return KERN_SUCCESS;
16812 }
16813 
16814 
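/*
 * Handle the "reusable" advice for [start, end): deactivate the pages of
 * each entry's backing VM object (discarding their contents when it is
 * safe to do so) and retag MALLOC_LARGE / MALLOC_LARGE_REUSED entries as
 * MALLOC_LARGE_REUSABLE.  The range must be writable, fully allocated and
 * every entry must pass vm_map_entry_is_reusable().
 */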
16815 static kern_return_t
16816 vm_map_reusable_pages(
16817 	vm_map_t        map,
16818 	vm_map_offset_t start,
16819 	vm_map_offset_t end)
16820 {
16821 	vm_map_entry_t                  entry;
16822 	vm_object_t                     object;
16823 	vm_object_offset_t              start_offset, end_offset;
16824 	vm_map_offset_t                 pmap_offset;
16825 
16826 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16827 		/*
16828 		 * XXX TODO4K
16829 		 * need to figure out what reusable means for a portion
16830 		 * of a native page.
16831 		 */
16832 		return KERN_SUCCESS;
16833 	}
16834 
16835 	/*
16836 	 * The MADV_REUSABLE operation doesn't require any changes to the
16837 	 * vm_map_entry_t's, so the read lock is sufficient.
16838 	 */
16839 
16840 	vm_map_lock_read(map);
16841 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16842 
16843 	/*
16844 	 * The madvise semantics require that the address range be fully
16845 	 * allocated with no holes.  Otherwise, we're required to return
16846 	 * an error.
16847 	 */
16848 
16849 	if (!vm_map_range_check(map, start, end, &entry)) {
16850 		vm_map_unlock_read(map);
16851 		vm_page_stats_reusable.reusable_pages_failure++;
16852 		return KERN_INVALID_ADDRESS;
16853 	}
16854 
16855 	/*
16856 	 * Examine each vm_map_entry_t in the range.
16857 	 */
16858 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16859 	    entry = entry->vme_next) {
16860 		int kill_pages = 0;
16861 		boolean_t reusable_no_write = FALSE;
16862 
16863 		/*
16864 		 * Sanity check on the VM map entry.
16865 		 */
16866 		if (!vm_map_entry_is_reusable(entry)) {
16867 			vm_map_unlock_read(map);
16868 			vm_page_stats_reusable.reusable_pages_failure++;
16869 			return KERN_INVALID_ADDRESS;
16870 		}
16871 
16872 		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16873 #if __arm64e__
16874 		    && !entry->used_for_tpro
16875 #endif
16876 		    ) {
16877 			/* not writable: can't discard contents */
16878 			vm_map_unlock_read(map);
16879 			vm_page_stats_reusable.reusable_nonwritable++;
16880 			vm_page_stats_reusable.reusable_pages_failure++;
16881 			return KERN_PROTECTION_FAILURE;
16882 		}
16883 
16884 		/*
16885 		 * The first time through, the start address could be anywhere
16886 		 * within the vm_map_entry we found.  So adjust the offset to
16887 		 * correspond.
16888 		 */
16889 		if (entry->vme_start < start) {
16890 			start_offset = start - entry->vme_start;
16891 			pmap_offset = start;
16892 		} else {
16893 			start_offset = 0;
16894 			pmap_offset = entry->vme_start;
16895 		}
16896 		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16897 		start_offset += VME_OFFSET(entry);
16898 		end_offset += VME_OFFSET(entry);
16899 
16900 		object = VME_OBJECT(entry);
16901 		if (object == VM_OBJECT_NULL) {
16902 			continue;
16903 		}
16904 
16905 		if (entry->protection & VM_PROT_EXECUTE) {
16906 			/*
16907 			 * Executable mappings might be write-protected by
16908 			 * hardware, so do not attempt to write to these pages.
16909 			 */
16910 			reusable_no_write = TRUE;
16911 		}
16912 
16913 		if (entry->vme_xnu_user_debug) {
16914 			/*
16915 			 * User debug pages might be write-protected by hardware,
16916 			 * so do not attempt to write to these pages.
16917 			 */
16918 			reusable_no_write = TRUE;
16919 		}
16920 
16921 		vm_object_lock(object);
16922 		if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
16923 		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16924 		    object->vo_copy == VM_OBJECT_NULL)) &&
16925 		    object->shadow == VM_OBJECT_NULL &&
16926 		    /*
16927 		     * "iokit_acct" entries are billed for their virtual size
16928 		     * (rather than for their resident pages only), so they
16929 		     * wouldn't benefit from making pages reusable, and it
16930 		     * would be hard to keep track of pages that are both
16931 		     * "iokit_acct" and "reusable" in the pmap stats and
16932 		     * ledgers.
16933 		     */
16934 		    !(entry->iokit_acct ||
16935 		    (!entry->is_sub_map && !entry->use_pmap))) {
16936 			if (os_ref_get_count_raw(&object->ref_count) != 1) {
16937 				vm_page_stats_reusable.reusable_shared++;
16938 			}
16939 			kill_pages = 1;
16940 		} else {
16941 			kill_pages = -1;
16942 		}
16943 		if (kill_pages != -1) {
16944 			vm_object_deactivate_pages(object,
16945 			    start_offset,
16946 			    end_offset - start_offset,
16947 			    kill_pages,
16948 			    TRUE /*reusable_pages*/,
16949 			    reusable_no_write,
16950 			    map->pmap,
16951 			    pmap_offset);
16952 		} else {
16953 			vm_page_stats_reusable.reusable_pages_shared++;
16954 			DTRACE_VM4(vm_map_reusable_pages_shared,
16955 			    unsigned int, VME_ALIAS(entry),
16956 			    vm_map_t, map,
16957 			    vm_map_entry_t, entry,
16958 			    vm_object_t, object);
16959 		}
16960 		vm_object_unlock(object);
16961 
16962 		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16963 		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16964 			/*
16965 			 * XXX
16966 			 * We do not hold the VM map exclusively here.
16967 			 * The "alias" field is not that critical, so it's
16968 			 * safe to update it here, as long as it is the only
16969 			 * one that can be modified while holding the VM map
16970 			 * "shared".
16971 			 */
16972 			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16973 		}
16974 	}
16975 
16976 	vm_map_unlock_read(map);
16977 	vm_page_stats_reusable.reusable_pages_success++;
16978 	return KERN_SUCCESS;
16979 }
16980 
16981 
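/*
 * Handle the "can reuse" check for [start, end): verify that the range is
 * fully allocated and that every entry passes vm_map_entry_is_reusable(),
 * without modifying anything.
 */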
16982 static kern_return_t
16983 vm_map_can_reuse(
16984 	vm_map_t        map,
16985 	vm_map_offset_t start,
16986 	vm_map_offset_t end)
16987 {
16988 	vm_map_entry_t                  entry;
16989 
16990 	/*
16991 	 * The MADV_REUSABLE operation doesn't require any changes to the
16992 	 * vm_map_entry_t's, so the read lock is sufficient.
16993 	 */
16994 
16995 	vm_map_lock_read(map);
16996 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16997 
16998 	/*
16999 	 * The madvise semantics require that the address range be fully
17000 	 * allocated with no holes.  Otherwise, we're required to return
17001 	 * an error.
17002 	 */
17003 
17004 	if (!vm_map_range_check(map, start, end, &entry)) {
17005 		vm_map_unlock_read(map);
17006 		vm_page_stats_reusable.can_reuse_failure++;
17007 		return KERN_INVALID_ADDRESS;
17008 	}
17009 
17010 	/*
17011 	 * Examine each vm_map_entry_t in the range.
17012 	 */
17013 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17014 	    entry = entry->vme_next) {
17015 		/*
17016 		 * Sanity check on the VM map entry.
17017 		 */
17018 		if (!vm_map_entry_is_reusable(entry)) {
17019 			vm_map_unlock_read(map);
17020 			vm_page_stats_reusable.can_reuse_failure++;
17021 			return KERN_INVALID_ADDRESS;
17022 		}
17023 	}
17024 
17025 	vm_map_unlock_read(map);
17026 	vm_page_stats_reusable.can_reuse_success++;
17027 	return KERN_SUCCESS;
17028 }
17029 
17030 
17031 #if MACH_ASSERT
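/*
 * Debug-only (MACH_ASSERT) "pageout" advice: push the internal VM objects
 * backing every entry in [start, end) to vm_object_pageout(), descending
 * one level into submaps and skipping entries with no internal object.
 */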
17032 static kern_return_t
17033 vm_map_pageout(
17034 	vm_map_t        map,
17035 	vm_map_offset_t start,
17036 	vm_map_offset_t end)
17037 {
17038 	vm_map_entry_t                  entry;
17039 
17040 	/*
17041 	 * The MADV_PAGEOUT operation doesn't require any changes to the
17042 	 * vm_map_entry_t's, so the read lock is sufficient.
17043 	 */
17044 
17045 	vm_map_lock_read(map);
17046 
17047 	/*
17048 	 * The madvise semantics require that the address range be fully
17049 	 * allocated with no holes.  Otherwise, we're required to return
17050 	 * an error.
17051 	 */
17052 
17053 	if (!vm_map_range_check(map, start, end, &entry)) {
17054 		vm_map_unlock_read(map);
17055 		return KERN_INVALID_ADDRESS;
17056 	}
17057 
17058 	/*
17059 	 * Examine each vm_map_entry_t in the range.
17060 	 */
17061 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17062 	    entry = entry->vme_next) {
17063 		vm_object_t     object;
17064 
17065 		/*
17066 		 * Sanity check on the VM map entry.
17067 		 */
17068 		if (entry->is_sub_map) {
17069 			vm_map_t submap;
17070 			vm_map_offset_t submap_start;
17071 			vm_map_offset_t submap_end;
17072 			vm_map_entry_t submap_entry;
17073 
17074 			submap = VME_SUBMAP(entry);
17075 			submap_start = VME_OFFSET(entry);
17076 			submap_end = submap_start + (entry->vme_end -
17077 			    entry->vme_start);
17078 
17079 			vm_map_lock_read(submap);
17080 
17081 			if (!vm_map_range_check(submap,
17082 			    submap_start,
17083 			    submap_end,
17084 			    &submap_entry)) {
17085 				vm_map_unlock_read(submap);
17086 				vm_map_unlock_read(map);
17087 				return KERN_INVALID_ADDRESS;
17088 			}
17089 
17090 			if (submap_entry->is_sub_map) {
17091 				vm_map_unlock_read(submap);
17092 				continue;
17093 			}
17094 
17095 			object = VME_OBJECT(submap_entry);
17096 			if (object == VM_OBJECT_NULL || !object->internal) {
17097 				vm_map_unlock_read(submap);
17098 				continue;
17099 			}
17100 
17101 			vm_object_pageout(object);
17102 
17103 			vm_map_unlock_read(submap);
17104 			submap = VM_MAP_NULL;
17105 			submap_entry = VM_MAP_ENTRY_NULL;
17106 			continue;
17107 		}
17108 
17109 		object = VME_OBJECT(entry);
17110 		if (object == VM_OBJECT_NULL || !object->internal) {
17111 			continue;
17112 		}
17113 
17114 		vm_object_pageout(object);
17115 	}
17116 
17117 	vm_map_unlock_read(map);
17118 	return KERN_SUCCESS;
17119 }
17120 #endif /* MACH_ASSERT */
17121 
17122 /*
17123  * This function determines if the zero operation can be run on the
17124  * respective entry. Additional checks on the object are in
17125  * vm_object_zero_preflight.
17126  */
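/*
 * Returns KERN_PROTECTION_FAILURE for entries that are not plain writable
 * data (non-writable, executable, JIT or user-debug mappings) and
 * KERN_NO_ACCESS for copy-on-write or submap entries.
 */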
17127 static kern_return_t
17128 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17129 {
17130 	/*
17131 	 * Zeroing is restricted to writable non-executable entries and non-JIT
17132 	 * regions.
17133 	 */
17134 	if (!(entry->protection & VM_PROT_WRITE) ||
17135 	    (entry->protection & VM_PROT_EXECUTE) ||
17136 	    entry->used_for_jit ||
17137 	    entry->vme_xnu_user_debug) {
17138 		return KERN_PROTECTION_FAILURE;
17139 	}
17140 
17141 	/*
17142 	 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17143 	 * allowed for submaps.
17144 	 */
17145 	if (entry->needs_copy || entry->is_sub_map) {
17146 		return KERN_NO_ACCESS;
17147 	}
17148 
17149 	return KERN_SUCCESS;
17150 }
17151 
17152 /*
17153  * This function translates entry's start and end to offsets in the object
17154  */
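/*
 * Worked example (illustrative values only): for an entry covering
 * [0x1000, 0x5000) with VME_OFFSET() == 0x2000, a request for
 * [0x1800, 0x3000) yields *start_offset == 0x2800 and
 * *end_offset == 0x4000.
 */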
17155 static void
17156 vm_map_get_bounds_in_object(
17157 	vm_map_entry_t      entry,
17158 	vm_map_offset_t     start,
17159 	vm_map_offset_t     end,
17160 	vm_map_offset_t    *start_offset,
17161 	vm_map_offset_t    *end_offset)
17162 {
17163 	if (entry->vme_start < start) {
17164 		*start_offset = start - entry->vme_start;
17165 	} else {
17166 		*start_offset = 0;
17167 	}
17168 	*end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17169 	*start_offset += VME_OFFSET(entry);
17170 	*end_offset += VME_OFFSET(entry);
17171 }
17172 
17173 /*
17174  * This function iterates through the entries in the requested range
17175  * and zeroes any resident pages in the corresponding objects. Compressed
17176  * pages are dropped instead of being faulted in and zeroed.
17177  */
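/*
 * Hypothetical usage sketch (the actual callers live elsewhere): a
 * zero-advice handler would be expected to call something like
 *
 *	kr = vm_map_zero(map,
 *	    vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
 *	    vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
 *
 * and fall back to zeroing the range explicitly if KERN_NO_ACCESS is
 * returned (for maps whose page size is smaller than PAGE_SIZE).
 */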
17178 static kern_return_t
17179 vm_map_zero(
17180 	vm_map_t        map,
17181 	vm_map_offset_t start,
17182 	vm_map_offset_t end)
17183 {
17184 	vm_map_entry_t                  entry;
17185 	vm_map_offset_t                 cur = start;
17186 	kern_return_t                   ret;
17187 
17188 	/*
17189 	 * This operation isn't supported where the map page size is less than
17190 	 * the hardware page size. Caller will need to handle error and
17191 	 * explicitly zero memory if needed.
17192 	 */
17193 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17194 		return KERN_NO_ACCESS;
17195 	}
17196 
17197 	/*
17198 	 * The MADV_ZERO operation doesn't require any changes to the
17199 	 * vm_map_entry_t's, so the read lock is sufficient.
17200 	 */
17201 	vm_map_lock_read(map);
17202 	assert(map->pmap != kernel_pmap);       /* protect alias access */
17203 
17204 	/*
17205 	 * The madvise semantics require that the address range be fully
17206 	 * allocated with no holes. Otherwise, we're required to return
17207 	 * an error. This check needs to be redone if the map has changed.
17208 	 */
17209 	if (!vm_map_range_check(map, cur, end, &entry)) {
17210 		vm_map_unlock_read(map);
17211 		return KERN_INVALID_ADDRESS;
17212 	}
17213 
17214 	/*
17215 	 * Examine each vm_map_entry_t in the range.
17216 	 */
17217 	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17218 		vm_map_offset_t cur_offset;
17219 		vm_map_offset_t end_offset;
17220 		unsigned int last_timestamp = map->timestamp;
17221 		vm_object_t object = VME_OBJECT(entry);
17222 
17223 		ret = vm_map_zero_entry_preflight(entry);
17224 		if (ret != KERN_SUCCESS) {
17225 			vm_map_unlock_read(map);
17226 			return ret;
17227 		}
17228 
17229 		if (object == VM_OBJECT_NULL) {
17230 			entry = entry->vme_next;
17231 			continue;
17232 		}
17233 
17234 		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17235 		vm_object_lock(object);
17236 		/*
17237 		 * Take a reference on the object as vm_object_zero will drop the object
17238 		 * lock when it encounters a busy page.
17239 		 */
17240 		vm_object_reference_locked(object);
17241 		vm_map_unlock_read(map);
17242 
17243 		ret = vm_object_zero(object, cur_offset, end_offset);
17244 		vm_object_unlock(object);
17245 		vm_object_deallocate(object);
17246 		if (ret != KERN_SUCCESS) {
17247 			return ret;
17248 		}
17249 		/*
17250 		 * Update cur as vm_object_zero has succeeded.
17251 		 */
17252 		cur += (end_offset - cur_offset);
17253 		if (cur == end) {
17254 			return KERN_SUCCESS;
17255 		}
17256 
17257 		/*
17258 		 * If the map timestamp has changed, restart by relooking up cur in the
17259 		 * map
17260 		 */
17261 		vm_map_lock_read(map);
17262 		if (last_timestamp != map->timestamp) {
17263 			/*
17264 			 * Relookup cur in the map
17265 			 */
17266 			if (!vm_map_range_check(map, cur, end, &entry)) {
17267 				vm_map_unlock_read(map);
17268 				return KERN_INVALID_ADDRESS;
17269 			}
17270 			continue;
17271 		}
17272 		/*
17273 		 * If the map hasn't changed proceed with the next entry
17274 		 */
17275 		entry = entry->vme_next;
17276 	}
17277 
17278 	vm_map_unlock_read(map);
17279 	return KERN_SUCCESS;
17280 }
17281 
17282 
17283 /*
17284  *	Routine:	vm_map_entry_insert
17285  *
17286  *	Description:	This routine inserts a new vm_entry in a locked map.
17287  */
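/*
 * The map must be locked exclusively.  The new entry is linked after
 * "insp_entry", the map's size and hints are updated, and the new entry
 * is returned.
 */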
17288 static vm_map_entry_t
17289 vm_map_entry_insert(
17290 	vm_map_t                map,
17291 	vm_map_entry_t          insp_entry,
17292 	vm_map_offset_t         start,
17293 	vm_map_offset_t         end,
17294 	vm_object_t             object,
17295 	vm_object_offset_t      offset,
17296 	vm_map_kernel_flags_t   vmk_flags,
17297 	boolean_t               needs_copy,
17298 	vm_prot_t               cur_protection,
17299 	vm_prot_t               max_protection,
17300 	vm_inherit_t            inheritance,
17301 	boolean_t               clear_map_aligned)
17302 {
17303 	vm_map_entry_t  new_entry;
17304 	boolean_t map_aligned = FALSE;
17305 
17306 	assert(insp_entry != (vm_map_entry_t)0);
17307 	vm_map_lock_assert_exclusive(map);
17308 
17309 	__assert_only vm_object_offset_t      end_offset = 0;
17310 	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17311 
17312 	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17313 		map_aligned = TRUE;
17314 	}
17315 	if (clear_map_aligned &&
17316 	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17317 	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17318 		map_aligned = FALSE;
17319 	}
17320 	if (map_aligned) {
17321 		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17322 		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17323 	} else {
17324 		assert(page_aligned(start));
17325 		assert(page_aligned(end));
17326 	}
17327 	assert(start < end);
17328 
17329 	new_entry = vm_map_entry_create(map);
17330 
17331 	new_entry->vme_start = start;
17332 	new_entry->vme_end = end;
17333 
17334 	if (vmk_flags.vmkf_submap) {
17335 		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17336 		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17337 	} else {
17338 		VME_OBJECT_SET(new_entry, object, false, 0);
17339 	}
17340 	VME_OFFSET_SET(new_entry, offset);
17341 	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17342 
17343 	new_entry->map_aligned = map_aligned;
17344 	new_entry->needs_copy = needs_copy;
17345 	new_entry->inheritance = inheritance;
17346 	new_entry->protection = cur_protection;
17347 	new_entry->max_protection = max_protection;
17348 	/*
17349 	 * submap: "use_pmap" means "nested".
17350 	 * default: false.
17351 	 *
17352 	 * object: "use_pmap" means "use pmap accounting" for footprint.
17353 	 * default: true.
17354 	 */
17355 	new_entry->use_pmap = !vmk_flags.vmkf_submap;
17356 	new_entry->no_cache = vmk_flags.vmf_no_cache;
17357 	new_entry->vme_permanent = vmk_flags.vmf_permanent;
17358 	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17359 	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17360 	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17361 
17362 	if (vmk_flags.vmkf_map_jit) {
17363 		if (!(map->jit_entry_exists) ||
17364 		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17365 			new_entry->used_for_jit = TRUE;
17366 			map->jit_entry_exists = TRUE;
17367 		}
17368 	}
17369 
17370 	/*
17371 	 *	Insert the new entry into the list.
17372 	 */
17373 
17374 	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17375 	map->size += end - start;
17376 
17377 	/*
17378 	 *	Update the free space hint and the lookup hint.
17379 	 */
17380 
17381 	SAVE_HINT_MAP_WRITE(map, new_entry);
17382 	return new_entry;
17383 }
17384 
17385 /*
17386  *	Routine:	vm_map_remap_extract
17387  *
17388  *	Description:	This routine returns a vm_entry list from a map.
17389  */
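/*
 * In legacy mode (vmkf_remap_legacy_mode), *cur_protection and
 * *max_protection are outputs: the strictest protections found across the
 * extracted range.  In vm_remap_new() mode they are inputs that each
 * extracted entry must already allow (only VM_PROT_READ in the max
 * protection is required for copy-on-write extractions), and the new
 * entries are created with exactly those protections.
 */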
17390 static kern_return_t
17391 vm_map_remap_extract(
17392 	vm_map_t                map,
17393 	vm_map_offset_t         addr,
17394 	vm_map_size_t           size,
17395 	boolean_t               copy,
17396 	vm_map_copy_t           map_copy,
17397 	vm_prot_t               *cur_protection,   /* IN/OUT */
17398 	vm_prot_t               *max_protection,   /* IN/OUT */
17399 	/* What, no behavior? */
17400 	vm_inherit_t            inheritance,
17401 	vm_map_kernel_flags_t   vmk_flags)
17402 {
17403 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17404 	kern_return_t           result;
17405 	vm_map_size_t           mapped_size;
17406 	vm_map_size_t           tmp_size;
17407 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17408 	vm_map_entry_t          new_entry;
17409 	vm_object_offset_t      offset;
17410 	vm_map_offset_t         map_address;
17411 	vm_map_offset_t         src_start;     /* start of entry to map */
17412 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17413 	vm_object_t             object;
17414 	vm_map_version_t        version;
17415 	boolean_t               src_needs_copy;
17416 	boolean_t               new_entry_needs_copy;
17417 	vm_map_entry_t          saved_src_entry;
17418 	boolean_t               src_entry_was_wired;
17419 	vm_prot_t               max_prot_for_prot_copy;
17420 	vm_map_offset_t         effective_page_mask;
17421 	bool                    pageable, same_map;
17422 	boolean_t               vm_remap_legacy;
17423 	vm_prot_t               required_cur_prot, required_max_prot;
17424 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17425 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17426 
17427 	pageable = vmk_flags.vmkf_copy_pageable;
17428 	same_map = vmk_flags.vmkf_copy_same_map;
17429 
17430 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17431 
17432 	assert(map != VM_MAP_NULL);
17433 	assert(size != 0);
17434 	assert(size == vm_map_round_page(size, effective_page_mask));
17435 	assert(inheritance == VM_INHERIT_NONE ||
17436 	    inheritance == VM_INHERIT_COPY ||
17437 	    inheritance == VM_INHERIT_SHARE);
17438 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17439 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17440 	assert((*cur_protection & *max_protection) == *cur_protection);
17441 
17442 	/*
17443 	 *	Compute start and end of region.
17444 	 */
17445 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17446 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17447 
17448 	/*
17449 	 *	Initialize map_header.
17450 	 */
17451 	map_header->nentries = 0;
17452 	map_header->entries_pageable = pageable;
17453 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17454 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17455 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17456 	vm_map_store_init(map_header);
17457 
17458 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17459 		/*
17460 		 * Special case for vm_map_protect(VM_PROT_COPY):
17461 		 * we want to set the new mappings' max protection to the
17462 		 * specified *max_protection...
17463 		 */
17464 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17465 		/* ... but we want to use the vm_remap() legacy mode */
17466 		vmk_flags.vmkf_remap_legacy_mode = true;
17467 		*max_protection = VM_PROT_NONE;
17468 		*cur_protection = VM_PROT_NONE;
17469 	} else {
17470 		max_prot_for_prot_copy = VM_PROT_NONE;
17471 	}
17472 
17473 	if (vmk_flags.vmkf_remap_legacy_mode) {
17474 		/*
17475 		 * vm_remap() legacy mode:
17476 		 * Extract all memory regions in the specified range and
17477 		 * collect the strictest set of protections allowed on the
17478 		 * entire range, so the caller knows what they can do with
17479 		 * the remapped range.
17480 		 * We start with VM_PROT_ALL and we'll remove the protections
17481 		 * missing from each memory region.
17482 		 */
17483 		vm_remap_legacy = TRUE;
17484 		*cur_protection = VM_PROT_ALL;
17485 		*max_protection = VM_PROT_ALL;
17486 		required_cur_prot = VM_PROT_NONE;
17487 		required_max_prot = VM_PROT_NONE;
17488 	} else {
17489 		/*
17490 		 * vm_remap_new() mode:
17491 		 * Extract all memory regions in the specified range and
17492 		 * ensure that they have at least the protections specified
17493 		 * by the caller via *cur_protection and *max_protection.
17494 		 * The resulting mapping should have these protections.
17495 		 */
17496 		vm_remap_legacy = FALSE;
17497 		if (copy) {
17498 			required_cur_prot = VM_PROT_NONE;
17499 			required_max_prot = VM_PROT_READ;
17500 		} else {
17501 			required_cur_prot = *cur_protection;
17502 			required_max_prot = *max_protection;
17503 		}
17504 	}
17505 
17506 	map_address = 0;
17507 	mapped_size = 0;
17508 	result = KERN_SUCCESS;
17509 
17510 	/*
17511 	 *	The specified source virtual space might correspond to
17512 	 *	multiple map entries, need to loop on them.
17513 	 */
17514 	vm_map_lock(map);
17515 
17516 	if (map->pmap == kernel_pmap) {
17517 		map_copy->is_kernel_range = true;
17518 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17519 #if CONFIG_MAP_RANGES
17520 	} else if (map->uses_user_ranges) {
17521 		map_copy->is_user_range = true;
17522 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17523 #endif /* CONFIG_MAP_RANGES */
17524 	}
17525 
17526 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17527 		/*
17528 		 * This address space uses sub-pages so the range might
17529 		 * not be re-mappable in an address space with larger
17530 		 * pages. Re-assemble any broken-up VM map entries to
17531 		 * improve our chances of making it work.
17532 		 */
17533 		vm_map_simplify_range(map, src_start, src_end);
17534 	}
17535 	while (mapped_size != size) {
17536 		vm_map_size_t   entry_size;
17537 
17538 		/*
17539 		 *	Find the beginning of the region.
17540 		 */
17541 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17542 			result = KERN_INVALID_ADDRESS;
17543 			break;
17544 		}
17545 
17546 		if (src_start < src_entry->vme_start ||
17547 		    (mapped_size && src_start != src_entry->vme_start)) {
17548 			result = KERN_INVALID_ADDRESS;
17549 			break;
17550 		}
17551 
17552 		tmp_size = size - mapped_size;
17553 		if (src_end > src_entry->vme_end) {
17554 			tmp_size -= (src_end - src_entry->vme_end);
17555 		}
17556 
17557 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17558 		    src_entry->vme_start);
17559 
17560 		if (src_entry->is_sub_map &&
17561 		    vmk_flags.vmkf_copy_single_object) {
17562 			vm_map_t submap;
17563 			vm_map_offset_t submap_start;
17564 			vm_map_size_t submap_size;
17565 			boolean_t submap_needs_copy;
17566 
17567 			/*
17568 			 * No check for "required protection" on "src_entry"
17569 			 * because the protections that matter are the ones
17570 			 * on the submap's VM map entry, which will be checked
17571 			 * during the call to vm_map_remap_extract() below.
17572 			 */
17573 			object = VM_OBJECT_NULL;
17574 
17575 			submap_size = src_entry->vme_end - src_start;
17576 			if (submap_size > size) {
17577 				submap_size = size;
17578 			}
17579 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17580 			submap = VME_SUBMAP(src_entry);
17581 			if (copy) {
17582 				/*
17583 				 * The caller wants a copy-on-write re-mapping,
17584 				 * so let's extract from the submap accordingly.
17585 				 */
17586 				submap_needs_copy = TRUE;
17587 			} else if (src_entry->needs_copy) {
17588 				/*
17589 				 * The caller wants a shared re-mapping but the
17590 				 * submap is mapped with "needs_copy", so its
17591 				 * contents can't be shared as is. Extract the
17592 				 * contents of the submap as "copy-on-write".
17593 				 * The re-mapping won't be shared with the
17594 				 * original mapping but this is equivalent to
17595 				 * what happened with the original "remap from
17596 				 * submap" code.
17597 				 * The shared region is mapped "needs_copy", for
17598 				 * example.
17599 				 */
17600 				submap_needs_copy = TRUE;
17601 			} else {
17602 				/*
17603 				 * The caller wants a shared re-mapping and
17604 				 * this mapping can be shared (no "needs_copy"),
17605 				 * so let's extract from the submap accordingly.
17606 				 * Kernel submaps are mapped without
17607 				 * "needs_copy", for example.
17608 				 */
17609 				submap_needs_copy = FALSE;
17610 			}
17611 			vm_map_reference(submap);
17612 			vm_map_unlock(map);
17613 			src_entry = NULL;
17614 			if (vm_remap_legacy) {
17615 				*cur_protection = VM_PROT_NONE;
17616 				*max_protection = VM_PROT_NONE;
17617 			}
17618 
17619 			DTRACE_VM7(remap_submap_recurse,
17620 			    vm_map_t, map,
17621 			    vm_map_offset_t, addr,
17622 			    vm_map_size_t, size,
17623 			    boolean_t, copy,
17624 			    vm_map_offset_t, submap_start,
17625 			    vm_map_size_t, submap_size,
17626 			    boolean_t, submap_needs_copy);
17627 
17628 			result = vm_map_remap_extract(submap,
17629 			    submap_start,
17630 			    submap_size,
17631 			    submap_needs_copy,
17632 			    map_copy,
17633 			    cur_protection,
17634 			    max_protection,
17635 			    inheritance,
17636 			    vmk_flags);
17637 			vm_map_deallocate(submap);
17638 
17639 			if (result == KERN_SUCCESS &&
17640 			    submap_needs_copy &&
17641 			    !copy) {
17642 				/*
17643 				 * We were asked for a "shared"
17644 				 * re-mapping but had to ask for a
17645 				 * "copy-on-write" remapping of the
17646 				 * submap's mapping to honor the
17647 				 * submap's "needs_copy".
17648 				 * We now need to resolve that
17649 				 * pending "copy-on-write" to
17650 				 * get something we can share.
17651 				 */
17652 				vm_map_entry_t copy_entry;
17653 				vm_object_offset_t copy_offset;
17654 				vm_map_size_t copy_size;
17655 				vm_object_t copy_object;
17656 				copy_entry = vm_map_copy_first_entry(map_copy);
17657 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17658 				copy_object = VME_OBJECT(copy_entry);
17659 				copy_offset = VME_OFFSET(copy_entry);
17660 				if (copy_object == VM_OBJECT_NULL) {
17661 					assert(copy_offset == 0);
17662 					assert(!copy_entry->needs_copy);
17663 					if (copy_entry->max_protection == VM_PROT_NONE) {
17664 						assert(copy_entry->protection == VM_PROT_NONE);
17665 						/* nothing to share */
17666 					} else {
17667 						assert(copy_offset == 0);
17668 						copy_object = vm_object_allocate(copy_size);
17669 						VME_OFFSET_SET(copy_entry, 0);
17670 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17671 						assert(copy_entry->use_pmap);
17672 					}
17673 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17674 					/* already shareable */
17675 					assert(!copy_entry->needs_copy);
17676 				} else if (copy_entry->needs_copy ||
17677 				    copy_object->shadowed ||
17678 				    (copy_object->internal &&
17679 				    !copy_object->true_share &&
17680 				    !copy_entry->is_shared &&
17681 				    copy_object->vo_size > copy_size)) {
17682 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17683 					assert(copy_entry->use_pmap);
17684 					if (copy_entry->needs_copy) {
17685 						/* already write-protected */
17686 					} else {
17687 						vm_prot_t prot;
17688 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17689 						vm_object_pmap_protect(copy_object,
17690 						    copy_offset,
17691 						    copy_size,
17692 						    PMAP_NULL,
17693 						    PAGE_SIZE,
17694 						    0,
17695 						    prot);
17696 					}
17697 					copy_entry->needs_copy = FALSE;
17698 				}
17699 				copy_object = VME_OBJECT(copy_entry);
17700 				copy_offset = VME_OFFSET(copy_entry);
17701 				if (copy_object &&
17702 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17703 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17704 					copy_object->true_share = TRUE;
17705 				}
17706 			}
17707 
17708 			return result;
17709 		}
17710 
17711 		if (src_entry->is_sub_map) {
17712 			/* protections for submap mapping are irrelevant here */
17713 		} else if (((src_entry->protection & required_cur_prot) !=
17714 		    required_cur_prot) ||
17715 		    ((src_entry->max_protection & required_max_prot) !=
17716 		    required_max_prot)) {
17717 			if (vmk_flags.vmkf_copy_single_object &&
17718 			    mapped_size != 0) {
17719 				/*
17720 				 * Single object extraction.
17721 				 * We can't extract more with the required
17722 				 * protection but we've extracted some, so
17723 				 * stop there and declare success.
17724 				 * The caller should check the size of
17725 				 * the copy entry we've extracted.
17726 				 */
17727 				result = KERN_SUCCESS;
17728 			} else {
17729 				/*
17730 				 * VM range extraction.
17731 				 * Required protection is not available
17732 				 * for this part of the range: fail.
17733 				 */
17734 				result = KERN_PROTECTION_FAILURE;
17735 			}
17736 			break;
17737 		}
17738 
17739 		if (src_entry->is_sub_map) {
17740 			vm_map_t submap;
17741 			vm_map_offset_t submap_start;
17742 			vm_map_size_t submap_size;
17743 			vm_map_copy_t submap_copy;
17744 			vm_prot_t submap_curprot, submap_maxprot;
17745 			boolean_t submap_needs_copy;
17746 
17747 			/*
17748 			 * No check for "required protection" on "src_entry"
17749 			 * because the protections that matter are the ones
17750 			 * on the submap's VM map entry, which will be checked
17751 			 * during the call to vm_map_copy_extract() below.
17752 			 */
17753 			object = VM_OBJECT_NULL;
17754 			submap_copy = VM_MAP_COPY_NULL;
17755 
17756 			/* find equivalent range in the submap */
17757 			submap = VME_SUBMAP(src_entry);
17758 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17759 			submap_size = tmp_size;
17760 			if (copy) {
17761 				/*
17762 				 * The caller wants a copy-on-write re-mapping,
17763 				 * so let's extract from the submap accordingly.
17764 				 */
17765 				submap_needs_copy = TRUE;
17766 			} else if (src_entry->needs_copy) {
17767 				/*
17768 				 * The caller wants a shared re-mapping but the
17769 				 * submap is mapped with "needs_copy", so its
17770 				 * contents can't be shared as is. Extract the
17771 				 * contents of the submap as "copy-on-write".
17772 				 * The re-mapping won't be shared with the
17773 				 * original mapping but this is equivalent to
17774 				 * what happened with the original "remap from
17775 				 * submap" code.
17776 				 * The shared region is mapped "needs_copy", for
17777 				 * example.
17778 				 */
17779 				submap_needs_copy = TRUE;
17780 			} else {
17781 				/*
17782 				 * The caller wants a shared re-mapping and
17783 				 * this mapping can be shared (no "needs_copy"),
17784 				 * so let's extract from the submap accordingly.
17785 				 * Kernel submaps are mapped without
17786 				 * "needs_copy", for example.
17787 				 */
17788 				submap_needs_copy = FALSE;
17789 			}
17790 			/* extra ref to keep submap alive */
17791 			vm_map_reference(submap);
17792 
17793 			DTRACE_VM7(remap_submap_recurse,
17794 			    vm_map_t, map,
17795 			    vm_map_offset_t, addr,
17796 			    vm_map_size_t, size,
17797 			    boolean_t, copy,
17798 			    vm_map_offset_t, submap_start,
17799 			    vm_map_size_t, submap_size,
17800 			    boolean_t, submap_needs_copy);
17801 
17802 			/*
17803 			 * The map can be safely unlocked since we
17804 			 * already hold a reference on the submap.
17805 			 *
17806 			 * No timestamp since we don't care if the map
17807 			 * gets modified while we're down in the submap.
17808 			 * We'll resume the extraction at src_start + tmp_size
17809 			 * anyway.
17810 			 */
17811 			vm_map_unlock(map);
17812 			src_entry = NULL; /* not valid once map is unlocked */
17813 
17814 			if (vm_remap_legacy) {
17815 				submap_curprot = VM_PROT_NONE;
17816 				submap_maxprot = VM_PROT_NONE;
17817 				if (max_prot_for_prot_copy) {
17818 					submap_maxprot = max_prot_for_prot_copy;
17819 				}
17820 			} else {
17821 				assert(!max_prot_for_prot_copy);
17822 				submap_curprot = *cur_protection;
17823 				submap_maxprot = *max_protection;
17824 			}
17825 			result = vm_map_copy_extract(submap,
17826 			    submap_start,
17827 			    submap_size,
17828 			    submap_needs_copy,
17829 			    &submap_copy,
17830 			    &submap_curprot,
17831 			    &submap_maxprot,
17832 			    inheritance,
17833 			    vmk_flags);
17834 
17835 			/* release extra ref on submap */
17836 			vm_map_deallocate(submap);
17837 			submap = VM_MAP_NULL;
17838 
17839 			if (result != KERN_SUCCESS) {
17840 				vm_map_lock(map);
17841 				break;
17842 			}
17843 
17844 			/* transfer submap_copy entries to map_header */
17845 			while (vm_map_copy_first_entry(submap_copy) !=
17846 			    vm_map_copy_to_entry(submap_copy)) {
17847 				vm_map_entry_t copy_entry;
17848 				vm_map_size_t copy_entry_size;
17849 
17850 				copy_entry = vm_map_copy_first_entry(submap_copy);
17851 
17852 				/*
17853 				 * Prevent kernel_object from being exposed to
17854 				 * user space.
17855 				 */
17856 				if (__improbable(copy_entry->vme_kernel_object)) {
17857 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17858 					    proc_selfpid(),
17859 					    (get_bsdtask_info(current_task())
17860 					    ? proc_name_address(get_bsdtask_info(current_task()))
17861 					    : "?"));
17862 					DTRACE_VM(extract_kernel_only);
17863 					result = KERN_INVALID_RIGHT;
17864 					vm_map_copy_discard(submap_copy);
17865 					submap_copy = VM_MAP_COPY_NULL;
17866 					vm_map_lock(map);
17867 					break;
17868 				}
17869 
17870 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17871 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17872 				copy_entry->vme_start = map_address;
17873 				copy_entry->vme_end = map_address + copy_entry_size;
17874 				map_address += copy_entry_size;
17875 				mapped_size += copy_entry_size;
17876 				src_start += copy_entry_size;
17877 				assert(src_start <= src_end);
17878 				_vm_map_store_entry_link(map_header,
17879 				    map_header->links.prev,
17880 				    copy_entry);
17881 			}
17882 			/* done with submap_copy */
17883 			vm_map_copy_discard(submap_copy);
17884 
17885 			if (vm_remap_legacy) {
17886 				*cur_protection &= submap_curprot;
17887 				*max_protection &= submap_maxprot;
17888 			}
17889 
17890 			/* re-acquire the map lock and continue to next entry */
17891 			vm_map_lock(map);
17892 			continue;
17893 		} else {
17894 			object = VME_OBJECT(src_entry);
17895 
17896 			/*
17897 			 * Prevent kernel_object from being exposed to
17898 			 * user space.
17899 			 */
17900 			if (__improbable(is_kernel_object(object))) {
17901 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17902 				    proc_selfpid(),
17903 				    (get_bsdtask_info(current_task())
17904 				    ? proc_name_address(get_bsdtask_info(current_task()))
17905 				    : "?"));
17906 				DTRACE_VM(extract_kernel_only);
17907 				result = KERN_INVALID_RIGHT;
17908 				break;
17909 			}
17910 
17911 			if (src_entry->iokit_acct) {
17912 				/*
17913 				 * This entry uses "IOKit accounting".
17914 				 */
17915 			} else if (object != VM_OBJECT_NULL &&
17916 			    object->internal &&
17917 			    (object->purgable != VM_PURGABLE_DENY ||
17918 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17919 				/*
17920 				 * Purgeable objects have their own accounting:
17921 				 * no pmap accounting for them.
17922 				 */
17923 				assertf(!src_entry->use_pmap,
17924 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17925 				    map,
17926 				    src_entry,
17927 				    (uint64_t)src_entry->vme_start,
17928 				    (uint64_t)src_entry->vme_end,
17929 				    src_entry->protection,
17930 				    src_entry->max_protection,
17931 				    VME_ALIAS(src_entry));
17932 			} else {
17933 				/*
17934 				 * Not IOKit or purgeable:
17935 				 * must be accounted by pmap stats.
17936 				 */
17937 				assertf(src_entry->use_pmap,
17938 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17939 				    map,
17940 				    src_entry,
17941 				    (uint64_t)src_entry->vme_start,
17942 				    (uint64_t)src_entry->vme_end,
17943 				    src_entry->protection,
17944 				    src_entry->max_protection,
17945 				    VME_ALIAS(src_entry));
17946 			}
17947 
17948 			if (object == VM_OBJECT_NULL) {
17949 				assert(!src_entry->needs_copy);
17950 				if (src_entry->max_protection == VM_PROT_NONE) {
17951 					assert(src_entry->protection == VM_PROT_NONE);
17952 					/*
17953 					 * No VM object and no permissions:
17954 					 * this must be a reserved range with
17955 					 * nothing to share or copy.
17956 					 * There could also be all sorts of
17957 					 * pmap shenanigans within that reserved
17958 					 * range, so let's just copy the map
17959 					 * entry as is to remap a similar
17960 					 * reserved range.
17961 					 */
17962 					offset = 0; /* no object => no offset */
17963 					goto copy_src_entry;
17964 				}
17965 				object = vm_object_allocate(entry_size);
17966 				VME_OFFSET_SET(src_entry, 0);
17967 				VME_OBJECT_SET(src_entry, object, false, 0);
17968 				assert(src_entry->use_pmap);
17969 				assert(!map->mapped_in_other_pmaps);
17970 			} else if (src_entry->wired_count ||
17971 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17972 				/*
17973 				 * A wired memory region should not have
17974 				 * any pending copy-on-write and needs to
17975 				 * keep pointing at the VM object that
17976 				 * contains the wired pages.
17977 				 * If we're sharing this memory (copy=false),
17978 				 * we'll share this VM object.
17979 				 * If we're copying this memory (copy=true),
17980 				 * we'll call vm_object_copy_slowly() below
17981 				 * and use the new VM object for the remapping.
17982 				 *
17983 				 * Or, we are already using an asymmetric
17984 				 * copy, and therefore we already have
17985 				 * the right object.
17986 				 */
17987 				assert(!src_entry->needs_copy);
17988 			} else if (src_entry->needs_copy || object->shadowed ||
17989 			    (object->internal && !object->true_share &&
17990 			    !src_entry->is_shared &&
17991 			    object->vo_size > entry_size)) {
17992 				bool is_writable;
17993 
17994 				VME_OBJECT_SHADOW(src_entry, entry_size,
17995 				    vm_map_always_shadow(map));
17996 				assert(src_entry->use_pmap);
17997 
17998 				is_writable = false;
17999 				if (src_entry->protection & VM_PROT_WRITE) {
18000 					is_writable = true;
18001 #if __arm64e__
18002 				} else if (src_entry->used_for_tpro) {
18003 					is_writable = true;
18004 #endif /* __arm64e__ */
18005 				}
18006 				if (!src_entry->needs_copy && is_writable) {
18007 					vm_prot_t prot;
18008 
18009 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18010 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18011 						    __FUNCTION__,
18012 						    map, map->pmap,
18013 						    src_entry,
18014 						    (uint64_t)src_entry->vme_start,
18015 						    (uint64_t)src_entry->vme_end,
18016 						    src_entry->protection);
18017 					}
18018 
18019 					prot = src_entry->protection & ~VM_PROT_WRITE;
18020 
18021 					if (override_nx(map,
18022 					    VME_ALIAS(src_entry))
18023 					    && prot) {
18024 						prot |= VM_PROT_EXECUTE;
18025 					}
18026 
18027 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18028 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18029 						    __FUNCTION__,
18030 						    map, map->pmap,
18031 						    src_entry,
18032 						    (uint64_t)src_entry->vme_start,
18033 						    (uint64_t)src_entry->vme_end,
18034 						    prot);
18035 					}
18036 
18037 					if (map->mapped_in_other_pmaps) {
18038 						vm_object_pmap_protect(
18039 							VME_OBJECT(src_entry),
18040 							VME_OFFSET(src_entry),
18041 							entry_size,
18042 							PMAP_NULL,
18043 							PAGE_SIZE,
18044 							src_entry->vme_start,
18045 							prot);
18046 #if MACH_ASSERT
18047 					} else if (__improbable(map->pmap == PMAP_NULL)) {
18048 						/*
18049 						 * Some VM tests (in vm_tests.c)
18050 						 * sometimes want to use a VM
18051 						 * map without a pmap.
18052 						 * Otherwise, this should never
18053 						 * happen.
18054 						 */
18055 						if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18056 							panic("null pmap");
18057 						}
18058 #endif /* MACH_ASSERT */
18059 					} else {
18060 						pmap_protect(vm_map_pmap(map),
18061 						    src_entry->vme_start,
18062 						    src_entry->vme_end,
18063 						    prot);
18064 					}
18065 				}
18066 
18067 				object = VME_OBJECT(src_entry);
18068 				src_entry->needs_copy = FALSE;
18069 			}
18070 
18071 
18072 			vm_object_lock(object);
18073 			vm_object_reference_locked(object); /* object ref. for new entry */
18074 			assert(!src_entry->needs_copy);
18075 			if (object->copy_strategy ==
18076 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
18077 				/*
18078 				 * If we want to share this object (copy==0),
18079 				 * it needs to be COPY_DELAY.
18080 				 * If we want to copy this object (copy==1),
18081 				 * we can't just set "needs_copy" on our side
18082 				 * and expect the other side to do the same
18083 				 * (symmetrically), so we can't let the object
18084 				 * stay COPY_SYMMETRIC.
18085 				 * So we always switch from COPY_SYMMETRIC to
18086 				 * COPY_DELAY.
18087 				 */
18088 				object->copy_strategy =
18089 				    MEMORY_OBJECT_COPY_DELAY;
18090 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18091 			}
18092 			vm_object_unlock(object);
18093 		}
18094 
18095 		offset = (VME_OFFSET(src_entry) +
18096 		    (src_start - src_entry->vme_start));
18097 
18098 copy_src_entry:
18099 		new_entry = _vm_map_entry_create(map_header);
18100 		vm_map_entry_copy(map, new_entry, src_entry);
18101 		if (new_entry->is_sub_map) {
18102 			/* clr address space specifics */
18103 			new_entry->use_pmap = FALSE;
18104 		} else if (copy) {
18105 			/*
18106 			 * We're dealing with a copy-on-write operation,
18107 			 * so the resulting mapping should not inherit the
18108 			 * original mapping's accounting settings.
18109 			 * "use_pmap" should be reset to its default (TRUE)
18110 			 * so that the new mapping gets accounted for in
18111 			 * the task's memory footprint.
18112 			 */
18113 			new_entry->use_pmap = TRUE;
18114 		}
18115 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
18116 		assert(!new_entry->iokit_acct);
18117 
18118 		new_entry->map_aligned = FALSE;
18119 
18120 		new_entry->vme_start = map_address;
18121 		new_entry->vme_end = map_address + tmp_size;
18122 		assert(new_entry->vme_start < new_entry->vme_end);
18123 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
18124 			/* security: keep "permanent" and "csm_associated" */
18125 			new_entry->vme_permanent = src_entry->vme_permanent;
18126 			new_entry->csm_associated = src_entry->csm_associated;
18127 			/*
18128 			 * Remapping for vm_map_protect(VM_PROT_COPY)
18129 			 * to convert a read-only mapping into a
18130 			 * copy-on-write version of itself but
18131 			 * with write access:
18132 			 * keep the original inheritance but let's not
18133 			 * add VM_PROT_WRITE to the max protection yet
18134 			 * since we want to do more security checks against
18135 			 * the target map.
18136 			 */
18137 			new_entry->inheritance = src_entry->inheritance;
18138 			new_entry->protection &= max_prot_for_prot_copy;
18139 
18140 #ifdef __arm64e__
18141 			/*
18142 			 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18143 			 * region to be explicitly writable without TPRO is only permitted
18144 			 * if TPRO enforcement has been overridden.
18145 			 *
18146 			 * In this case we ensure any entries reset the TPRO state
18147 			 * and we permit the region to be downgraded from permanent.
18148 			 */
18149 			if (new_entry->used_for_tpro) {
18150 				if (vmk_flags.vmkf_tpro_enforcement_override) {
18151 					new_entry->used_for_tpro = FALSE;
18152 					new_entry->vme_permanent = FALSE;
18153 				} else {
18154 					result = KERN_PROTECTION_FAILURE;
18155 					vm_object_deallocate(object);
18156 					vm_map_entry_dispose(new_entry);
18157 					new_entry = VM_MAP_ENTRY_NULL;
18158 					break;
18159 				}
18160 			}
18161 #endif
18162 		} else {
18163 			new_entry->inheritance = inheritance;
18164 			if (!vm_remap_legacy) {
18165 				new_entry->protection = *cur_protection;
18166 				new_entry->max_protection = *max_protection;
18167 			}
18168 		}
18169 
18170 		VME_OFFSET_SET(new_entry, offset);
18171 
18172 		/*
18173 		 * The new region has to be copied now if required.
18174 		 */
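		/*
		 * "RestartCopy" is re-entered below (goto RestartCopy) when the
		 * object copy returns KERN_MEMORY_RESTART_COPY and the map is
		 * found unchanged after re-locking; a new reference on "object"
		 * is taken before restarting.
		 */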
18175 RestartCopy:
18176 		if (!copy) {
18177 			if (src_entry->used_for_jit == TRUE) {
18178 				if (same_map) {
18179 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18180 					/*
18181 					 * Cannot allow an entry describing a JIT
18182 					 * region to be shared across address spaces.
18183 					 */
18184 					result = KERN_INVALID_ARGUMENT;
18185 					vm_object_deallocate(object);
18186 					vm_map_entry_dispose(new_entry);
18187 					new_entry = VM_MAP_ENTRY_NULL;
18188 					break;
18189 				}
18190 			}
18191 
18192 			if (!src_entry->is_sub_map &&
18193 			    VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18194 				/* no accessible memory; nothing to share */
18195 				assert(src_entry->protection == VM_PROT_NONE);
18196 				assert(src_entry->max_protection == VM_PROT_NONE);
18197 				src_entry->is_shared = FALSE;
18198 			} else {
18199 				src_entry->is_shared = TRUE;
18200 			}
18201 			if (!new_entry->is_sub_map &&
18202 			    VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18203 				/* no accessible memory; nothing to share */
18204 				assert(new_entry->protection == VM_PROT_NONE);
18205 				assert(new_entry->max_protection == VM_PROT_NONE);
18206 				new_entry->is_shared = FALSE;
18207 			} else {
18208 				new_entry->is_shared = TRUE;
18209 			}
18210 			if (!(new_entry->is_sub_map)) {
18211 				new_entry->needs_copy = FALSE;
18212 			}
18213 		} else if (src_entry->is_sub_map) {
18214 			/* make this a COW sub_map if not already */
18215 			assert(new_entry->wired_count == 0);
18216 			new_entry->needs_copy = TRUE;
18217 			object = VM_OBJECT_NULL;
18218 		} else if (src_entry->wired_count == 0 &&
18219 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18220 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
18221 		    VME_OFFSET(new_entry),
18222 		    (new_entry->vme_end -
18223 		    new_entry->vme_start),
18224 		    &src_needs_copy,
18225 		    &new_entry_needs_copy)) {
18226 			new_entry->needs_copy = new_entry_needs_copy;
18227 			new_entry->is_shared = FALSE;
18228 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18229 
18230 			/*
18231 			 * Handle copy_on_write semantics.
18232 			 */
18233 			if (src_needs_copy && !src_entry->needs_copy) {
18234 				vm_prot_t prot;
18235 
18236 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18237 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18238 					    __FUNCTION__,
18239 					    map, map->pmap, src_entry,
18240 					    (uint64_t)src_entry->vme_start,
18241 					    (uint64_t)src_entry->vme_end,
18242 					    src_entry->protection);
18243 				}
18244 
18245 				prot = src_entry->protection & ~VM_PROT_WRITE;
18246 
18247 				if (override_nx(map,
18248 				    VME_ALIAS(src_entry))
18249 				    && prot) {
18250 					prot |= VM_PROT_EXECUTE;
18251 				}
18252 
18253 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18254 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18255 					    __FUNCTION__,
18256 					    map, map->pmap, src_entry,
18257 					    (uint64_t)src_entry->vme_start,
18258 					    (uint64_t)src_entry->vme_end,
18259 					    prot);
18260 				}
18261 
18262 				vm_object_pmap_protect(object,
18263 				    offset,
18264 				    entry_size,
18265 				    ((src_entry->is_shared
18266 				    || map->mapped_in_other_pmaps) ?
18267 				    PMAP_NULL : map->pmap),
18268 				    VM_MAP_PAGE_SIZE(map),
18269 				    src_entry->vme_start,
18270 				    prot);
18271 
18272 				assert(src_entry->wired_count == 0);
18273 				src_entry->needs_copy = TRUE;
18274 			}
18275 			/*
18276 			 * Throw away the old object reference of the new entry.
18277 			 */
18278 			vm_object_deallocate(object);
18279 		} else {
18280 			new_entry->is_shared = FALSE;
18281 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18282 
18283 			src_entry_was_wired = (src_entry->wired_count > 0);
18284 			saved_src_entry = src_entry;
18285 			src_entry = VM_MAP_ENTRY_NULL;
18286 
18287 			/*
18288 			 * The map can be safely unlocked since we
18289 			 * already hold a reference on the object.
18290 			 *
18291 			 * Record the timestamp of the map for later
18292 			 * verification, and unlock the map.
18293 			 */
18294 			version.main_timestamp = map->timestamp;
18295 			vm_map_unlock(map);     /* Increments timestamp once! */
18296 
18297 			/*
18298 			 * Perform the copy.
18299 			 */
18300 			if (src_entry_was_wired > 0 ||
18301 			    (debug4k_no_cow_copyin &&
18302 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18303 				vm_object_lock(object);
18304 				result = vm_object_copy_slowly(
18305 					object,
18306 					offset,
18307 					(new_entry->vme_end -
18308 					new_entry->vme_start),
18309 					THREAD_UNINT,
18310 					&new_copy_object);
18311 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18312 				saved_used_for_jit = new_entry->used_for_jit;
18313 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18314 				new_entry->used_for_jit = saved_used_for_jit;
18315 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18316 				new_entry->needs_copy = FALSE;
18317 			} else {
18318 				vm_object_offset_t new_offset;
18319 
18320 				new_offset = VME_OFFSET(new_entry);
18321 				result = vm_object_copy_strategically(
18322 					object,
18323 					offset,
18324 					(new_entry->vme_end -
18325 					new_entry->vme_start),
18326 					false, /* forking */
18327 					&new_copy_object,
18328 					&new_offset,
18329 					&new_entry_needs_copy);
18330 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18331 				saved_used_for_jit = new_entry->used_for_jit;
18332 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18333 				new_entry->used_for_jit = saved_used_for_jit;
18334 				if (new_offset != VME_OFFSET(new_entry)) {
18335 					VME_OFFSET_SET(new_entry, new_offset);
18336 				}
18337 
18338 				new_entry->needs_copy = new_entry_needs_copy;
18339 			}
18340 
18341 			/*
18342 			 * Throw away the old object reference of the new entry.
18343 			 */
18344 			vm_object_deallocate(object);
18345 
18346 			if (result != KERN_SUCCESS &&
18347 			    result != KERN_MEMORY_RESTART_COPY) {
18348 				vm_map_entry_dispose(new_entry);
18349 				vm_map_lock(map);
18350 				break;
18351 			}
18352 
18353 			/*
18354 			 * Verify that the map has not substantially
18355 			 * changed while the copy was being made.
18356 			 */
18357 
18358 			vm_map_lock(map);
18359 			if (version.main_timestamp + 1 != map->timestamp) {
18360 				/*
18361 				 * Simple version comparison failed.
18362 				 *
18363 				 * Retry the lookup and verify that the
18364 				 * same object/offset are still present.
18365 				 */
18366 				saved_src_entry = VM_MAP_ENTRY_NULL;
18367 				vm_object_deallocate(VME_OBJECT(new_entry));
18368 				vm_map_entry_dispose(new_entry);
18369 				if (result == KERN_MEMORY_RESTART_COPY) {
18370 					result = KERN_SUCCESS;
18371 				}
18372 				continue;
18373 			}
18374 			/* map hasn't changed: src_entry is still valid */
18375 			src_entry = saved_src_entry;
18376 			saved_src_entry = VM_MAP_ENTRY_NULL;
18377 
18378 			if (result == KERN_MEMORY_RESTART_COPY) {
18379 				vm_object_reference(object);
18380 				goto RestartCopy;
18381 			}
18382 		}
18383 
18384 		_vm_map_store_entry_link(map_header,
18385 		    map_header->links.prev, new_entry);
18386 
18387 		/* protections for submap mapping are irrelevant here */
18388 		if (vm_remap_legacy && !src_entry->is_sub_map) {
18389 			*cur_protection &= src_entry->protection;
18390 			*max_protection &= src_entry->max_protection;
18391 		}
18392 
18393 		map_address += tmp_size;
18394 		mapped_size += tmp_size;
18395 		src_start += tmp_size;
18396 
18397 		if (vmk_flags.vmkf_copy_single_object) {
18398 			if (mapped_size != size) {
18399 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18400 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18401 				if (src_entry->vme_next != vm_map_to_entry(map) &&
18402 				    src_entry->vme_next->vme_object_value ==
18403 				    src_entry->vme_object_value) {
18404 					/* XXX TODO4K */
18405 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
18406 				}
18407 			}
18408 			break;
18409 		}
18410 	} /* end while */
18411 
18412 	vm_map_unlock(map);
18413 	if (result != KERN_SUCCESS) {
18414 		/*
18415 		 * Free all allocated elements.
18416 		 */
18417 		for (src_entry = map_header->links.next;
18418 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18419 		    src_entry = new_entry) {
18420 			new_entry = src_entry->vme_next;
18421 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18422 			if (src_entry->is_sub_map) {
18423 				vm_map_deallocate(VME_SUBMAP(src_entry));
18424 			} else {
18425 				vm_object_deallocate(VME_OBJECT(src_entry));
18426 			}
18427 			vm_map_entry_dispose(src_entry);
18428 		}
18429 	}
18430 	return result;
18431 }
18432 
18433 bool
18434 vm_map_is_exotic(
18435 	vm_map_t map)
18436 {
18437 	return VM_MAP_IS_EXOTIC(map);
18438 }
18439 
18440 bool
18441 vm_map_is_alien(
18442 	vm_map_t map)
18443 {
18444 	return VM_MAP_IS_ALIEN(map);
18445 }
18446 
18447 #if XNU_TARGET_OS_OSX
18448 void
18449 vm_map_mark_alien(
18450 	vm_map_t map)
18451 {
18452 	vm_map_lock(map);
18453 	map->is_alien = true;
18454 	vm_map_unlock(map);
18455 }
18456 
18457 void
18458 vm_map_single_jit(
18459 	vm_map_t map)
18460 {
18461 	vm_map_lock(map);
18462 	map->single_jit = true;
18463 	vm_map_unlock(map);
18464 }
18465 #endif /* XNU_TARGET_OS_OSX */
18466 
18467 
18468 
18469 /*
18470  * Callers of this function must call vm_map_copy_require on
18471  * previously created vm_map_copy_t or pass a newly created
18472  * one to ensure that it hasn't been forged.
18473  */
18474 static kern_return_t
18475 vm_map_copy_to_physcopy(
18476 	vm_map_copy_t   copy_map,
18477 	vm_map_t        target_map)
18478 {
18479 	vm_map_size_t           size;
18480 	vm_map_entry_t          entry;
18481 	vm_map_entry_t          new_entry;
18482 	vm_object_t             new_object;
18483 	unsigned int            pmap_flags;
18484 	pmap_t                  new_pmap;
18485 	vm_map_t                new_map;
18486 	vm_map_address_t        src_start, src_end, src_cur;
18487 	vm_map_address_t        dst_start, dst_end, dst_cur;
18488 	kern_return_t           kr;
18489 	void                    *kbuf;
18490 
18491 	/*
18492 	 * Perform the equivalent of vm_allocate() and memcpy().
18493 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18494 	 */
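	/*
	 * Outline:
	 * 1. create a temporary pmap and pageable VM map using "copy_map"'s
	 *    page size;
	 * 2. map "copy_map" and a freshly allocated VM object side by side
	 *    in that temporary map;
	 * 3. copy the contents page by page through a kernel buffer
	 *    (copyinmap() / copyoutmap());
	 * 4. replace the original entries of "copy_map" with a single entry
	 *    pointing at the new VM object, at "target_map"'s page size.
	 */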
18495 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18496 
18497 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18498 
18499 	/* create a new pmap to map "copy_map" */
18500 	pmap_flags = 0;
18501 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18502 #if PMAP_CREATE_FORCE_4K_PAGES
18503 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18504 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18505 	pmap_flags |= PMAP_CREATE_64BIT;
18506 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18507 	if (new_pmap == NULL) {
18508 		return KERN_RESOURCE_SHORTAGE;
18509 	}
18510 
18511 	/* allocate new VM object */
18512 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18513 	new_object = vm_object_allocate(size);
18514 	assert(new_object);
18515 
18516 	/* allocate new VM map entry */
18517 	new_entry = vm_map_copy_entry_create(copy_map);
18518 	assert(new_entry);
18519 
18520 	/* finish initializing new VM map entry */
18521 	new_entry->protection = VM_PROT_DEFAULT;
18522 	new_entry->max_protection = VM_PROT_DEFAULT;
18523 	new_entry->use_pmap = TRUE;
18524 
18525 	/* make new VM map entry point to new VM object */
18526 	new_entry->vme_start = 0;
18527 	new_entry->vme_end = size;
18528 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18529 	VME_OFFSET_SET(new_entry, 0);
18530 
18531 	/* create a new pageable VM map to map "copy_map" */
18532 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18533 	    VM_MAP_CREATE_PAGEABLE);
18534 	assert(new_map);
18535 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18536 
18537 	/* map "copy_map" in the new VM map */
18538 	src_start = 0;
18539 	kr = vm_map_copyout_internal(
18540 		new_map,
18541 		&src_start,
18542 		copy_map,
18543 		copy_map->size,
18544 		FALSE, /* consume_on_success */
18545 		VM_PROT_DEFAULT,
18546 		VM_PROT_DEFAULT,
18547 		VM_INHERIT_DEFAULT);
18548 	assert(kr == KERN_SUCCESS);
18549 	src_end = src_start + copy_map->size;
18550 
18551 	/* map "new_object" in the new VM map */
18552 	vm_object_reference(new_object);
18553 	dst_start = 0;
18554 	kr = vm_map_enter(new_map,
18555 	    &dst_start,
18556 	    size,
18557 	    0,               /* mask */
18558 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18559 	    new_object,
18560 	    0,               /* offset */
18561 	    FALSE,               /* needs copy */
18562 	    VM_PROT_DEFAULT,
18563 	    VM_PROT_DEFAULT,
18564 	    VM_INHERIT_DEFAULT);
18565 	assert(kr == KERN_SUCCESS);
18566 	dst_end = dst_start + size;
18567 
18568 	/* get a kernel buffer */
18569 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18570 
18571 	/* physically copy "copy_map" mappings to new VM object */
18572 	for (src_cur = src_start, dst_cur = dst_start;
18573 	    src_cur < src_end;
18574 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18575 		vm_size_t bytes;
18576 
18577 		bytes = PAGE_SIZE;
18578 		if (src_cur + PAGE_SIZE > src_end) {
18579 			/* partial copy for last page */
18580 			bytes = src_end - src_cur;
18581 			assert(bytes > 0 && bytes < PAGE_SIZE);
18582 			/* rest of dst page should be zero-filled */
18583 		}
18584 		/* get bytes from src mapping */
18585 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18586 		if (kr != KERN_SUCCESS) {
18587 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18588 		}
18589 		/* put bytes in dst mapping */
18590 		assert(dst_cur < dst_end);
18591 		assert(dst_cur + bytes <= dst_end);
18592 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18593 		if (kr != KERN_SUCCESS) {
18594 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18595 		}
18596 	}
18597 
18598 	/* free kernel buffer */
18599 	kfree_data(kbuf, PAGE_SIZE);
18600 
18601 	/* destroy new map */
18602 	vm_map_destroy(new_map);
18603 	new_map = VM_MAP_NULL;
18604 
18605 	/* dispose of the old map entries in "copy_map" */
18606 	while (vm_map_copy_first_entry(copy_map) !=
18607 	    vm_map_copy_to_entry(copy_map)) {
18608 		entry = vm_map_copy_first_entry(copy_map);
18609 		vm_map_copy_entry_unlink(copy_map, entry);
18610 		if (entry->is_sub_map) {
18611 			vm_map_deallocate(VME_SUBMAP(entry));
18612 		} else {
18613 			vm_object_deallocate(VME_OBJECT(entry));
18614 		}
18615 		vm_map_copy_entry_dispose(entry);
18616 	}
18617 
18618 	/* change "copy_map"'s page_size to match "target_map" */
18619 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18620 	copy_map->offset = 0;
18621 	copy_map->size = size;
18622 
18623 	/* insert new map entry in "copy_map" */
18624 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18625 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18626 
18627 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18628 	return KERN_SUCCESS;
18629 }
18630 
18631 void
18632 vm_map_copy_adjust_get_target_copy_map(
18633 	vm_map_copy_t   copy_map,
18634 	vm_map_copy_t   *target_copy_map_p);
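/*
 * Provide a vm_map_copy_t that can be modified without touching "copy_map".
 * If the caller already supplied one in "*target_copy_map_p", it is used as
 * is; otherwise "copy_map" is cloned (its entries are duplicated and extra
 * references are taken on their objects or submaps) and the clone is
 * returned in "*target_copy_map_p".
 */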
18635 void
18636 vm_map_copy_adjust_get_target_copy_map(
18637 	vm_map_copy_t   copy_map,
18638 	vm_map_copy_t   *target_copy_map_p)
18639 {
18640 	vm_map_copy_t   target_copy_map;
18641 	vm_map_entry_t  entry, target_entry;
18642 
18643 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18644 		/* the caller already has a "target_copy_map": use it */
18645 		return;
18646 	}
18647 
18648 	/* the caller wants us to create a new copy of "copy_map" */
18649 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18650 	target_copy_map = vm_map_copy_allocate(copy_map->type);
18651 	target_copy_map->offset = copy_map->offset;
18652 	target_copy_map->size = copy_map->size;
18653 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18654 	for (entry = vm_map_copy_first_entry(copy_map);
18655 	    entry != vm_map_copy_to_entry(copy_map);
18656 	    entry = entry->vme_next) {
18657 		target_entry = vm_map_copy_entry_create(target_copy_map);
18658 		vm_map_entry_copy_full(target_entry, entry);
18659 		if (target_entry->is_sub_map) {
18660 			vm_map_reference(VME_SUBMAP(target_entry));
18661 		} else {
18662 			vm_object_reference(VME_OBJECT(target_entry));
18663 		}
18664 		vm_map_copy_entry_link(
18665 			target_copy_map,
18666 			vm_map_copy_last_entry(target_copy_map),
18667 			target_entry);
18668 	}
18669 	entry = VM_MAP_ENTRY_NULL;
18670 	*target_copy_map_p = target_copy_map;
18671 }
18672 
18673 /*
18674  * Callers of this function must call vm_map_copy_require on
18675  * previously created vm_map_copy_t or pass a newly created
18676  * one to ensure that it hasn't been forged.
18677  */
18678 static void
18679 vm_map_copy_trim(
18680 	vm_map_copy_t   copy_map,
18681 	uint16_t        new_page_shift,
18682 	vm_map_offset_t trim_start,
18683 	vm_map_offset_t trim_end)
18684 {
18685 	uint16_t        copy_page_shift;
18686 	vm_map_entry_t  entry, next_entry;
18687 
18688 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18689 	assert(copy_map->cpy_hdr.nentries > 0);
18690 
18691 	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18692 	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18693 
18694 	/* use the new page_shift to do the clipping */
18695 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18696 	copy_map->cpy_hdr.page_shift = new_page_shift;
18697 
18698 	for (entry = vm_map_copy_first_entry(copy_map);
18699 	    entry != vm_map_copy_to_entry(copy_map);
18700 	    entry = next_entry) {
18701 		next_entry = entry->vme_next;
18702 		if (entry->vme_end <= trim_start) {
18703 			/* entry fully before trim range: skip */
18704 			continue;
18705 		}
18706 		if (entry->vme_start >= trim_end) {
18707 			/* entry fully after trim range: done */
18708 			break;
18709 		}
18710 		/* clip entry if needed */
18711 		vm_map_copy_clip_start(copy_map, entry, trim_start);
18712 		vm_map_copy_clip_end(copy_map, entry, trim_end);
18713 		/* dispose of entry */
18714 		copy_map->size -= entry->vme_end - entry->vme_start;
18715 		vm_map_copy_entry_unlink(copy_map, entry);
18716 		if (entry->is_sub_map) {
18717 			vm_map_deallocate(VME_SUBMAP(entry));
18718 		} else {
18719 			vm_object_deallocate(VME_OBJECT(entry));
18720 		}
18721 		vm_map_copy_entry_dispose(entry);
18722 		entry = VM_MAP_ENTRY_NULL;
18723 	}
18724 
18725 	/* restore copy_map's original page_shift */
18726 	copy_map->cpy_hdr.page_shift = copy_page_shift;
18727 }
18728 
18729 /*
18730  * Make any necessary adjustments to "copy_map" to allow it to be
18731  * mapped into "target_map".
18732  * If no changes were necessary, "target_copy_map" points to the
18733  * untouched "copy_map".
18734  * If changes are necessary, changes will be made to "target_copy_map".
18735  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18736  * copy the original "copy_map" to it before applying the changes.
18737  * The caller should discard "target_copy_map" if it's not the same as
18738  * the original "copy_map".
18739  */
18740 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
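/*
 * Callers below include vm_map_remap() and vm_map_range_physical_size(),
 * which rely on this adjustment when the copy's page size differs from the
 * target map's (e.g. a 4K copy_map being mapped into a 16K map).
 */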
18741 kern_return_t
18742 vm_map_copy_adjust_to_target(
18743 	vm_map_copy_t           src_copy_map,
18744 	vm_map_offset_ut        offset_u,
18745 	vm_map_size_ut          size_u,
18746 	vm_map_t                target_map,
18747 	boolean_t               copy,
18748 	vm_map_copy_t           *target_copy_map_p,
18749 	vm_map_offset_t         *overmap_start_p,
18750 	vm_map_offset_t         *overmap_end_p,
18751 	vm_map_offset_t         *trimmed_start_p)
18752 {
18753 	vm_map_copy_t           copy_map, target_copy_map;
18754 	vm_map_size_t           target_size;
18755 	vm_map_size_t           src_copy_map_size;
18756 	vm_map_size_t           overmap_start, overmap_end;
18757 	int                     misalignments;
18758 	vm_map_entry_t          entry, target_entry;
18759 	vm_map_offset_t         addr_adjustment;
18760 	vm_map_offset_t         new_start, new_end;
18761 	int                     copy_page_mask, target_page_mask;
18762 	uint16_t                copy_page_shift, target_page_shift;
18763 	vm_map_offset_t         trimmed_end;
18764 	vm_map_size_t           map_size;
18765 	kern_return_t           kr;
18766 
18767 	/*
18768 	 * Sanitize any input parameters that are addr/size/prot/inherit
18769 	 */
18770 	kr = vm_map_copy_addr_size_sanitize(
18771 		target_map,
18772 		offset_u,
18773 		size_u,
18774 		VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18775 		&new_start,
18776 		&new_end,
18777 		&map_size);
18778 	if (__improbable(kr != KERN_SUCCESS)) {
18779 		return vm_sanitize_get_kr(kr);
18780 	}
18781 
18782 	/*
18783 	 * Assert that the vm_map_copy is coming from the right
18784 	 * zone and hasn't been forged
18785 	 */
18786 	vm_map_copy_require(src_copy_map);
18787 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18788 
18789 	/*
18790 	 * Start working with "src_copy_map" but we'll switch
18791 	 * to "target_copy_map" as soon as we start making adjustments.
18792 	 */
18793 	copy_map = src_copy_map;
18794 	src_copy_map_size = src_copy_map->size;
18795 
18796 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18797 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18798 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18799 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18800 
18801 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18802 
18803 	target_copy_map = *target_copy_map_p;
18804 	if (target_copy_map != VM_MAP_COPY_NULL) {
18805 		vm_map_copy_require(target_copy_map);
18806 	}
18807 
18808 	if (new_end > copy_map->size) {
18809 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18810 		return KERN_INVALID_ARGUMENT;
18811 	}
18812 
18813 	/* trim the end */
18814 	trimmed_end = 0;
18815 	new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18816 	if (new_end < copy_map->size) {
18817 		trimmed_end = src_copy_map_size - new_end;
18818 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18819 		/* get "target_copy_map" if needed and adjust it */
18820 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18821 		    &target_copy_map);
18822 		copy_map = target_copy_map;
18823 		vm_map_copy_trim(target_copy_map, target_page_shift,
18824 		    new_end, copy_map->size);
18825 	}
18826 
18827 	/* trim the start */
18828 	new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18829 	if (new_start != 0) {
18830 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18831 		/* get "target_copy_map" if needed and adjust it */
18832 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18833 		    &target_copy_map);
18834 		copy_map = target_copy_map;
18835 		vm_map_copy_trim(target_copy_map, target_page_shift,
18836 		    0, new_start);
18837 	}
18838 	*trimmed_start_p = new_start;
18839 
18840 	/* target_size starts with what's left after trimming */
18841 	target_size = copy_map->size;
18842 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18843 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18844 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
18845 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18846 
18847 	/* check for misalignments but don't adjust yet */
18848 	misalignments = 0;
18849 	overmap_start = 0;
18850 	overmap_end = 0;
18851 	if (copy_page_shift < target_page_shift) {
18852 		/*
18853 		 * Remapping from 4K to 16K: check the VM object alignments
18854 		 * throughout the range.
18855 		 * If the start and end of the range are mis-aligned, we can
18856 		 * over-map to re-align, and adjust the "overmap" start/end
18857 		 * and "target_size" of the range accordingly.
18858 		 * If there is any mis-alignment within the range:
18859 		 *     if "copy":
18860 		 *         we can do immediate-copy instead of copy-on-write,
18861 		 *     else:
18862 		 *         no way to remap and share; fail.
18863 		 */
18864 		for (entry = vm_map_copy_first_entry(copy_map);
18865 		    entry != vm_map_copy_to_entry(copy_map);
18866 		    entry = entry->vme_next) {
18867 			vm_object_offset_t object_offset_start, object_offset_end;
18868 
18869 			object_offset_start = VME_OFFSET(entry);
18870 			object_offset_end = object_offset_start;
18871 			object_offset_end += entry->vme_end - entry->vme_start;
18872 			if (object_offset_start & target_page_mask) {
18873 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18874 					overmap_start++;
18875 				} else {
18876 					misalignments++;
18877 				}
18878 			}
18879 			if (object_offset_end & target_page_mask) {
18880 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18881 					overmap_end++;
18882 				} else {
18883 					misalignments++;
18884 				}
18885 			}
18886 		}
18887 	}
18888 	entry = VM_MAP_ENTRY_NULL;
18889 
18890 	/* decide how to deal with misalignments */
18891 	assert(overmap_start <= 1);
18892 	assert(overmap_end <= 1);
18893 	if (!overmap_start && !overmap_end && !misalignments) {
18894 		/* copy_map is properly aligned for target_map ... */
18895 		if (*trimmed_start_p) {
18896 			/* ... but we trimmed it, so still need to adjust */
18897 		} else {
18898 			/* ... and we didn't trim anything: we're done */
18899 			if (target_copy_map == VM_MAP_COPY_NULL) {
18900 				target_copy_map = copy_map;
18901 			}
18902 			*target_copy_map_p = target_copy_map;
18903 			*overmap_start_p = 0;
18904 			*overmap_end_p = 0;
18905 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18906 			return KERN_SUCCESS;
18907 		}
18908 	} else if (misalignments && !copy) {
18909 		/* can't "share" if misaligned */
18910 		DEBUG4K_ADJUST("unsupported sharing\n");
18911 #if MACH_ASSERT
18912 		if (debug4k_panic_on_misaligned_sharing) {
18913 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18914 		}
18915 #endif /* MACH_ASSERT */
18916 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18917 		return KERN_NOT_SUPPORTED;
18918 	} else {
18919 		/* can't virtual-copy if misaligned (but can physical-copy) */
18920 		DEBUG4K_ADJUST("mis-aligned copying\n");
18921 	}
18922 
18923 	/* get a "target_copy_map" if needed and switch to it */
18924 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18925 	copy_map = target_copy_map;
18926 
18927 	if (misalignments && copy) {
18928 		vm_map_size_t target_copy_map_size;
18929 
18930 		/*
18931 		 * Can't do copy-on-write with misaligned mappings.
18932 		 * Replace the mappings with a physical copy of the original
18933 		 * mappings' contents.
18934 		 */
18935 		target_copy_map_size = target_copy_map->size;
18936 		kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18937 		if (kr != KERN_SUCCESS) {
18938 			return kr;
18939 		}
18940 		*target_copy_map_p = target_copy_map;
18941 		*overmap_start_p = 0;
18942 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
18943 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18944 		return KERN_SUCCESS;
18945 	}
18946 
18947 	/* apply the adjustments */
18948 	misalignments = 0;
18949 	overmap_start = 0;
18950 	overmap_end = 0;
18951 	/* remove copy_map->offset, so that everything starts at offset 0 */
18952 	addr_adjustment = copy_map->offset;
18953 	/* also remove whatever we trimmed from the start */
18954 	addr_adjustment += *trimmed_start_p;
18955 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
18956 	    target_entry != vm_map_copy_to_entry(target_copy_map);
18957 	    target_entry = target_entry->vme_next) {
18958 		vm_object_offset_t object_offset_start, object_offset_end;
18959 
18960 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18961 		object_offset_start = VME_OFFSET(target_entry);
18962 		if (object_offset_start & target_page_mask) {
18963 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18964 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18965 				/*
18966 				 * start of 1st entry is mis-aligned:
18967 				 * re-adjust by over-mapping.
18968 				 */
18969 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18970 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18971 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18972 			} else {
18973 				misalignments++;
18974 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18975 				assert(copy);
18976 			}
18977 		}
18978 
18979 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18980 			target_size += overmap_start;
18981 		} else {
18982 			target_entry->vme_start += overmap_start;
18983 		}
18984 		target_entry->vme_end += overmap_start;
18985 
18986 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18987 		if (object_offset_end & target_page_mask) {
18988 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18989 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18990 				/*
18991 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
18992 				 */
18993 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18994 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18995 				target_entry->vme_end += overmap_end;
18996 				target_size += overmap_end;
18997 			} else {
18998 				misalignments++;
18999 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19000 				assert(copy);
19001 			}
19002 		}
19003 		target_entry->vme_start -= addr_adjustment;
19004 		target_entry->vme_end -= addr_adjustment;
19005 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19006 	}
19007 
19008 	target_copy_map->size = target_size;
19009 	target_copy_map->offset += overmap_start;
19010 	target_copy_map->offset -= addr_adjustment;
19011 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
19012 
19013 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19014 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19015 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19016 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19017 
19018 	*target_copy_map_p = target_copy_map;
19019 	*overmap_start_p = overmap_start;
19020 	*overmap_end_p = overmap_end;
19021 
19022 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19023 	return KERN_SUCCESS;
19024 }
19025 
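/*
 * vm_map_range_physical_size:
 * Return, in "*phys_size", the amount of address space the range
 * [start, start+size) of "map" occupies once rounded to the kernel's
 * native page size.  When "map" uses a smaller page size, the range is
 * typically extracted and adjusted against "kernel_map" to account for
 * any over-mapping needed at either end.
 */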
19026 kern_return_t
19027 vm_map_range_physical_size(
19028 	vm_map_t         map,
19029 	vm_map_address_t start,
19030 	mach_vm_size_t   size,
19031 	mach_vm_size_t * phys_size)
19032 {
19033 	kern_return_t   kr;
19034 	vm_map_copy_t   copy_map, target_copy_map;
19035 	vm_map_offset_t adjusted_start, adjusted_end;
19036 	vm_map_size_t   adjusted_size;
19037 	vm_prot_t       cur_prot, max_prot;
19038 	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19039 	vm_map_kernel_flags_t vmk_flags;
19040 
19041 	if (size == 0) {
19042 		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19043 		*phys_size = 0;
19044 		return KERN_SUCCESS;
19045 	}
19046 
19047 	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19048 	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19049 	if (__improbable(os_add_overflow(start, size, &end) ||
19050 	    adjusted_end <= adjusted_start)) {
19051 		/* wraparound */
19052 		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19053 		*phys_size = 0;
19054 		return KERN_INVALID_ARGUMENT;
19055 	}
19056 	if (__improbable(vm_map_range_overflows(map, start, size))) {
19057 		*phys_size = 0;
19058 		return KERN_INVALID_ADDRESS;
19059 	}
19060 	assert(adjusted_end > adjusted_start);
19061 	adjusted_size = adjusted_end - adjusted_start;
19062 	*phys_size = adjusted_size;
19063 	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19064 		return KERN_SUCCESS;
19065 	}
19066 	if (start == 0) {
19067 		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19068 		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19069 		if (__improbable(adjusted_end <= adjusted_start)) {
19070 			/* wraparound */
19071 			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19072 			*phys_size = 0;
19073 			return KERN_INVALID_ARGUMENT;
19074 		}
19075 		assert(adjusted_end > adjusted_start);
19076 		adjusted_size = adjusted_end - adjusted_start;
19077 		*phys_size = adjusted_size;
19078 		return KERN_SUCCESS;
19079 	}
19080 
19081 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19082 	vmk_flags.vmkf_copy_pageable = TRUE;
19083 	vmk_flags.vmkf_copy_same_map = TRUE;
19084 	assert(adjusted_size != 0);
19085 	cur_prot = VM_PROT_NONE; /* legacy mode */
19086 	max_prot = VM_PROT_NONE; /* legacy mode */
19087 	vmk_flags.vmkf_remap_legacy_mode = true;
19088 	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19089 	    FALSE /* copy */,
19090 	    &copy_map,
19091 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19092 	    vmk_flags);
19093 	if (kr != KERN_SUCCESS) {
19094 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19095 		//assert(0);
19096 		*phys_size = 0;
19097 		return kr;
19098 	}
19099 	assert(copy_map != VM_MAP_COPY_NULL);
19100 	target_copy_map = copy_map;
19101 	DEBUG4K_ADJUST("adjusting...\n");
19102 	kr = vm_map_copy_adjust_to_target(
19103 		copy_map,
19104 		start - adjusted_start, /* offset */
19105 		size, /* size */
19106 		kernel_map,
19107 		FALSE,                          /* copy */
19108 		&target_copy_map,
19109 		&overmap_start,
19110 		&overmap_end,
19111 		&trimmed_start);
19112 	if (kr == KERN_SUCCESS) {
19113 		if (target_copy_map->size != *phys_size) {
19114 			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19115 		}
19116 		*phys_size = target_copy_map->size;
19117 	} else {
19118 		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19119 		//assert(0);
19120 		*phys_size = 0;
19121 	}
19122 	vm_map_copy_discard(copy_map);
19123 	copy_map = VM_MAP_COPY_NULL;
19124 
19125 	return kr;
19126 }
19127 
19128 static __attribute__((always_inline, warn_unused_result))
19129 kern_return_t
19130 vm_map_remap_sanitize(
19131 	vm_map_t                src_map,
19132 	vm_map_t                target_map,
19133 	vm_map_address_ut       address_u,
19134 	vm_map_size_ut          size_u,
19135 	vm_map_offset_ut        mask_u,
19136 	vm_map_offset_ut        memory_address_u,
19137 	vm_prot_ut              cur_protection_u,
19138 	vm_prot_ut              max_protection_u,
19139 	vm_inherit_ut           inheritance_u,
19140 	vm_map_kernel_flags_t   vmk_flags,
19141 	vm_map_address_t       *target_addr,
19142 	vm_map_address_t       *mask,
19143 	vm_map_offset_t        *memory_address,
19144 	vm_map_offset_t        *memory_end,
19145 	vm_map_size_t          *memory_size,
19146 	vm_prot_t              *cur_protection,
19147 	vm_prot_t              *max_protection,
19148 	vm_inherit_t           *inheritance)
19149 {
19150 	kern_return_t           result;
19151 	vm_sanitize_flags_t     vm_sanitize_flags;
19152 
19153 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19154 	    inheritance);
19155 	if (__improbable(result != KERN_SUCCESS)) {
19156 		return result;
19157 	}
19158 
19159 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19160 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19161 	    cur_protection, max_protection);
19162 	if (__improbable(result != KERN_SUCCESS)) {
19163 		return result;
19164 	}
19165 
19166 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19167 	if (__improbable(result != KERN_SUCCESS)) {
19168 		return result;
19169 	}
19170 
19171 	/*
19172 	 * If the user is requesting that we return the address of the
19173 	 * first byte of the data (rather than the base of the page),
19174 	 * then we use different rounding semantics: specifically,
19175 	 * we assume that (memory_address, size) describes a region
19176 	 * all of whose pages we must cover, rather than a base to be truncated
19177 	 * down and a size to be added to that base.  So we figure out
19178 	 * the highest page that the requested region includes and make
19179 	 * sure that the size will cover it.
19180 	 *
19181 	 * The key example we're worried about is of the form:
19182 	 *
19183 	 *              memory_address = 0x1ff0, size = 0x20
19184 	 *
19185 	 * With the old semantics, we round down the memory_address to 0x1000
19186 	 * and round up the size to 0x1000, resulting in our covering *only*
19187 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
19188 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
19189 	 * 0x1000 and page 0x2000 in the region we remap.
19190 	 *
19191 	 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19192 	 */
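	/*
	 * Concretely, for the example above with 4K pages:
	 * old semantics: trunc(0x1ff0) = 0x1000, round(0x20) = 0x1000,
	 *   so only the page at 0x1000 is covered;
	 * new semantics: trunc(0x1ff0) = 0x1000, round(0x1ff0 + 0x20) = 0x3000,
	 *   so the size becomes 0x2000 and both pages 0x1000 and 0x2000
	 *   are covered.
	 */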
19193 	vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19194 	if (!vmk_flags.vmf_return_data_addr) {
19195 		vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19196 	}
19197 
19198 	result = vm_sanitize_addr_size(memory_address_u, size_u,
19199 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19200 	    vm_sanitize_flags, memory_address, memory_end,
19201 	    memory_size);
19202 	if (__improbable(result != KERN_SUCCESS)) {
19203 		return result;
19204 	}
19205 
19206 	*target_addr = vm_sanitize_addr(target_map, address_u);
19207 	return KERN_SUCCESS;
19208 }
19209 
19210 /*
19211  *	Routine:	vm_remap
19212  *
19213  *			Map a portion of a task's address space.
19214  *			The mapped region must not overlap more than
19215  *			one VM memory object. Protections and
19216  *			inheritance attributes remain the same
19217  *			as in the original task and are out parameters.
19218  *			Source and target tasks can be identical.
19219  *			Other attributes are the same as for vm_map().
19220  */
19221 kern_return_t
19222 vm_map_remap(
19223 	vm_map_t                target_map,
19224 	vm_map_address_ut      *address_u,
19225 	vm_map_size_ut          size_u,
19226 	vm_map_offset_ut        mask_u,
19227 	vm_map_kernel_flags_t   vmk_flags,
19228 	vm_map_t                src_map,
19229 	vm_map_offset_ut        memory_address_u,
19230 	boolean_t               copy,
19231 	vm_prot_ut             *cur_protection_u, /* IN/OUT */
19232 	vm_prot_ut             *max_protection_u, /* IN/OUT */
19233 	vm_inherit_ut           inheritance_u)
19234 {
19235 	vm_map_address_t        target_addr, mask;
19236 	vm_map_size_t           target_size;
19237 	vm_map_offset_t         memory_address, memory_end;
19238 	vm_map_size_t           memory_size;
19239 	vm_prot_t               cur_protection, max_protection;
19240 	vm_inherit_t            inheritance;
19241 	kern_return_t           result;
19242 	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
19243 	vm_map_copy_t           copy_map;
19244 	vm_map_offset_t         offset_in_mapping;
19245 	vm_map_size_t           src_page_mask, target_page_mask;
19246 	vm_map_size_t           initial_size;
19247 	VM_MAP_ZAP_DECLARE(zap_list);
19248 
19249 	if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19250 		return KERN_INVALID_ARGUMENT;
19251 	}
19252 	src_page_mask    = VM_MAP_PAGE_MASK(src_map);
19253 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
19254 
19255 	if (src_page_mask != target_page_mask) {
19256 		if (copy) {
19257 			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19258 		} else {
19259 			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19260 		}
19261 	}
19262 
19263 	/*
19264 	 * Sanitize any input parameters that are addr/size/prot/inherit
19265 	 */
19266 	result = vm_map_remap_sanitize(src_map,
19267 	    target_map,
19268 	    *address_u,
19269 	    size_u,
19270 	    mask_u,
19271 	    memory_address_u,
19272 	    *cur_protection_u,
19273 	    *max_protection_u,
19274 	    inheritance_u,
19275 	    vmk_flags,
19276 	    &target_addr,
19277 	    &mask,
19278 	    &memory_address,
19279 	    &memory_end,
19280 	    &memory_size,
19281 	    &cur_protection,
19282 	    &max_protection,
19283 	    &inheritance);
19284 	if (__improbable(result != KERN_SUCCESS)) {
19285 		return vm_sanitize_get_kr(result);
19286 	}
19287 
19288 	if (vmk_flags.vmf_return_data_addr) {
19289 		/*
19290 		 * This is safe to unwrap now that the quantities
19291 		 * have been validated and rounded up normally.
19292 		 */
19293 		offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19294 		    memory_address_u);
19295 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19296 	} else {
19297 		/*
19298 		 * IMPORTANT:
19299 		 * This legacy code path is broken: for the range mentioned
19300 		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19301 		 * two 4k pages, it yields [ memory_address = 0x1000,
19302 		 * size = 0x1000 ], which covers only the first 4k page.
19303 		 * BUT some code unfortunately depends on this bug, so we
19304 		 * can't fix it without breaking something.
19305 		 * New code is automatically opted into the new
19306 		 * behavior with the VM_FLAGS_RETURN_DATA_ADDR flag.
19307 		 */
19308 		offset_in_mapping = 0;
19309 		initial_size = memory_size;
19310 	}
19311 
19312 	if (vmk_flags.vmf_resilient_media) {
19313 		/* must be copy-on-write to be "media resilient" */
19314 		if (!copy) {
19315 			return KERN_INVALID_ARGUMENT;
19316 		}
19317 	}
19318 
19319 	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19320 	vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19321 
19322 	assert(memory_size != 0);
19323 	result = vm_map_copy_extract(src_map,
19324 	    memory_address,
19325 	    memory_size,
19326 	    copy, &copy_map,
19327 	    &cur_protection, /* IN/OUT */
19328 	    &max_protection, /* IN/OUT */
19329 	    inheritance,
19330 	    vmk_flags);
19331 	if (result != KERN_SUCCESS) {
19332 		return result;
19333 	}
19334 	assert(copy_map != VM_MAP_COPY_NULL);
19335 
19336 	/*
19337 	 * Handle the policy for vm map ranges
19338 	 *
19339 	 * If the maps differ, the target_map policy applies like for vm_map()
19340 	 * For same mapping remaps, we preserve the range.
19341 	 */
19342 	if (vmk_flags.vmkf_copy_same_map) {
19343 		vmk_flags.vmkf_range_id = copy_map->orig_range;
19344 	} else {
19345 		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19346 	}
19347 
19348 	target_size = memory_size;
19349 	if (src_page_mask != target_page_mask) {
19350 		vm_map_copy_t   target_copy_map;
19351 		vm_map_offset_t overmap_start = 0;
19352 		vm_map_offset_t overmap_end   = 0;
19353 		vm_map_offset_t trimmed_start = 0;
19354 
19355 		target_copy_map = copy_map; /* can modify "copy_map" itself */
19356 		DEBUG4K_ADJUST("adjusting...\n");
19357 		result = vm_map_copy_adjust_to_target(
19358 			copy_map,
19359 			offset_in_mapping, /* offset */
19360 			initial_size,
19361 			target_map,
19362 			copy,
19363 			&target_copy_map,
19364 			&overmap_start,
19365 			&overmap_end,
19366 			&trimmed_start);
19367 		if (result != KERN_SUCCESS) {
19368 			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19369 			vm_map_copy_discard(copy_map);
19370 			return result;
19371 		}
19372 		if (trimmed_start == 0) {
19373 			/* nothing trimmed: no adjustment needed */
19374 		} else if (trimmed_start >= offset_in_mapping) {
19375 			/* trimmed more than offset_in_mapping: nothing left */
19376 			assert(overmap_start == 0);
19377 			assert(overmap_end == 0);
19378 			offset_in_mapping = 0;
19379 		} else {
19380 			/* trimmed some of offset_in_mapping: adjust */
19381 			assert(overmap_start == 0);
19382 			assert(overmap_end == 0);
19383 			offset_in_mapping -= trimmed_start;
19384 		}
19385 		offset_in_mapping += overmap_start;
19386 		target_size = target_copy_map->size;
19387 	}
19388 
19389 	/*
19390 	 * Allocate/check a range of free virtual address
19391 	 * space for the target
19392 	 */
19393 	target_size = vm_map_round_page(target_size, target_page_mask);
19394 
19395 	if (target_size == 0) {
19396 		vm_map_copy_discard(copy_map);
19397 		return KERN_INVALID_ARGUMENT;
19398 	}
19399 
19400 	vm_map_lock(target_map);
19401 
19402 	if (!vmk_flags.vmf_fixed) {
19403 		result = vm_map_locate_space_anywhere(target_map, target_size,
19404 		    mask, vmk_flags, &target_addr, &insp_entry);
19405 	} else {
19406 		/*
19407 		 * vm_map_locate_space_fixed will reject overflowing
19408 		 * target_addr + target_size values
19409 		 */
19410 		result = vm_map_locate_space_fixed(target_map, target_addr,
19411 		    target_size, mask, vmk_flags, &insp_entry, &zap_list);
19412 
19413 		if (result == KERN_MEMORY_PRESENT) {
19414 			assert(!vmk_flags.vmkf_already);
19415 			insp_entry = VM_MAP_ENTRY_NULL;
19416 			result = KERN_NO_SPACE;
19417 		}
19418 	}
19419 
19420 	if (result == KERN_SUCCESS) {
19421 		while (vm_map_copy_first_entry(copy_map) !=
19422 		    vm_map_copy_to_entry(copy_map)) {
19423 			vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19424 
19425 			vm_map_copy_entry_unlink(copy_map, entry);
19426 
19427 			if (vmk_flags.vmkf_remap_prot_copy) {
19428 				/*
19429 				 * This vm_map_remap() is for a
19430 				 * vm_protect(VM_PROT_COPY), so the caller
19431 				 * expects to be allowed to add write access
19432 				 * to this new mapping.  This is done by
19433 				 * adding VM_PROT_WRITE to each entry's
19434 				 * max_protection... unless some security
19435 				 * settings disallow it.
19436 				 */
19437 				bool allow_write = false;
19438 				if (entry->vme_permanent) {
19439 					/* immutable mapping... */
19440 					if ((entry->max_protection & VM_PROT_EXECUTE) &&
19441 					    developer_mode_state()) {
19442 						/*
19443 						 * ... but executable and
19444 						 * possibly being debugged,
19445 						 * so let's allow it to become
19446 						 * writable, for breakpoints
19447 						 * and dtrace probes, for
19448 						 * example.
19449 						 */
19450 						allow_write = true;
19451 					} else {
19452 						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19453 						    proc_selfpid(),
19454 						    (get_bsdtask_info(current_task())
19455 						    ? proc_name_address(get_bsdtask_info(current_task()))
19456 						    : "?"),
19457 						    (uint64_t)memory_address,
19458 						    (uint64_t)memory_size,
19459 						    entry->protection,
19460 						    entry->max_protection,
19461 						    developer_mode_state());
19462 						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19463 						    vm_map_entry_t, entry,
19464 						    vm_map_offset_t, entry->vme_start,
19465 						    vm_map_offset_t, entry->vme_end,
19466 						    vm_prot_t, entry->protection,
19467 						    vm_prot_t, entry->max_protection,
19468 						    int, VME_ALIAS(entry));
19469 					}
19470 				} else {
19471 					allow_write = true;
19472 				}
19473 
19474 				/*
19475 				 * VM_PROT_COPY: allow this mapping to become
19476 				 * writable, unless it was "permanent".
19477 				 */
19478 				if (allow_write) {
19479 					entry->max_protection |= VM_PROT_WRITE;
19480 				}
19481 			}
19482 			if (vmk_flags.vmf_resilient_codesign) {
19483 				/* no codesigning -> read-only access */
19484 				entry->max_protection = VM_PROT_READ;
19485 				entry->protection = VM_PROT_READ;
19486 				entry->vme_resilient_codesign = TRUE;
19487 			}
19488 			entry->vme_start += target_addr;
19489 			entry->vme_end += target_addr;
19490 			assert(!entry->map_aligned);
19491 			if (vmk_flags.vmf_resilient_media &&
19492 			    !entry->is_sub_map &&
19493 			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19494 			    VME_OBJECT(entry)->internal)) {
19495 				entry->vme_resilient_media = TRUE;
19496 			}
19497 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19498 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19499 			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19500 			vm_map_store_entry_link(target_map, insp_entry, entry,
19501 			    vmk_flags);
19502 			insp_entry = entry;
19503 		}
19504 	}
19505 
19506 	if (vmk_flags.vmf_resilient_codesign) {
19507 		cur_protection = VM_PROT_READ;
19508 		max_protection = VM_PROT_READ;
19509 	}
19510 
19511 	if (result == KERN_SUCCESS) {
19512 		target_map->size += target_size;
19513 		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19514 	}
19515 	vm_map_unlock(target_map);
19516 
19517 	vm_map_zap_dispose(&zap_list);
19518 
19519 	if (result == KERN_SUCCESS && target_map->wiring_required) {
19520 		result = vm_map_wire_nested(target_map, target_addr,
19521 		    target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19522 		    TRUE, PMAP_NULL, 0, NULL);
19523 	}
19524 
19525 	if (result == KERN_SUCCESS) {
19526 #if KASAN
19527 		if (target_map->pmap == kernel_pmap) {
19528 			kasan_notify_address(target_addr, target_size);
19529 		}
19530 #endif
19531 		/*
19532 		 * If requested, return the address of the data pointed to by the
19533 		 * request, rather than the base of the resulting page.
19534 		 */
19535 		if (vmk_flags.vmf_return_data_addr) {
19536 			target_addr += offset_in_mapping;
19537 		}
19538 
19539 		/*
19540 		 * Update OUT parameters.
19541 		 */
19542 		*address_u = vm_sanitize_wrap_addr(target_addr);
19543 
19544 		*cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19545 		*max_protection_u = vm_sanitize_wrap_prot(max_protection);
19546 	}
19547 
19548 	if (src_page_mask != target_page_mask) {
19549 		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19550 	}
19551 	vm_map_copy_discard(copy_map);
19552 	copy_map = VM_MAP_COPY_NULL;
19553 
19554 	return result;
19555 }
19556 
19557 /*
19558  *	vm_map_switch:
19559  *
19560  *	Set the address map for the current thread to the specified map
19561  */
19562 
19563 vm_map_t
19564 vm_map_switch(
19565 	vm_map_t        map)
19566 {
19567 	thread_t        thread = current_thread();
19568 	vm_map_t        oldmap = thread->map;
19569 
19570 
19571 	/*
19572 	 *	Deactivate the current map and activate the requested map
19573 	 */
19574 	mp_disable_preemption();
19575 	PMAP_SWITCH_USER(thread, map, cpu_number());
19576 	mp_enable_preemption();
19577 	return oldmap;
19578 }
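/*
 * Informal usage note (not from the original source): vm_map_switch()
 * returns the previously active map so that a caller can temporarily
 * borrow the identity of another map and later restore its own:
 *
 *	vm_map_reference(map);
 *	oldmap = vm_map_switch(map);
 *	... run with "map" as the current address map ...
 *	vm_map_switch(oldmap);
 *	vm_map_deallocate(map);
 *
 * This is exactly the pattern used by vm_map_write_user() and
 * vm_map_read_user() below.
 */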
19579 
19580 static __attribute__((always_inline, warn_unused_result))
19581 kern_return_t
19582 vm_map_rw_user_sanitize(
19583 	vm_map_t                map,
19584 	vm_map_address_ut       addr_u,
19585 	vm_size_ut              size_u,
19586 	vm_sanitize_caller_t    vm_sanitize_caller,
19587 	vm_map_address_t       *addr,
19588 	vm_map_address_t       *end,
19589 	vm_map_size_t          *size)
19590 {
19591 	vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19592 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
19593 
19594 
19595 	return vm_sanitize_addr_size(addr_u, size_u,
19596 	           vm_sanitize_caller, map,
19597 	           flags,
19598 	           addr, end, size);
19599 }
19600 
19601 /*
19602  *	Routine:	vm_map_write_user
19603  *
19604  *	Description:
19605  *		Copy out data from a kernel space into space in the
19606  *		destination map. The space must already exist in the
19607  *		destination map.
19608  *		NOTE:  This routine should only be called by threads
19609  *		which can block on a page fault. i.e. kernel mode user
19610  *		threads.
19611  *
19612  */
19613 kern_return_t
19614 vm_map_write_user(
19615 	vm_map_t                map,
19616 	void                   *src_p,
19617 	vm_map_address_ut       dst_addr_u,
19618 	vm_size_ut              size_u)
19619 {
19620 	kern_return_t    kr;
19621 	vm_map_address_t dst_addr, dst_end;
19622 	vm_map_size_t    size;
19623 
19624 	/*
19625 	 * src_p isn't validated: [src_p, src_p + size_u)
19626 	 * is trusted kernel input.
19627 	 *
19628 	 * dst_addr_u and size_u are untrusted and need to be sanitized.
19629 	 */
19630 	kr = vm_map_rw_user_sanitize(map,
19631 	    dst_addr_u,
19632 	    size_u,
19633 	    VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19634 	    &dst_addr,
19635 	    &dst_end,
19636 	    &size);
19637 	if (__improbable(kr != KERN_SUCCESS)) {
19638 		return vm_sanitize_get_kr(kr);
19639 	}
19640 
19641 	if (current_map() == map) {
19642 		if (copyout(src_p, dst_addr, size)) {
19643 			kr = KERN_INVALID_ADDRESS;
19644 		}
19645 	} else {
19646 		vm_map_t        oldmap;
19647 
19648 		/* take on the identity of the target map while doing */
19649 		/* the transfer */
19650 
19651 		vm_map_reference(map);
19652 		oldmap = vm_map_switch(map);
19653 		if (copyout(src_p, dst_addr, size)) {
19654 			kr = KERN_INVALID_ADDRESS;
19655 		}
19656 		vm_map_switch(oldmap);
19657 		vm_map_deallocate(map);
19658 	}
19659 	return kr;
19660 }
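/*
 * Hedged usage sketch (hypothetical caller, not part of this file):
 * copy a kernel buffer out to a user address in a target task's map.
 * "task_map", "dst_addr_u" and "size_u" are assumed to arrive from the
 * trap/MIG boundary as untrusted values, with size_u covering the buffer.
 *
 *	uint8_t buf[64] = { 0 };
 *	kern_return_t kr;
 *
 *	kr = vm_map_write_user(task_map, buf, dst_addr_u, size_u);
 *	if (kr != KERN_SUCCESS) {
 *		// the destination range was invalid or not writable
 *	}
 */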
19661 
19662 /*
19663  *	Routine:	vm_map_read_user
19664  *
19665  *	Description:
19666  *		Copy in data from a user space source map into the
19667  *		kernel map. The space must already exist in the
19668  *		kernel map.
19669  *		NOTE:  This routine should only be called by threads
19670  *		which can block on a page fault, i.e. kernel-mode user
19671  *		threads.
19672  *
19673  */
19674 kern_return_t
19675 vm_map_read_user(
19676 	vm_map_t                map,
19677 	vm_map_address_ut       src_addr_u,
19678 	void                   *dst_p,
19679 	vm_size_ut              size_u)
19680 {
19681 	kern_return_t    kr;
19682 	vm_map_address_t src_addr, src_end;
19683 	vm_map_size_t    size;
19684 
19685 	/*
19686 	 * dst_p isn't validated: [dst_p, dst_p + size_u)
19687 	 * is trusted kernel input.
19688 	 *
19689 	 * src_addr_u and size_u are untrusted and need to be sanitized.
19690 	 */
19691 	kr = vm_map_rw_user_sanitize(map,
19692 	    src_addr_u,
19693 	    size_u,
19694 	    VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19695 	    &src_addr,
19696 	    &src_end,
19697 	    &size);
19698 	if (__improbable(kr != KERN_SUCCESS)) {
19699 		return vm_sanitize_get_kr(kr);
19700 	}
19701 
19702 	if (current_map() == map) {
19703 		if (copyin(src_addr, dst_p, size)) {
19704 			kr = KERN_INVALID_ADDRESS;
19705 		}
19706 	} else {
19707 		vm_map_t        oldmap;
19708 
19709 		/* take on the identity of the target map while doing */
19710 		/* the transfer */
19711 
19712 		vm_map_reference(map);
19713 		oldmap = vm_map_switch(map);
19714 		if (copyin(src_addr, dst_p, size)) {
19715 			kr = KERN_INVALID_ADDRESS;
19716 		}
19717 		vm_map_switch(oldmap);
19718 		vm_map_deallocate(map);
19719 	}
19720 	return kr;
19721 }
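/*
 * Hedged usage sketch (hypothetical caller): the mirror image of the
 * vm_map_write_user() example above, pulling user memory into a kernel
 * buffer.  "task_map", "src_addr_u" and "size_u" are again assumed to be
 * untrusted values from the trap/MIG boundary.
 *
 *	uint8_t buf[128];
 *	kern_return_t kr;
 *
 *	kr = vm_map_read_user(task_map, src_addr_u, buf, size_u);
 *	if (kr == KERN_INVALID_ADDRESS) {
 *		// the source range was not mapped or not readable
 *	}
 */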
19722 
19723 
19724 static __attribute__((always_inline, warn_unused_result))
19725 kern_return_t
19726 vm_map_check_protection_sanitize(
19727 	vm_map_t                map,
19728 	vm_map_offset_ut        start_u,
19729 	vm_map_offset_ut        end_u,
19730 	vm_prot_ut              protection_u,
19731 	vm_sanitize_caller_t    vm_sanitize_caller,
19732 	vm_map_offset_t        *start,
19733 	vm_map_offset_t        *end,
19734 	vm_prot_t              *protection)
19735 {
19736 	kern_return_t           kr;
19737 	vm_map_size_t           size;
19738 
19739 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19740 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19741 	    &size);
19742 	if (__improbable(kr != KERN_SUCCESS)) {
19743 		return kr;
19744 	}
19745 
19746 	/*
19747 	 * Given that the protection is used only for comparisons below,
19748 	 * no sanitization is applied to it.
19749 	 */
19750 	*protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19751 
19752 	return KERN_SUCCESS;
19753 }
19754 
19755 /*
19756  *	vm_map_check_protection:
19757  *
19758  *	Assert that the target map allows the specified
19759  *	privilege on the entire address region given.
19760  *	The entire region must be allocated.
19761  */
19762 boolean_t
19763 vm_map_check_protection(
19764 	vm_map_t                map,
19765 	vm_map_offset_ut        start_u,
19766 	vm_map_offset_ut        end_u,
19767 	vm_prot_ut              protection_u,
19768 	vm_sanitize_caller_t    vm_sanitize_caller)
19769 {
19770 	vm_map_entry_t entry;
19771 	vm_map_entry_t tmp_entry;
19772 	vm_map_offset_t start;
19773 	vm_map_offset_t end;
19774 	vm_prot_t protection;
19775 	kern_return_t kr;
19776 
19777 	kr = vm_map_check_protection_sanitize(map,
19778 	    start_u,
19779 	    end_u,
19780 	    protection_u,
19781 	    vm_sanitize_caller,
19782 	    &start,
19783 	    &end,
19784 	    &protection);
19785 	if (__improbable(kr != KERN_SUCCESS)) {
19786 		kr = vm_sanitize_get_kr(kr);
19787 		if (kr == KERN_SUCCESS) {
19788 			return true;
19789 		}
19790 		return false;
19791 	}
19792 
19793 	vm_map_lock(map);
19794 
19795 	if (start < vm_map_min(map) || end > vm_map_max(map)) {
19796 		vm_map_unlock(map);
19797 		return false;
19798 	}
19799 
19800 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19801 		vm_map_unlock(map);
19802 		return false;
19803 	}
19804 
19805 	entry = tmp_entry;
19806 
19807 	while (start < end) {
19808 		if (entry == vm_map_to_entry(map)) {
19809 			vm_map_unlock(map);
19810 			return false;
19811 		}
19812 
19813 		/*
19814 		 *	No holes allowed!
19815 		 */
19816 
19817 		if (start < entry->vme_start) {
19818 			vm_map_unlock(map);
19819 			return false;
19820 		}
19821 
19822 		/*
19823 		 * Check protection associated with entry.
19824 		 */
19825 
19826 		if ((entry->protection & protection) != protection) {
19827 			vm_map_unlock(map);
19828 			return false;
19829 		}
19830 
19831 		/* go to next entry */
19832 
19833 		start = entry->vme_end;
19834 		entry = entry->vme_next;
19835 	}
19836 	vm_map_unlock(map);
19837 	return true;
19838 }
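/*
 * Hedged usage sketch (hypothetical caller): verify that an entire,
 * hole-free range grants read/write access before depending on it.
 * "start_u"/"end_u" are untrusted offsets and "caller_id" stands in for
 * whatever vm_sanitize_caller_t identifier the real caller would use
 * (placeholder for illustration).
 *
 *	if (!vm_map_check_protection(map, start_u, end_u,
 *	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
 *	    caller_id)) {
 *		return KERN_PROTECTION_FAILURE;
 *	}
 */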
19839 
19840 kern_return_t
19841 vm_map_purgable_control(
19842 	vm_map_t                map,
19843 	vm_map_offset_ut        address_u,
19844 	vm_purgable_t           control,
19845 	int                    *state)
19846 {
19847 	vm_map_offset_t         address;
19848 	vm_map_entry_t          entry;
19849 	vm_object_t             object;
19850 	kern_return_t           kr;
19851 	boolean_t               was_nonvolatile;
19852 
19853 	/*
19854 	 * Vet all the input parameters and current type and state of the
19855 	 * underlying object.  Return with an error if anything is amiss.
19856 	 */
19857 	if (map == VM_MAP_NULL) {
19858 		return KERN_INVALID_ARGUMENT;
19859 	}
19860 
19861 	if (control != VM_PURGABLE_SET_STATE &&
19862 	    control != VM_PURGABLE_GET_STATE &&
19863 	    control != VM_PURGABLE_PURGE_ALL &&
19864 	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
19865 		return KERN_INVALID_ARGUMENT;
19866 	}
19867 
19868 	if (control == VM_PURGABLE_PURGE_ALL) {
19869 		vm_purgeable_object_purge_all();
19870 		return KERN_SUCCESS;
19871 	}
19872 
19873 	if ((control == VM_PURGABLE_SET_STATE ||
19874 	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
19875 	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
19876 	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
19877 		return KERN_INVALID_ARGUMENT;
19878 	}
19879 
19880 	address = vm_sanitize_addr(map, address_u);
19881 
19882 	vm_map_lock_read(map);
19883 
19884 	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
19885 		/*
19886 		 * Must pass a valid non-submap address.
19887 		 */
19888 		vm_map_unlock_read(map);
19889 		return KERN_INVALID_ADDRESS;
19890 	}
19891 
19892 	if ((entry->protection & VM_PROT_WRITE) == 0 &&
19893 	    control != VM_PURGABLE_GET_STATE) {
19894 		/*
19895 		 * Can't apply purgable controls to something you can't write.
19896 		 */
19897 		vm_map_unlock_read(map);
19898 		return KERN_PROTECTION_FAILURE;
19899 	}
19900 
19901 	object = VME_OBJECT(entry);
19902 	if (object == VM_OBJECT_NULL ||
19903 	    object->purgable == VM_PURGABLE_DENY) {
19904 		/*
19905 		 * Object must already be present and be purgeable.
19906 		 */
19907 		vm_map_unlock_read(map);
19908 		return KERN_INVALID_ARGUMENT;
19909 	}
19910 
19911 	vm_object_lock(object);
19912 
19913 #if 00
19914 	if (VME_OFFSET(entry) != 0 ||
19915 	    entry->vme_end - entry->vme_start != object->vo_size) {
19916 		/*
19917 		 * Can only apply purgable controls to the whole (existing)
19918 		 * object at once.
19919 		 */
19920 		vm_map_unlock_read(map);
19921 		vm_object_unlock(object);
19922 		return KERN_INVALID_ARGUMENT;
19923 	}
19924 #endif
19925 
19926 	assert(!entry->is_sub_map);
19927 	assert(!entry->use_pmap); /* purgeable has its own accounting */
19928 
19929 	vm_map_unlock_read(map);
19930 
19931 	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
19932 
19933 	kr = vm_object_purgable_control(object, control, state);
19934 
19935 	if (was_nonvolatile &&
19936 	    object->purgable != VM_PURGABLE_NONVOLATILE &&
19937 	    map->pmap == kernel_pmap) {
19938 #if DEBUG
19939 		object->vo_purgeable_volatilizer = kernel_task;
19940 #endif /* DEBUG */
19941 	}
19942 
19943 	vm_object_unlock(object);
19944 
19945 	return kr;
19946 }
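/*
 * Hedged usage sketch (hypothetical caller): mark the purgeable object
 * backing "addr_u" volatile so it can be reclaimed under memory pressure,
 * then read the state back.  Per the checks above, the address must fall
 * in a writable mapping of an already-purgeable object.
 *
 *	int state = VM_PURGABLE_VOLATILE;
 *	kern_return_t kr;
 *
 *	kr = vm_map_purgable_control(map, addr_u, VM_PURGABLE_SET_STATE, &state);
 *	if (kr == KERN_SUCCESS) {
 *		state = 0;
 *		kr = vm_map_purgable_control(map, addr_u, VM_PURGABLE_GET_STATE, &state);
 *		// "state" now reports whether the object is still volatile
 *		// or has been emptied (VM_PURGABLE_EMPTY).
 *	}
 */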
19947 
19948 void
19949 vm_map_footprint_query_page_info(
19950 	vm_map_t        map,
19951 	vm_map_entry_t  map_entry,
19952 	vm_map_offset_t curr_s_offset,
19953 	int             *disposition_p)
19954 {
19955 	int             pmap_disp;
19956 	vm_object_t     object = VM_OBJECT_NULL;
19957 	int             disposition;
19958 	int             effective_page_size;
19959 
19960 	vm_map_lock_assert_held(map);
19961 	assert(!map->has_corpse_footprint);
19962 	assert(curr_s_offset >= map_entry->vme_start);
19963 	assert(curr_s_offset < map_entry->vme_end);
19964 
19965 	if (map_entry->is_sub_map) {
19966 		if (!map_entry->use_pmap) {
19967 			/* nested pmap: no footprint */
19968 			*disposition_p = 0;
19969 			return;
19970 		}
19971 	} else {
19972 		object = VME_OBJECT(map_entry);
19973 		if (object == VM_OBJECT_NULL) {
19974 			/* nothing mapped here: no need to ask */
19975 			*disposition_p = 0;
19976 			return;
19977 		}
19978 	}
19979 
19980 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19981 
19982 	pmap_disp = 0;
19983 
19984 	/*
19985 	 * Query the pmap.
19986 	 */
19987 	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19988 
19989 	/*
19990 	 * Compute this page's disposition.
19991 	 */
19992 	disposition = 0;
19993 
19994 	/* deal with "alternate accounting" first */
19995 	if (!map_entry->is_sub_map &&
19996 	    object->vo_no_footprint) {
19997 		/* does not count in footprint */
19998 //		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19999 	} else if (!map_entry->is_sub_map &&
20000 	    !object->internal &&
20001 	    object->vo_ledger_tag &&
20002 	    VM_OBJECT_OWNER(object) != NULL &&
20003 	    VM_OBJECT_OWNER(object)->map == map) {
20004 		/* owned external object: wired pages count in footprint */
20005 		assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20006 		if ((((curr_s_offset
20007 		    - map_entry->vme_start
20008 		    + VME_OFFSET(map_entry))
20009 		    / effective_page_size) <
20010 		    object->wired_page_count)) {
20011 			/*
20012 			 * External object owned by this task: report the first
20013 			 * "#wired" pages as "resident" (to show that they
20014 			 * contribute to the footprint) but not "dirty"
20015 			 * (to avoid double-counting with the fake "owned"
20016 			 * region we'll report at the end of the address space
20017 			 * to account for all (mapped or not) memory
20018 			 * owned by this task).
20019 			 */
20020 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20021 		}
20022 	} else if (!map_entry->is_sub_map &&
20023 	    object->internal &&
20024 	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
20025 	    (object->purgable == VM_PURGABLE_DENY &&
20026 	    object->vo_ledger_tag)) &&
20027 	    VM_OBJECT_OWNER(object) != NULL &&
20028 	    VM_OBJECT_OWNER(object)->map == map) {
20029 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20030 		if ((((curr_s_offset
20031 		    - map_entry->vme_start
20032 		    + VME_OFFSET(map_entry))
20033 		    / effective_page_size) <
20034 		    (object->resident_page_count +
20035 		    vm_compressor_pager_get_count(object->pager)))) {
20036 			/*
20037 			 * Non-volatile purgeable object owned
20038 			 * by this task: report the first
20039 			 * "#resident + #compressed" pages as
20040 			 * "resident" (to show that they
20041 			 * contribute to the footprint) but not
20042 			 * "dirty" (to avoid double-counting
20043 			 * with the fake "non-volatile" region
20044 			 * we'll report at the end of the
20045 			 * address space to account for all
20046 			 * (mapped or not) non-volatile memory
20047 			 * owned by this task).
20048 			 */
20049 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20050 		}
20051 	} else if (!map_entry->is_sub_map &&
20052 	    object->internal &&
20053 	    (object->purgable == VM_PURGABLE_VOLATILE ||
20054 	    object->purgable == VM_PURGABLE_EMPTY) &&
20055 	    VM_OBJECT_OWNER(object) != NULL &&
20056 	    VM_OBJECT_OWNER(object)->map == map) {
20057 		if (object->internal) {
20058 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20059 		}
20060 		if ((((curr_s_offset
20061 		    - map_entry->vme_start
20062 		    + VME_OFFSET(map_entry))
20063 		    / effective_page_size) <
20064 		    object->wired_page_count)) {
20065 			/*
20066 			 * Volatile|empty purgeable object owned
20067 			 * by this task: report the first
20068 			 * "#wired" pages as "resident" (to
20069 			 * show that they contribute to the
20070 			 * footprint) but not "dirty" (to avoid
20071 			 * double-counting with the fake
20072 			 * "non-volatile" region we'll report
20073 			 * at the end of the address space to
20074 			 * account for all (mapped or not)
20075 			 * non-volatile memory owned by this
20076 			 * task).
20077 			 */
20078 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20079 		}
20080 	} else if (!map_entry->is_sub_map &&
20081 	    map_entry->iokit_acct &&
20082 	    object->internal &&
20083 	    object->purgable == VM_PURGABLE_DENY) {
20084 		/*
20085 		 * Non-purgeable IOKit memory: phys_footprint
20086 		 * includes the entire virtual mapping.
20087 		 */
20088 		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20089 		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20090 		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20091 	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20092 	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20093 		/* alternate accounting */
20094 #if __arm64__ && (DEVELOPMENT || DEBUG)
20095 		if (map->pmap->footprint_was_suspended) {
20096 			/*
20097 			 * The assertion below can fail if dyld
20098 			 * suspended footprint accounting
20099 			 * while doing some adjustments to
20100 			 * this page;  the mapping would say
20101 			 * "use pmap accounting" but the page
20102 			 * would be marked "alternate
20103 			 * accounting".
20104 			 */
20105 		} else
20106 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20107 		{
20108 			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20109 		}
20110 		disposition = 0;
20111 	} else {
20112 		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20113 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20114 			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20115 			disposition |= VM_PAGE_QUERY_PAGE_REF;
20116 			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20117 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20118 			} else {
20119 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20120 			}
20121 			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20122 				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20123 			}
20124 		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20125 			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20126 			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20127 		}
20128 	}
20129 
20130 	*disposition_p = disposition;
20131 }
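/*
 * Informal note on interpreting the result: the disposition is a mask of
 * VM_PAGE_QUERY_PAGE_* bits.  Following the logic above, a resident
 * pmap-accounted page reports VM_PAGE_QUERY_PAGE_PRESENT |
 * VM_PAGE_QUERY_PAGE_REF (plus VM_PAGE_QUERY_PAGE_DIRTY if internal),
 * a compressed page reports VM_PAGE_QUERY_PAGE_PAGED_OUT, and pages
 * covered by "alternate accounting" report 0.
 */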
20132 
20133 kern_return_t
20134 vm_map_page_info(
20135 	vm_map_t                map,
20136 	vm_map_offset_ut        offset_u,
20137 	vm_page_info_flavor_t   flavor,
20138 	vm_page_info_t          info,
20139 	mach_msg_type_number_t  *count)
20140 {
20141 	return vm_map_page_range_info_internal(map,
20142 	           offset_u, /* start of range */
20143 	           vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded to the page boundary in the call */
20144 	           (int)-1, /* effective_page_shift: unspecified */
20145 	           flavor,
20146 	           info,
20147 	           count);
20148 }
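/*
 * Hedged usage sketch (hypothetical caller): query the basic info for a
 * single page.  "offset_u" is an untrusted address; the flavor/count pair
 * must match, as enforced by vm_map_page_range_info_internal() below.
 *
 *	struct vm_page_info_basic basic;
 *	mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
 *	kern_return_t kr;
 *
 *	kr = vm_map_page_info(map, offset_u, VM_PAGE_INFO_BASIC,
 *	    (vm_page_info_t)&basic, &count);
 *	if (kr == KERN_SUCCESS &&
 *	    (basic.disposition & VM_PAGE_QUERY_PAGE_PRESENT)) {
 *		// the page is resident somewhere in the shadow chain
 *	}
 */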
20149 
20150 static __attribute__((always_inline, warn_unused_result))
20151 kern_return_t
20152 vm_map_page_range_info_sanitize(
20153 	vm_map_t                map,
20154 	vm_map_offset_ut        start_offset_u,
20155 	vm_map_offset_ut        end_offset_u,
20156 	vm_map_offset_t         effective_page_mask,
20157 	vm_map_offset_t        *start,
20158 	vm_map_offset_t        *end,
20159 	vm_map_offset_t        *offset_in_page)
20160 {
20161 	kern_return_t           retval;
20162 	vm_map_size_t           size;
20163 
20164 	/*
20165 	 * Perform validation against map's mask but don't align start/end,
20166 	 * as those need to be aligned wrt effective_page_mask
20167 	 */
20168 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20169 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20170 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20171 	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20172 	    end, &size);
20173 	if (retval != KERN_SUCCESS) {
20174 		return retval;
20175 	}
20176 
20177 	retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20178 	    VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20179 	    VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20180 	    end, &size);
20181 	if (retval != KERN_SUCCESS) {
20182 		return retval;
20183 	}
20184 
20185 	*offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20186 	    start_offset_u);
20187 
20188 	return KERN_SUCCESS;
20189 }
20190 
20191 kern_return_t
20192 vm_map_page_range_info_internal(
20193 	vm_map_t                map,
20194 	vm_map_offset_ut        start_offset_u,
20195 	vm_map_offset_ut        end_offset_u,
20196 	int                     effective_page_shift,
20197 	vm_page_info_flavor_t   flavor,
20198 	vm_page_info_t          info,
20199 	mach_msg_type_number_t  *count)
20200 {
20201 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
20202 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20203 	vm_page_t               m = VM_PAGE_NULL;
20204 	kern_return_t           retval = KERN_SUCCESS;
20205 	int                     disposition = 0;
20206 	int                     ref_count = 0;
20207 	int                     depth = 0, info_idx = 0;
20208 	vm_page_info_basic_t    basic_info = 0;
20209 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20210 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20211 	boolean_t               do_region_footprint;
20212 	ledger_amount_t         ledger_resident, ledger_compressed;
20213 	int                     effective_page_size;
20214 	vm_map_offset_t         effective_page_mask;
20215 
20216 	switch (flavor) {
20217 	case VM_PAGE_INFO_BASIC:
20218 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20219 			/*
20220 			 * The "vm_page_info_basic_data" structure was not
20221 			 * properly padded, so allow the size to be off by
20222 			 * one to maintain backwards binary compatibility...
20223 			 */
20224 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20225 				return KERN_INVALID_ARGUMENT;
20226 			}
20227 		}
20228 		break;
20229 	default:
20230 		return KERN_INVALID_ARGUMENT;
20231 	}
20232 
20233 	if (effective_page_shift == -1) {
20234 		effective_page_shift = vm_self_region_page_shift_safely(map);
20235 		if (effective_page_shift == -1) {
20236 			return KERN_INVALID_ARGUMENT;
20237 		}
20238 	}
20239 	effective_page_size = (1 << effective_page_shift);
20240 	effective_page_mask = effective_page_size - 1;
20241 
20242 
20243 	retval = vm_map_page_range_info_sanitize(map,
20244 	    start_offset_u,
20245 	    end_offset_u,
20246 	    effective_page_mask,
20247 	    &start,
20248 	    &end,
20249 	    &offset_in_page);
20250 	if (retval != KERN_SUCCESS) {
20251 		return vm_sanitize_get_kr(retval);
20252 	}
20253 
20254 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20255 
20256 	do_region_footprint = task_self_region_footprint();
20257 	disposition = 0;
20258 	ref_count = 0;
20259 	depth = 0;
20260 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20261 
20262 	vm_map_lock_read(map);
20263 
20264 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20265 
20266 	for (curr_s_offset = start; curr_s_offset < end;) {
20267 		/*
20268 		 * New lookup needs reset of these variables.
20269 		 */
20270 		curr_object = object = VM_OBJECT_NULL;
20271 		offset_in_object = 0;
20272 		ref_count = 0;
20273 		depth = 0;
20274 
20275 		if (do_region_footprint &&
20276 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20277 			/*
20278 			 * Request for "footprint" info about a page beyond
20279 			 * the end of address space: this must be for
20280 			 * the fake region vm_map_region_recurse_64()
20281 			 * reported to account for non-volatile purgeable
20282 			 * memory owned by this task.
20283 			 */
20284 			disposition = 0;
20285 
20286 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20287 			    (unsigned) ledger_compressed) {
20288 				/*
20289 				 * We haven't reported all the "non-volatile
20290 				 * compressed" pages yet, so report this fake
20291 				 * page as "compressed".
20292 				 */
20293 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20294 			} else {
20295 				/*
20296 				 * We've reported all the non-volatile
20297 				 * compressed page but not all the non-volatile
20298 				 * pages , so report this fake page as
20299 				 * "resident dirty".
20300 				 */
20301 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20302 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20303 				disposition |= VM_PAGE_QUERY_PAGE_REF;
20304 			}
20305 			switch (flavor) {
20306 			case VM_PAGE_INFO_BASIC:
20307 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20308 				basic_info->disposition = disposition;
20309 				basic_info->ref_count = 1;
20310 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20311 				basic_info->offset = 0;
20312 				basic_info->depth = 0;
20313 
20314 				info_idx++;
20315 				break;
20316 			}
20317 			curr_s_offset += effective_page_size;
20318 			continue;
20319 		}
20320 
20321 		/*
20322 		 * First, find the map entry covering "curr_s_offset", going down
20323 		 * submaps if necessary.
20324 		 */
20325 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20326 			/* no entry -> no object -> no page */
20327 
20328 			if (curr_s_offset < vm_map_min(map)) {
20329 				/*
20330 				 * Illegal address that falls below map min.
20331 				 */
20332 				curr_e_offset = MIN(end, vm_map_min(map));
20333 			} else if (curr_s_offset >= vm_map_max(map)) {
20334 				/*
20335 				 * Illegal address that falls on/after map max.
20336 				 */
20337 				curr_e_offset = end;
20338 			} else if (map_entry == vm_map_to_entry(map)) {
20339 				/*
20340 				 * Hit a hole.
20341 				 */
20342 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20343 					/*
20344 					 * Empty map.
20345 					 */
20346 					curr_e_offset = MIN(map->max_offset, end);
20347 				} else {
20348 					/*
20349 					 * Hole at start of the map.
20350 					 */
20351 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20352 				}
20353 			} else {
20354 				if (map_entry->vme_next == vm_map_to_entry(map)) {
20355 					/*
20356 					 * Hole at the end of the map.
20357 					 */
20358 					curr_e_offset = MIN(map->max_offset, end);
20359 				} else {
20360 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20361 				}
20362 			}
20363 
20364 			assert(curr_e_offset >= curr_s_offset);
20365 
20366 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20367 
20368 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20369 
20370 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20371 
20372 			curr_s_offset = curr_e_offset;
20373 
20374 			info_idx += num_pages;
20375 
20376 			continue;
20377 		}
20378 
20379 		/* compute offset from this map entry's start */
20380 		offset_in_object = curr_s_offset - map_entry->vme_start;
20381 
20382 		/* compute offset into this map entry's object (or submap) */
20383 		offset_in_object += VME_OFFSET(map_entry);
20384 
20385 		if (map_entry->is_sub_map) {
20386 			vm_map_t sub_map = VM_MAP_NULL;
20387 			vm_page_info_t submap_info = 0;
20388 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20389 
20390 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20391 
20392 			submap_s_offset = offset_in_object;
20393 			submap_e_offset = submap_s_offset + range_len;
20394 
20395 			sub_map = VME_SUBMAP(map_entry);
20396 
20397 			vm_map_reference(sub_map);
20398 			vm_map_unlock_read(map);
20399 
20400 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20401 
20402 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20403 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20404 
20405 			retval = vm_map_page_range_info_internal(sub_map,
20406 			    submap_s_offset,
20407 			    submap_e_offset,
20408 			    effective_page_shift,
20409 			    VM_PAGE_INFO_BASIC,
20410 			    (vm_page_info_t) submap_info,
20411 			    count);
20412 
20413 			assert(retval == KERN_SUCCESS);
20414 
20415 			vm_map_lock_read(map);
20416 			vm_map_deallocate(sub_map);
20417 
20418 			/* Move the "info" index by the number of pages we inspected.*/
20419 			info_idx += range_len >> effective_page_shift;
20420 
20421 			/* Move our current offset by the size of the range we inspected.*/
20422 			curr_s_offset += range_len;
20423 
20424 			continue;
20425 		}
20426 
20427 		object = VME_OBJECT(map_entry);
20428 
20429 		if (object == VM_OBJECT_NULL) {
20430 			/*
20431 			 * We don't have an object here and, hence,
20432 			 * no pages to inspect. We'll fill up the
20433 			 * info structure appropriately.
20434 			 */
20435 
20436 			curr_e_offset = MIN(map_entry->vme_end, end);
20437 
20438 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20439 
20440 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20441 
20442 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20443 
20444 			curr_s_offset = curr_e_offset;
20445 
20446 			info_idx += num_pages;
20447 
20448 			continue;
20449 		}
20450 
20451 		if (do_region_footprint) {
20452 			disposition = 0;
20453 			if (map->has_corpse_footprint) {
20454 				/*
20455 				 * Query the page info data we saved
20456 				 * while forking the corpse.
20457 				 */
20458 				vm_map_corpse_footprint_query_page_info(
20459 					map,
20460 					curr_s_offset,
20461 					&disposition);
20462 			} else {
20463 				/*
20464 				 * Query the live pmap for footprint info
20465 				 * about this page.
20466 				 */
20467 				vm_map_footprint_query_page_info(
20468 					map,
20469 					map_entry,
20470 					curr_s_offset,
20471 					&disposition);
20472 			}
20473 			switch (flavor) {
20474 			case VM_PAGE_INFO_BASIC:
20475 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20476 				basic_info->disposition = disposition;
20477 				basic_info->ref_count = 1;
20478 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20479 				basic_info->offset = 0;
20480 				basic_info->depth = 0;
20481 
20482 				info_idx++;
20483 				break;
20484 			}
20485 			curr_s_offset += effective_page_size;
20486 			continue;
20487 		}
20488 
20489 		vm_object_reference(object);
20490 		/*
20491 		 * Shared mode -- so we can allow other readers
20492 		 * to grab the lock too.
20493 		 */
20494 		vm_object_lock_shared(object);
20495 
20496 		curr_e_offset = MIN(map_entry->vme_end, end);
20497 
20498 		vm_map_unlock_read(map);
20499 
20500 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20501 
20502 		curr_object = object;
20503 
20504 		for (; curr_s_offset < curr_e_offset;) {
20505 			if (object == curr_object) {
20506 				/* account for our object reference above. */
20507 				ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20508 			} else {
20509 				ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20510 			}
20511 
20512 			curr_offset_in_object = offset_in_object;
20513 
20514 			for (;;) {
20515 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20516 
20517 				if (m != VM_PAGE_NULL) {
20518 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20519 					break;
20520 				} else {
20521 					if (curr_object->internal &&
20522 					    curr_object->alive &&
20523 					    !curr_object->terminating &&
20524 					    curr_object->pager_ready) {
20525 						if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20526 						    == VM_EXTERNAL_STATE_EXISTS) {
20527 							/* the pager has that page */
20528 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20529 							break;
20530 						}
20531 					}
20532 
20533 					/*
20534 					 * Go down the VM object shadow chain until we find the page
20535 					 * we're looking for.
20536 					 */
20537 
20538 					if (curr_object->shadow != VM_OBJECT_NULL) {
20539 						vm_object_t shadow = VM_OBJECT_NULL;
20540 
20541 						curr_offset_in_object += curr_object->vo_shadow_offset;
20542 						shadow = curr_object->shadow;
20543 
20544 						vm_object_lock_shared(shadow);
20545 						vm_object_unlock(curr_object);
20546 
20547 						curr_object = shadow;
20548 						depth++;
20549 						continue;
20550 					} else {
20551 						break;
20552 					}
20553 				}
20554 			}
20555 
20556 			/* The ref_count is not strictly accurate: it measures the number   */
20557 			/* of entities holding a ref on the object; they may not be mapping */
20558 			/* the object, or may not be mapping the section holding the        */
20559 			/* target page, but it's still a ballpark number and, though an     */
20560 			/* over-count, it picks up the copy-on-write cases.                 */
20561 
20562 			/* We could also get a picture of page sharing from pmap_attributes */
20563 			/* but this would undercount, as only faulted-in mappings would     */
20564 			/* show up.							    */
20565 
20566 			if ((curr_object == object) && curr_object->shadow) {
20567 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20568 			}
20569 
20570 			if (!curr_object->internal) {
20571 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20572 			}
20573 
20574 			if (m != VM_PAGE_NULL) {
20575 				if (m->vmp_fictitious) {
20576 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20577 				} else {
20578 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20579 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20580 					}
20581 
20582 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20583 						disposition |= VM_PAGE_QUERY_PAGE_REF;
20584 					}
20585 
20586 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20587 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20588 					}
20589 
20590 					/*
20591 					 * XXX TODO4K:
20592 					 * when this routine deals with 4k
20593 					 * pages, check the appropriate CS bit
20594 					 * here.
20595 					 */
20596 					if (m->vmp_cs_validated) {
20597 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20598 					}
20599 					if (m->vmp_cs_tainted) {
20600 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20601 					}
20602 					if (m->vmp_cs_nx) {
20603 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20604 					}
20605 					if (m->vmp_reusable || curr_object->all_reusable) {
20606 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20607 					}
20608 				}
20609 			}
20610 
20611 			switch (flavor) {
20612 			case VM_PAGE_INFO_BASIC:
20613 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20614 				basic_info->disposition = disposition;
20615 				basic_info->ref_count = ref_count;
20616 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
20617 				    VM_KERNEL_ADDRHASH(curr_object);
20618 				basic_info->offset =
20619 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20620 				basic_info->depth = depth;
20621 
20622 				info_idx++;
20623 				break;
20624 			}
20625 
20626 			disposition = 0;
20627 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20628 
20629 			/*
20630 			 * Move to next offset in the range and in our object.
20631 			 */
20632 			curr_s_offset += effective_page_size;
20633 			offset_in_object += effective_page_size;
20634 			curr_offset_in_object = offset_in_object;
20635 
20636 			if (curr_object != object) {
20637 				vm_object_unlock(curr_object);
20638 
20639 				curr_object = object;
20640 
20641 				vm_object_lock_shared(curr_object);
20642 			} else {
20643 				vm_object_lock_yield_shared(curr_object);
20644 			}
20645 		}
20646 
20647 		vm_object_unlock(curr_object);
20648 		vm_object_deallocate(curr_object);
20649 
20650 		vm_map_lock_read(map);
20651 	}
20652 
20653 	vm_map_unlock_read(map);
20654 	return retval;
20655 }
20656 
20657 static __attribute__((always_inline, warn_unused_result))
20658 kern_return_t
20659 vm_map_msync_sanitize(
20660 	vm_map_t                map,
20661 	vm_map_address_ut       address_u,
20662 	vm_map_size_ut          size_u,
20663 	vm_object_offset_t     *address,
20664 	vm_map_size_t          *size)
20665 {
20666 	vm_object_offset_t      end;
20667 
20668 	return vm_sanitize_addr_size(address_u, size_u,
20669 	           VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20670 	           map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
20671 	           address, &end, size);
20672 }
20673 
20674 /*
20675  *	vm_map_msync
20676  *
20677  *	Synchronises the memory range specified with its backing store
20678  *	image by either flushing or cleaning the contents to the appropriate
20679  *	memory manager, engaging in a memory object synchronize dialog with
20680  *	the manager.  The client doesn't return until the manager issues
20681  *	the m_o_s_completed message.  MIG magically converts the user task parameter
20682  *	to the task's address map.
20683  *
20684  *	interpretation of sync_flags
20685  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20686  *				  pages to manager.
20687  *
20688  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20689  *				- discard pages, write dirty or precious
20690  *				  pages back to memory manager.
20691  *
20692  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20693  *				- write dirty or precious pages back to
20694  *				  the memory manager.
20695  *
20696  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20697  *				  is a hole in the region, and we would
20698  *				  have returned KERN_SUCCESS, return
20699  *				  KERN_INVALID_ADDRESS instead.
20700  *
20701  *	NOTE
20702  *	The memory object attributes have not yet been implemented, this
20703  *	function will have to deal with the invalidate attribute
20704  *
20705  *	RETURNS
20706  *	KERN_INVALID_TASK		Bad task parameter
20707  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20708  *	KERN_SUCCESS			The usual.
20709  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20710  */
20711 
20712 kern_return_t
20713 vm_map_msync(
20714 	vm_map_t                map,
20715 	vm_map_address_ut       address_u,
20716 	vm_map_size_ut          size_u,
20717 	vm_sync_t               sync_flags)
20718 {
20719 	vm_map_entry_t          entry;
20720 	vm_map_size_t           size, amount_left;
20721 	vm_object_offset_t      address, offset;
20722 	vm_object_offset_t      start_offset, end_offset;
20723 	boolean_t               do_sync_req;
20724 	boolean_t               had_hole = FALSE;
20725 	vm_map_offset_t         pmap_offset;
20726 	kern_return_t           kr;
20727 
20728 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20729 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20730 		return KERN_INVALID_ARGUMENT;
20731 	}
20732 
20733 	if (map == VM_MAP_NULL) {
20734 		return KERN_INVALID_TASK;
20735 	}
20736 
20737 	kr = vm_map_msync_sanitize(map,
20738 	    address_u,
20739 	    size_u,
20740 	    &address,
20741 	    &size);
20742 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20743 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20744 	}
20745 	if (__improbable(kr != KERN_SUCCESS)) {
20746 		return vm_sanitize_get_kr(kr);
20747 	}
20748 
20749 	amount_left = size;
20750 
20751 	while (amount_left > 0) {
20752 		vm_object_size_t        flush_size;
20753 		vm_object_t             object;
20754 
20755 		vm_map_lock(map);
20756 		if (!vm_map_lookup_entry(map,
20757 		    address,
20758 		    &entry)) {
20759 			vm_map_size_t   skip;
20760 
20761 			/*
20762 			 * hole in the address map.
20763 			 */
20764 			had_hole = TRUE;
20765 
20766 			if (sync_flags & VM_SYNC_KILLPAGES) {
20767 				/*
20768 				 * For VM_SYNC_KILLPAGES, there should be
20769 				 * no holes in the range, since we couldn't
20770 				 * prevent someone else from allocating in
20771 				 * that hole and we wouldn't want to "kill"
20772 				 * their pages.
20773 				 */
20774 				vm_map_unlock(map);
20775 				break;
20776 			}
20777 
20778 			/*
20779 			 * Check for empty map.
20780 			 */
20781 			if (entry == vm_map_to_entry(map) &&
20782 			    entry->vme_next == entry) {
20783 				vm_map_unlock(map);
20784 				break;
20785 			}
20786 			/*
20787 			 * Check that we don't wrap and that
20788 			 * we have at least one real map entry.
20789 			 */
20790 			if ((map->hdr.nentries == 0) ||
20791 			    (entry->vme_next->vme_start < address)) {
20792 				vm_map_unlock(map);
20793 				break;
20794 			}
20795 			/*
20796 			 * Move up to the next entry if needed
20797 			 */
20798 			skip = (entry->vme_next->vme_start - address);
20799 			if (skip >= amount_left) {
20800 				amount_left = 0;
20801 			} else {
20802 				amount_left -= skip;
20803 			}
20804 			address = entry->vme_next->vme_start;
20805 			vm_map_unlock(map);
20806 			continue;
20807 		}
20808 
20809 		offset = address - entry->vme_start;
20810 		pmap_offset = address;
20811 
20812 		/*
20813 		 * do we have more to flush than is contained in this
20814 		 * entry ?
20815 		 */
20816 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
20817 			flush_size = entry->vme_end -
20818 			    (entry->vme_start + offset);
20819 		} else {
20820 			flush_size = amount_left;
20821 		}
20822 		amount_left -= flush_size;
20823 		address += flush_size;
20824 
20825 		if (entry->is_sub_map == TRUE) {
20826 			vm_map_t        local_map;
20827 			vm_map_offset_t local_offset;
20828 
20829 			local_map = VME_SUBMAP(entry);
20830 			local_offset = VME_OFFSET(entry);
20831 			vm_map_reference(local_map);
20832 			vm_map_unlock(map);
20833 			if (vm_map_msync(
20834 				    local_map,
20835 				    local_offset,
20836 				    flush_size,
20837 				    sync_flags) == KERN_INVALID_ADDRESS) {
20838 				had_hole = TRUE;
20839 			}
20840 			vm_map_deallocate(local_map);
20841 			continue;
20842 		}
20843 		object = VME_OBJECT(entry);
20844 
20845 		/*
20846 		 * We can't sync this object if the object has not been
20847 		 * created yet
20848 		 */
20849 		if (object == VM_OBJECT_NULL) {
20850 			vm_map_unlock(map);
20851 			continue;
20852 		}
20853 		offset += VME_OFFSET(entry);
20854 
20855 		vm_object_lock(object);
20856 
20857 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20858 			int kill_pages = 0;
20859 
20860 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20861 				/*
20862 				 * This is a destructive operation and so we
20863 				 * err on the side of limiting the range of
20864 				 * the operation.
20865 				 */
20866 				start_offset = vm_object_round_page(offset);
20867 				end_offset = vm_object_trunc_page(offset + flush_size);
20868 
20869 				if (end_offset <= start_offset) {
20870 					vm_object_unlock(object);
20871 					vm_map_unlock(map);
20872 					continue;
20873 				}
20874 
20875 				pmap_offset += start_offset - offset;
20876 			} else {
20877 				start_offset = offset;
20878 				end_offset = offset + flush_size;
20879 			}
20880 
20881 			if (sync_flags & VM_SYNC_KILLPAGES) {
20882 				if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
20883 				    ((object->copy_strategy !=
20884 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
20885 				    (object->vo_copy == VM_OBJECT_NULL))) &&
20886 				    (object->shadow == VM_OBJECT_NULL)) {
20887 					if (os_ref_get_count_raw(&object->ref_count) != 1) {
20888 						vm_page_stats_reusable.free_shared++;
20889 					}
20890 					kill_pages = 1;
20891 				} else {
20892 					kill_pages = -1;
20893 				}
20894 			}
20895 			if (kill_pages != -1) {
20896 				vm_object_deactivate_pages(
20897 					object,
20898 					start_offset,
20899 					(vm_object_size_t) (end_offset - start_offset),
20900 					kill_pages,
20901 					FALSE, /* reusable_pages */
20902 					FALSE, /* reusable_no_write */
20903 					map->pmap,
20904 					pmap_offset);
20905 			}
20906 			vm_object_unlock(object);
20907 			vm_map_unlock(map);
20908 			continue;
20909 		}
20910 		/*
20911 		 * We can't sync this object if there isn't a pager.
20912 		 * Don't bother to sync internal objects, since there can't
20913 		 * be any "permanent" storage for these objects anyway.
20914 		 */
20915 		if ((object->pager == MEMORY_OBJECT_NULL) ||
20916 		    (object->internal) || (object->private)) {
20917 			vm_object_unlock(object);
20918 			vm_map_unlock(map);
20919 			continue;
20920 		}
20921 		/*
20922 		 * keep reference on the object until syncing is done
20923 		 */
20924 		vm_object_reference_locked(object);
20925 		vm_object_unlock(object);
20926 
20927 		vm_map_unlock(map);
20928 
20929 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20930 			start_offset = vm_object_trunc_page(offset);
20931 			end_offset = vm_object_round_page(offset + flush_size);
20932 		} else {
20933 			start_offset = offset;
20934 			end_offset = offset + flush_size;
20935 		}
20936 
20937 		do_sync_req = vm_object_sync(object,
20938 		    start_offset,
20939 		    (end_offset - start_offset),
20940 		    sync_flags & VM_SYNC_INVALIDATE,
20941 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20942 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20943 		    sync_flags & VM_SYNC_SYNCHRONOUS);
20944 
20945 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20946 			/*
20947 			 * clear out the clustering and read-ahead hints
20948 			 */
20949 			vm_object_lock(object);
20950 
20951 			object->pages_created = 0;
20952 			object->pages_used = 0;
20953 			object->sequential = 0;
20954 			object->last_alloc = 0;
20955 
20956 			vm_object_unlock(object);
20957 		}
20958 		vm_object_deallocate(object);
20959 	} /* while */
20960 
20961 	/* for proper msync() behaviour */
20962 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20963 		return KERN_INVALID_ADDRESS;
20964 	}
20965 
20966 	return KERN_SUCCESS;
20967 }/* vm_msync */
20968 
20969 void
20970 vm_named_entry_associate_vm_object(
20971 	vm_named_entry_t        named_entry,
20972 	vm_object_t             object,
20973 	vm_object_offset_t      offset,
20974 	vm_object_size_t        size,
20975 	vm_prot_t               prot)
20976 {
20977 	vm_map_copy_t copy;
20978 	vm_map_entry_t copy_entry;
20979 
20980 	assert(!named_entry->is_sub_map);
20981 	assert(!named_entry->is_copy);
20982 	assert(!named_entry->is_object);
20983 	assert(!named_entry->internal);
20984 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20985 
20986 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20987 	copy->offset = offset;
20988 	copy->size = size;
20989 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20990 
20991 	copy_entry = vm_map_copy_entry_create(copy);
20992 	copy_entry->protection = prot;
20993 	copy_entry->max_protection = prot;
20994 	copy_entry->use_pmap = TRUE;
20995 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20996 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20997 	VME_OBJECT_SET(copy_entry, object, false, 0);
20998 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20999 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
21000 
21001 	named_entry->backing.copy = copy;
21002 	named_entry->is_object = TRUE;
21003 	if (object->internal) {
21004 		named_entry->internal = TRUE;
21005 	}
21006 
21007 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21008 	    named_entry, copy, object, offset, size, prot);
21009 }
21010 
21011 vm_object_t
21012 vm_named_entry_to_vm_object(
21013 	vm_named_entry_t named_entry)
21014 {
21015 	vm_map_copy_t   copy;
21016 	vm_map_entry_t  copy_entry;
21017 	vm_object_t     object;
21018 
21019 	assert(!named_entry->is_sub_map);
21020 	assert(!named_entry->is_copy);
21021 	assert(named_entry->is_object);
21022 	copy = named_entry->backing.copy;
21023 	assert(copy != VM_MAP_COPY_NULL);
21024 	/*
21025 	 * Assert that the vm_map_copy is coming from the right
21026 	 * zone and hasn't been forged
21027 	 */
21028 	vm_map_copy_require(copy);
21029 	assert(copy->cpy_hdr.nentries == 1);
21030 	copy_entry = vm_map_copy_first_entry(copy);
21031 	object = VME_OBJECT(copy_entry);
21032 
21033 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21034 
21035 	return object;
21036 }
21037 
21038 /*
21039  *	Routine:	convert_port_entry_to_map
21040  *	Purpose:
21041  *		Convert from a port specifying an entry or a task
21042  *		to a map. Doesn't consume the port ref; produces a map ref,
21043  *		which may be null.  Unlike convert_port_to_map, the
21044  *		port may be backed by a task or a named entry.
21045  *	Conditions:
21046  *		Nothing locked.
21047  */
21048 
21049 vm_map_t
21050 convert_port_entry_to_map(
21051 	ipc_port_t      port)
21052 {
21053 	vm_map_t map = VM_MAP_NULL;
21054 	vm_named_entry_t named_entry;
21055 
21056 	if (!IP_VALID(port)) {
21057 		return VM_MAP_NULL;
21058 	}
21059 
21060 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21061 		return convert_port_to_map(port);
21062 	}
21063 
21064 	named_entry = mach_memory_entry_from_port(port);
21065 
21066 	if ((named_entry->is_sub_map) &&
21067 	    (named_entry->protection & VM_PROT_WRITE)) {
21068 		map = named_entry->backing.map;
21069 		if (map->pmap != PMAP_NULL) {
21070 			if (map->pmap == kernel_pmap) {
21071 				panic("userspace has access "
21072 				    "to a kernel map %p", map);
21073 			}
21074 			pmap_require(map->pmap);
21075 		}
21076 		vm_map_reference(map);
21077 	}
21078 
21079 	return map;
21080 }
21081 
21082 /*
21083  * Export routines to other components for the things we access locally through
21084  * macros.
21085  */
21086 #undef current_map
21087 vm_map_t
21088 current_map(void)
21089 {
21090 	return current_map_fast();
21091 }
21092 
21093 /*
21094  *	vm_map_reference:
21095  *
21096  *	Takes a reference on the specified map.
21097  */
21098 void
21099 vm_map_reference(
21100 	vm_map_t        map)
21101 {
21102 	if (__probable(map != VM_MAP_NULL)) {
21103 		vm_map_require(map);
21104 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21105 	}
21106 }
21107 
21108 /*
21109  *	vm_map_deallocate:
21110  *
21111  *	Removes a reference from the specified map,
21112  *	destroying it if no references remain.
21113  *	The map should not be locked.
21114  */
21115 void
21116 vm_map_deallocate(
21117 	vm_map_t        map)
21118 {
21119 	if (__probable(map != VM_MAP_NULL)) {
21120 		vm_map_require(map);
21121 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21122 			vm_map_destroy(map);
21123 		}
21124 	}
21125 }
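/*
 * Informal note: vm_map_reference() and vm_map_deallocate() must be
 * balanced.  A typical temporary borrow of a map looks like
 * (hypothetical caller):
 *
 *	vm_map_reference(map);
 *	... use "map" without any other lifetime guarantee ...
 *	vm_map_deallocate(map);
 *
 * Dropping the last reference tears the map down via vm_map_destroy().
 */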
21126 
21127 void
21128 vm_map_inspect_deallocate(
21129 	vm_map_inspect_t      map)
21130 {
21131 	vm_map_deallocate((vm_map_t)map);
21132 }
21133 
21134 void
21135 vm_map_read_deallocate(
21136 	vm_map_read_t      map)
21137 {
21138 	vm_map_deallocate((vm_map_t)map);
21139 }
21140 
21141 
21142 void
21143 vm_map_disable_NX(vm_map_t map)
21144 {
21145 	if (map == NULL) {
21146 		return;
21147 	}
21148 	if (map->pmap == NULL) {
21149 		return;
21150 	}
21151 
21152 	pmap_disable_NX(map->pmap);
21153 }
21154 
21155 void
21156 vm_map_disallow_data_exec(vm_map_t map)
21157 {
21158 	if (map == NULL) {
21159 		return;
21160 	}
21161 
21162 	map->map_disallow_data_exec = TRUE;
21163 }
21164 
21165 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21166  * more descriptive.
21167  */
21168 void
21169 vm_map_set_32bit(vm_map_t map)
21170 {
21171 #if defined(__arm64__)
21172 	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21173 #else
21174 	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21175 #endif
21176 }
21177 
21178 
21179 void
21180 vm_map_set_64bit(vm_map_t map)
21181 {
21182 #if defined(__arm64__)
21183 	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21184 #else
21185 	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21186 #endif
21187 }
21188 
21189 /*
21190  * Expand the maximum size of an existing map to 64GB.
21191  */
21192 void
21193 vm_map_set_jumbo(vm_map_t map)
21194 {
21195 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21196 	vm_map_set_max_addr(map, ~0, false);
21197 #else /* arm64 */
21198 	(void) map;
21199 #endif
21200 }
21201 
21202 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21203 /*
21204  * Expand the maximum size of an existing map to the maximum supported.
21205  */
21206 void
21207 vm_map_set_extra_jumbo(vm_map_t map)
21208 {
21209 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21210 	vm_map_set_max_addr(map, ~0, true);
21211 #else /* arm64 */
21212 	(void) map;
21213 #endif
21214 }
21215 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21216 
21217 /*
21218  * This map has a JIT entitlement
21219  */
21220 void
21221 vm_map_set_jit_entitled(vm_map_t map)
21222 {
21223 #if defined (__arm64__)
21224 	pmap_set_jit_entitled(map->pmap);
21225 #else /* arm64 */
21226 	(void) map;
21227 #endif
21228 }
21229 
21230 /*
21231  * Get the status of this map's TPRO flag
21232  */
21233 boolean_t
21234 vm_map_tpro(vm_map_t map)
21235 {
21236 #if defined (__arm64e__)
21237 	return pmap_get_tpro(map->pmap);
21238 #else /* arm64e */
21239 	(void) map;
21240 	return FALSE;
21241 #endif
21242 }
21243 
21244 /*
21245  * This map has TPRO enabled
21246  */
21247 void
21248 vm_map_set_tpro(vm_map_t map)
21249 {
21250 #if defined (__arm64e__)
21251 	pmap_set_tpro(map->pmap);
21252 #else /* arm64e */
21253 	(void) map;
21254 #endif
21255 }
21256 
21257 /*
21258  * Does this map have TPRO enforcement enabled
21259  */
21260 boolean_t
21261 vm_map_tpro_enforcement(vm_map_t map)
21262 {
21263 	return map->tpro_enforcement;
21264 }
21265 
21266 /*
21267  * Set TPRO enforcement for this map
21268  */
21269 void
21270 vm_map_set_tpro_enforcement(vm_map_t map)
21271 {
21272 	if (vm_map_tpro(map)) {
21273 		vm_map_lock(map);
21274 		map->tpro_enforcement = TRUE;
21275 		vm_map_unlock(map);
21276 	}
21277 }
21278 
21279 /*
21280  * Enable TPRO on the requested region
21281  *
21282  * Note:
21283  *     This routine is primarily intended to be called during/soon after map
21284  *     creation before the associated task has been released to run. It is only
21285  *     currently safe when we have no resident pages.
21286  */
21287 boolean_t
21288 vm_map_set_tpro_range(
21289 	__unused vm_map_t map,
21290 	__unused vm_map_address_t start,
21291 	__unused vm_map_address_t end)
21292 {
21293 	return TRUE;
21294 }
21295 
21296 /*
21297  * Expand the maximum size of an existing map.
21298  */
21299 void
21300 vm_map_set_max_addr(
21301 	vm_map_t map,
21302 	vm_map_offset_t new_max_offset,
21303 	__unused bool extra_jumbo)
21304 {
21305 #if defined(__arm64__)
21306 	vm_map_offset_t max_supported_offset;
21307 	vm_map_offset_t old_max_offset;
21308 	unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21309 
21310 	vm_map_lock(map);
21311 
21312 	old_max_offset = map->max_offset;
21313 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21314 	if (extra_jumbo) {
21315 		option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21316 	}
21317 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21318 	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21319 
21320 	new_max_offset = trunc_page(new_max_offset);
21321 
21322 	/* The address space cannot be shrunk using this routine. */
21323 	if (old_max_offset >= new_max_offset) {
21324 		vm_map_unlock(map);
21325 		return;
21326 	}
21327 
21328 	if (max_supported_offset < new_max_offset) {
21329 		new_max_offset = max_supported_offset;
21330 	}
21331 
21332 	map->max_offset = new_max_offset;
21333 
21334 	/*
21335 	 * Disable the following chunk of code that extends the "holes" list
21336 	 * to accommodate a larger VM map.
21337 	 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21338 	 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21339 	 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21340 	 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21341 	 * The "holes" list does not need to be adjusted.
21342 	 */
21343 #if 0
21344 	if (map->holelistenabled) {
21345 		if (map->holes_list->prev->vme_end == old_max_offset) {
21346 			/*
21347 			 * There is already a hole at the end of the map; simply make it bigger.
21348 			 */
21349 			map->holes_list->prev->vme_end = map->max_offset;
21350 		} else {
21351 			/*
21352 			 * There is no hole at the end, so we need to create a new hole
21353 			 * for the new empty space we're creating.
21354 			 */
21355 			struct vm_map_links *new_hole;
21356 
21357 			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21358 			new_hole->start = old_max_offset;
21359 			new_hole->end = map->max_offset;
21360 			new_hole->prev = map->holes_list->prev;
21361 			new_hole->next = (struct vm_map_entry *)map->holes_list;
21362 			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21363 			map->holes_list->prev = (struct vm_map_entry *)new_hole;
21364 		}
21365 	}
21366 #endif
21367 
21368 	vm_map_unlock(map);
21369 #else
21370 	(void)map;
21371 	(void)new_max_offset;
21372 #endif
21373 }
21374 
21375 vm_map_offset_t
21376 vm_compute_max_offset(boolean_t is64)
21377 {
21378 #if defined(__arm64__)
21379 	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21380 #else
21381 	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21382 #endif
21383 }
21384 
21385 void
21386 vm_map_get_max_aslr_slide_section(
21387 	vm_map_t                map __unused,
21388 	int64_t                 *max_sections,
21389 	int64_t                 *section_size)
21390 {
21391 #if defined(__arm64__)
21392 	*max_sections = 3;
21393 	*section_size = ARM_TT_TWIG_SIZE;
21394 #else
21395 	*max_sections = 1;
21396 	*section_size = 0;
21397 #endif
21398 }
21399 
21400 uint64_t
21401 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21402 {
21403 #if defined(__arm64__)
21404 	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21405 	 * limited embedded address space; this is also meant to minimize pmap
21406 	 * memory usage on 16KB page systems.
21407 	 */
21408 	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21409 #else
21410 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21411 #endif
21412 }
21413 
21414 uint64_t
21415 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21416 {
21417 #if defined(__arm64__)
21418 	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21419 	 * of independent entropy on 16KB page systems.
21420 	 */
21421 	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21422 #else
21423 	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21424 #endif
21425 }
21426 
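/*
 * Worked example of the two limits above (illustrative only): on an arm64
 * map using 16KB pages, VM_MAP_PAGE_SHIFT(map) == 14, so
 *     vm_map_get_max_aslr_slide_pages()        == 1 << (24 - 14) == 1024 pages == 16MB
 *     vm_map_get_max_loader_aslr_slide_pages() == 1 << (22 - 14) ==  256 pages ==  4MB
 * which matches the 16MB and 4MB limits described in the comments above.
 */
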
21427 boolean_t
21428 vm_map_is_64bit(
21429 	vm_map_t map)
21430 {
21431 	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21432 }
21433 
21434 boolean_t
21435 vm_map_has_hard_pagezero(
21436 	vm_map_t        map,
21437 	vm_map_offset_t pagezero_size)
21438 {
21439 	/*
21440 	 * XXX FBDP
21441 	 * We should lock the VM map (for read) here but we can get away
21442 	 * with it for now because there can't really be any race condition:
21443 	 * the VM map's min_offset is changed only when the VM map is created
21444 	 * and when the zero page is established (when the binary gets loaded),
21445 	 * and this routine gets called only when the task terminates and the
21446 	 * VM map is being torn down, and when a new map is created via
21447 	 * load_machfile()/execve().
21448 	 */
21449 	return map->min_offset >= pagezero_size;
21450 }
21451 
21452 /*
21453  * Raise a VM map's maximum offset.
21454  */
21455 kern_return_t
21456 vm_map_raise_max_offset(
21457 	vm_map_t        map,
21458 	vm_map_offset_t new_max_offset)
21459 {
21460 	kern_return_t   ret;
21461 
21462 	vm_map_lock(map);
21463 	ret = KERN_INVALID_ADDRESS;
21464 
21465 	if (new_max_offset >= map->max_offset) {
21466 		if (!vm_map_is_64bit(map)) {
21467 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21468 				map->max_offset = new_max_offset;
21469 				ret = KERN_SUCCESS;
21470 			}
21471 		} else {
21472 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21473 				map->max_offset = new_max_offset;
21474 				ret = KERN_SUCCESS;
21475 			}
21476 		}
21477 	}
21478 
21479 	vm_map_unlock(map);
21480 	return ret;
21481 }
21482 
21483 
21484 /*
21485  * Raise a VM map's minimum offset.
21486  * To strictly enforce "page zero" reservation.
21487  */
21488 kern_return_t
21489 vm_map_raise_min_offset(
21490 	vm_map_t        map,
21491 	vm_map_offset_t new_min_offset)
21492 {
21493 	vm_map_entry_t  first_entry;
21494 
21495 	new_min_offset = vm_map_round_page(new_min_offset,
21496 	    VM_MAP_PAGE_MASK(map));
21497 
21498 	vm_map_lock(map);
21499 
21500 	if (new_min_offset < map->min_offset) {
21501 		/*
21502 		 * Can't move min_offset backwards, as that would expose
21503 		 * a part of the address space that was previously, and for
21504 		 * possibly good reasons, inaccessible.
21505 		 */
21506 		vm_map_unlock(map);
21507 		return KERN_INVALID_ADDRESS;
21508 	}
21509 	if (new_min_offset >= map->max_offset) {
21510 		/* can't go beyond the end of the address space */
21511 		vm_map_unlock(map);
21512 		return KERN_INVALID_ADDRESS;
21513 	}
21514 
21515 	first_entry = vm_map_first_entry(map);
21516 	if (first_entry != vm_map_to_entry(map) &&
21517 	    first_entry->vme_start < new_min_offset) {
21518 		/*
21519 		 * Some memory was already allocated below the new
21520 		 * minimum offset.  It's too late to change it now...
21521 		 */
21522 		vm_map_unlock(map);
21523 		return KERN_NO_SPACE;
21524 	}
21525 
21526 	map->min_offset = new_min_offset;
21527 
21528 	if (map->holelistenabled) {
21529 		assert(map->holes_list);
21530 		map->holes_list->start = new_min_offset;
21531 		assert(new_min_offset < map->holes_list->end);
21532 	}
21533 
21534 	vm_map_unlock(map);
21535 
21536 	return KERN_SUCCESS;
21537 }
21538 
21539 /*
21540  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21541  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21542  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21543  * have to reach over to the BSD data structures.
21544  */
21545 
21546 uint64_t vm_map_set_size_limit_count = 0;
21547 kern_return_t
21548 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21549 {
21550 	kern_return_t kr;
21551 
21552 	vm_map_lock(map);
21553 	if (new_size_limit < map->size) {
21554 		/* new limit should not be lower than its current size */
21555 		DTRACE_VM2(vm_map_set_size_limit_fail,
21556 		    vm_map_size_t, map->size,
21557 		    uint64_t, new_size_limit);
21558 		kr = KERN_FAILURE;
21559 	} else if (new_size_limit == map->size_limit) {
21560 		/* no change */
21561 		kr = KERN_SUCCESS;
21562 	} else {
21563 		/* set new limit */
21564 		DTRACE_VM2(vm_map_set_size_limit,
21565 		    vm_map_size_t, map->size,
21566 		    uint64_t, new_size_limit);
21567 		if (new_size_limit != RLIM_INFINITY) {
21568 			vm_map_set_size_limit_count++;
21569 		}
21570 		map->size_limit = new_size_limit;
21571 		kr = KERN_SUCCESS;
21572 	}
21573 	vm_map_unlock(map);
21574 	return kr;
21575 }
21576 
21577 uint64_t vm_map_set_data_limit_count = 0;
21578 kern_return_t
21579 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21580 {
21581 	kern_return_t kr;
21582 
21583 	vm_map_lock(map);
21584 	if (new_data_limit < map->size) {
21585 		/* new limit should not be lower than its current size */
21586 		DTRACE_VM2(vm_map_set_data_limit_fail,
21587 		    vm_map_size_t, map->size,
21588 		    uint64_t, new_data_limit);
21589 		kr = KERN_FAILURE;
21590 	} else if (new_data_limit == map->data_limit) {
21591 		/* no change */
21592 		kr = KERN_SUCCESS;
21593 	} else {
21594 		/* set new limit */
21595 		DTRACE_VM2(vm_map_set_data_limit,
21596 		    vm_map_size_t, map->size,
21597 		    uint64_t, new_data_limit);
21598 		if (new_data_limit != RLIM_INFINITY) {
21599 			vm_map_set_data_limit_count++;
21600 		}
21601 		map->data_limit = new_data_limit;
21602 		kr = KERN_SUCCESS;
21603 	}
21604 	vm_map_unlock(map);
21605 	return kr;
21606 }
21607 
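/*
 * Illustrative sketch (not compiled) of how the BSD side might mirror an
 * RLIMIT_AS-style value into the Mach VM map using the setters above; the
 * calling function and how the rlimit value is obtained are hypothetical.
 */
#if 0
static void
example_mirror_address_space_rlimit(vm_map_t map, uint64_t rlim_cur)
{
	kern_return_t kr;

	/* RLIM_INFINITY means "unlimited"; the setter refuses to go below map->size */
	kr = vm_map_set_size_limit(map, rlim_cur);
	if (kr != KERN_SUCCESS) {
		/* the requested limit was below the map's current size */
	}
}
#endif
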
21608 void
21609 vm_map_set_user_wire_limit(vm_map_t     map,
21610     vm_size_t    limit)
21611 {
21612 	vm_map_lock(map);
21613 	map->user_wire_limit = limit;
21614 	vm_map_unlock(map);
21615 }
21616 
21617 
21618 void
21619 vm_map_switch_protect(vm_map_t     map,
21620     boolean_t    val)
21621 {
21622 	vm_map_lock(map);
21623 	map->switch_protect = val;
21624 	vm_map_unlock(map);
21625 }
21626 
21627 extern int cs_process_enforcement_enable;
21628 boolean_t
21629 vm_map_cs_enforcement(
21630 	vm_map_t map)
21631 {
21632 	if (cs_process_enforcement_enable) {
21633 		return TRUE;
21634 	}
21635 	return map->cs_enforcement;
21636 }
21637 
21638 kern_return_t
21639 vm_map_cs_wx_enable(
21640 	__unused vm_map_t map)
21641 {
21642 #if CODE_SIGNING_MONITOR
21643 	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21644 	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21645 		return KERN_SUCCESS;
21646 	}
21647 	return ret;
21648 #else
21649 	/* The VM manages WX memory entirely on its own */
21650 	return KERN_SUCCESS;
21651 #endif
21652 }
21653 
21654 kern_return_t
21655 vm_map_csm_allow_jit(
21656 	__unused vm_map_t map)
21657 {
21658 #if CODE_SIGNING_MONITOR
21659 	return csm_allow_jit_region(vm_map_pmap(map));
21660 #else
21661 	/* No code signing monitor to enforce JIT policy */
21662 	return KERN_SUCCESS;
21663 #endif
21664 }
21665 
21666 void
21667 vm_map_cs_debugged_set(
21668 	vm_map_t map,
21669 	boolean_t val)
21670 {
21671 	vm_map_lock(map);
21672 	map->cs_debugged = val;
21673 	vm_map_unlock(map);
21674 }
21675 
21676 void
21677 vm_map_cs_enforcement_set(
21678 	vm_map_t map,
21679 	boolean_t val)
21680 {
21681 	vm_map_lock(map);
21682 	map->cs_enforcement = val;
21683 	pmap_set_vm_map_cs_enforced(map->pmap, val);
21684 	vm_map_unlock(map);
21685 }
21686 
21687 /*
21688  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21689  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21690  * bump both counters.
21691  */
21692 void
21693 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21694 {
21695 	pmap_t pmap = vm_map_pmap(map);
21696 
21697 	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21698 	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21699 }
21700 
21701 void
21702 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21703 {
21704 	pmap_t pmap = vm_map_pmap(map);
21705 
21706 	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21707 	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21708 }
21709 
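/*
 * Illustrative pairing (not compiled): the two ledger adjustments above are
 * expected to balance over the lifetime of an IOKit mapping, assuming a
 * hypothetical caller that maps and later unmaps the same number of bytes.
 */
#if 0
vm_map_iokit_mapped_region(map, bytes);         /* when the IOKit mapping is created */
/* ... mapping lives here ... */
vm_map_iokit_unmapped_region(map, bytes);       /* when the IOKit mapping is torn down */
#endif
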
21710 /* Add (generate) code signature for memory range */
21711 #if CONFIG_DYNAMIC_CODE_SIGNING
21712 kern_return_t
21713 vm_map_sign(vm_map_t map,
21714     vm_map_offset_t start,
21715     vm_map_offset_t end)
21716 {
21717 	vm_map_entry_t entry;
21718 	vm_page_t m;
21719 	vm_object_t object;
21720 
21721 	/*
21722 	 * Vet all the input parameters and current type and state of the
21723 	 * underlying object.  Return with an error if anything is amiss.
21724 	 */
21725 	if (map == VM_MAP_NULL) {
21726 		return KERN_INVALID_ARGUMENT;
21727 	}
21728 
21729 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21730 		return KERN_INVALID_ADDRESS;
21731 	}
21732 
21733 	vm_map_lock_read(map);
21734 
21735 	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21736 		/*
21737 		 * Must pass a valid non-submap address.
21738 		 */
21739 		vm_map_unlock_read(map);
21740 		return KERN_INVALID_ADDRESS;
21741 	}
21742 
21743 	if ((entry->vme_start > start) || (entry->vme_end < end)) {
21744 		/*
21745 		 * Map entry doesn't cover the requested range. Not handling
21746 		 * this situation currently.
21747 		 */
21748 		vm_map_unlock_read(map);
21749 		return KERN_INVALID_ARGUMENT;
21750 	}
21751 
21752 	object = VME_OBJECT(entry);
21753 	if (object == VM_OBJECT_NULL) {
21754 		/*
21755 		 * Object must already be present or we can't sign.
21756 		 */
21757 		vm_map_unlock_read(map);
21758 		return KERN_INVALID_ARGUMENT;
21759 	}
21760 
21761 	vm_object_lock(object);
21762 	vm_map_unlock_read(map);
21763 
21764 	while (start < end) {
21765 		uint32_t refmod;
21766 
21767 		m = vm_page_lookup(object,
21768 		    start - entry->vme_start + VME_OFFSET(entry));
21769 		if (m == VM_PAGE_NULL) {
21770 			/* should we try to fault a page here? we can probably
21771 			 * demand it exists and is locked for this request */
21772 			vm_object_unlock(object);
21773 			return KERN_FAILURE;
21774 		}
21775 		/* deal with special page status */
21776 		if (m->vmp_busy ||
21777 		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
21778 			vm_object_unlock(object);
21779 			return KERN_FAILURE;
21780 		}
21781 
21782 		/* Page is OK... now "validate" it */
21783 		/* This is the place where we'll call out to create a code
21784 		 * directory, later */
21785 		/* XXX TODO4K: deal with 4k subpages individually? */
21786 		m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21787 
21788 		/* The page is now "clean" for codesigning purposes. That means
21789 		 * we don't consider it as modified (wpmapped) anymore. But
21790 		 * we'll disconnect the page so we note any future modification
21791 		 * attempts. */
21792 		m->vmp_wpmapped = FALSE;
21793 		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
21794 
21795 		/* Pull the dirty status from the pmap, since we cleared the
21796 		 * wpmapped bit */
21797 		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
21798 			SET_PAGE_DIRTY(m, FALSE);
21799 		}
21800 
21801 		/* On to the next page */
21802 		start += PAGE_SIZE;
21803 	}
21804 	vm_object_unlock(object);
21805 
21806 	return KERN_SUCCESS;
21807 }
21808 #endif
21809 
21810 kern_return_t
21811 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
21812 {
21813 	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
21814 	vm_map_entry_t  next_entry;
21815 	kern_return_t   kr = KERN_SUCCESS;
21816 	VM_MAP_ZAP_DECLARE(zap_list);
21817 
21818 	vm_map_lock(map);
21819 
21820 	for (entry = vm_map_first_entry(map);
21821 	    entry != vm_map_to_entry(map);
21822 	    entry = next_entry) {
21823 		next_entry = entry->vme_next;
21824 
21825 		if (!entry->is_sub_map &&
21826 		    VME_OBJECT(entry) &&
21827 		    (VME_OBJECT(entry)->internal == TRUE) &&
21828 		    (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
21829 			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
21830 			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
21831 
21832 			(void)vm_map_delete(map, entry->vme_start,
21833 			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
21834 			    KMEM_GUARD_NONE, &zap_list);
21835 		}
21836 	}
21837 
21838 	vm_map_unlock(map);
21839 
21840 	vm_map_zap_dispose(&zap_list);
21841 
21842 	return kr;
21843 }
21844 
21845 
21846 #if DEVELOPMENT || DEBUG
21847 
21848 int
21849 vm_map_disconnect_page_mappings(
21850 	vm_map_t map,
21851 	boolean_t do_unnest)
21852 {
21853 	vm_map_entry_t entry;
21854 	ledger_amount_t byte_count = 0;
21855 
21856 	if (do_unnest == TRUE) {
21857 #ifndef NO_NESTED_PMAP
21858 		vm_map_lock(map);
21859 
21860 		for (entry = vm_map_first_entry(map);
21861 		    entry != vm_map_to_entry(map);
21862 		    entry = entry->vme_next) {
21863 			if (entry->is_sub_map && entry->use_pmap) {
21864 				/*
21865 				 * Make sure the range between the start of this entry and
21866 				 * the end of this entry is no longer nested, so that
21867 				 * we will only remove mappings from the pmap in use by
21868 				 * this task
21869 				 */
21870 				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
21871 			}
21872 		}
21873 		vm_map_unlock(map);
21874 #endif
21875 	}
21876 	vm_map_lock_read(map);
21877 
21878 	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
21879 
21880 	for (entry = vm_map_first_entry(map);
21881 	    entry != vm_map_to_entry(map);
21882 	    entry = entry->vme_next) {
21883 		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
21884 		    (VME_OBJECT(entry)->phys_contiguous))) {
21885 			continue;
21886 		}
21887 		if (entry->is_sub_map) {
21888 			assert(!entry->use_pmap);
21889 		}
21890 
21891 		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
21892 	}
21893 	vm_map_unlock_read(map);
21894 
21895 	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
21896 }
21897 
21898 kern_return_t
21899 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
21900 {
21901 	vm_object_t object = NULL;
21902 	vm_object_offset_t offset;
21903 	vm_prot_t prot;
21904 	boolean_t wired;
21905 	vm_map_version_t version;
21906 	vm_map_t real_map;
21907 	int result = KERN_FAILURE;
21908 
21909 	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
21910 	vm_map_lock(map);
21911 
21912 	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
21913 	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
21914 	    NULL, &real_map, NULL);
21915 	if (object == NULL) {
21916 		result = KERN_MEMORY_ERROR;
21917 	} else if (object->pager) {
21918 		result = vm_compressor_pager_inject_error(object->pager,
21919 		    offset);
21920 	} else {
21921 		result = KERN_MEMORY_PRESENT;
21922 	}
21923 
21924 	if (object != NULL) {
21925 		vm_object_unlock(object);
21926 	}
21927 
21928 	if (real_map != map) {
21929 		vm_map_unlock(real_map);
21930 	}
21931 	vm_map_unlock(map);
21932 
21933 	return result;
21934 }
21935 
21936 /* Iterate over map entries. Call the first block argument with the number of entries and the second for every entry.
21937  * returns: KERN_SUCCESS if iteration completed ok,
21938  *      error code if a callback returned an error,
21939  *      KERN_FAILURE if entries were added or removed during the iteration, so the number of entries
21940  *      iterated differs from the count reported in the first call
21941  */
21942 static kern_return_t
21943 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21944     kern_return_t (^entry_handler)(void* entry))
21945 {
21946 	vm_map_lock_assert_held(map);
21947 	int nentries = map->hdr.nentries;
21948 	kern_return_t error = count_handler(nentries);
21949 	if (error) {
21950 		return error;
21951 	}
21952 
21953 	/* iterate until we loop back to the map, see get_vmmap_entries() */
21954 	vm_map_entry_t entry = vm_map_first_entry(map);
21955 	int count = 0;
21956 	while (entry != vm_map_to_entry(map)) {
21957 		error = entry_handler(entry);
21958 		if (error != KERN_SUCCESS) {
21959 			return error;
21960 		}
21961 		entry = entry->vme_next;
21962 		++count;
21963 		if (count > nentries) {
21964 			/* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
21965 			return KERN_FAILURE;
21966 		}
21967 	}
21968 	if (count < nentries) {
21969 		return KERN_FAILURE;
21970 	}
21971 	return KERN_SUCCESS;
21972 }
21973 
21974 kern_return_t
21975 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21976     kern_return_t (^entry_handler)(void* entry))
21977 {
21978 	vm_map_lock_read(map);
21979 	kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
21980 	vm_map_unlock_read(map);
21981 	return error;
21982 }
21983 
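/*
 * Illustrative sketch (not compiled): using vm_map_entries_foreach() to
 * count submap entries.  The helper and its counting logic are hypothetical;
 * only the foreach API itself comes from above.
 */
#if 0
static kern_return_t
example_count_submap_entries(vm_map_t map, int *out_count)
{
	__block int submaps = 0;
	kern_return_t kr;

	kr = vm_map_entries_foreach(map,
	    ^(int nentries) {
		(void)nentries;         /* could pre-size a buffer of nentries records here */
		return KERN_SUCCESS;
	},
	    ^(void *ventry) {
		vm_map_entry_t entry = (vm_map_entry_t)ventry;

		if (entry->is_sub_map) {
			submaps++;
		}
		return KERN_SUCCESS;
	});
	if (kr == KERN_SUCCESS) {
		*out_count = submaps;
	}
	return kr;
}
#endif
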
21984 /*
21985  * Dump info about the entry into the given buffer.
21986  * Returns KERN_SUCCESS on success, KERN_NO_SPACE if there was not enough space in the given buffer.
21987  * The size argument is, on input, the bytes free in the given buffer and, on output, the bytes written.
21988  */
21989 kern_return_t
21990 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
21991 {
21992 	size_t insize = *size;
21993 	kern_return_t kr;
21994 	size_t offset = 0;
21995 
21996 	*size = 0;
21997 	if (sizeof(struct vm_map_entry_info) > insize) {
21998 		return KERN_NO_SPACE;
21999 	}
22000 
22001 	vm_map_entry_t entry = (vm_map_entry_t)pentry;
22002 	struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
22003 	out_entry->vmei_start = entry->vme_start;
22004 	out_entry->vmei_end = entry->vme_end;
22005 	out_entry->vmei_alias = VME_ALIAS(entry);
22006 	out_entry->vmei_offset = VME_OFFSET(entry);
22007 	out_entry->vmei_is_sub_map = entry->is_sub_map;
22008 	out_entry->vmei_protection = entry->protection;
22009 	offset += sizeof(struct vm_map_entry_info);
22010 
22011 	out_entry->vmei_slot_mapping_count = 0;
22012 	out_entry->vmei_is_compressor_pager = false;
22013 	*size = offset;
22014 	if (out_entry->vmei_is_sub_map) {
22015 		return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22016 	}
22017 	/* have a vm_object? */
22018 	vm_object_t object = VME_OBJECT(entry);
22019 	if (object == VM_OBJECT_NULL || !object->internal) {
22020 		return KERN_SUCCESS;
22021 	}
22022 	/* does the object have a pager? */
22023 	memory_object_t pager = object->pager;
22024 	if (pager == MEMORY_OBJECT_NULL) {
22025 		return KERN_SUCCESS;
22026 	}
22027 	bool is_compressor = false;
22028 	unsigned int slot_mapping_count = 0;
22029 	size_t pager_info_size = insize - offset;
22030 	kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22031 	if (kr != KERN_SUCCESS) {
22032 		/* didn't have enough space for everything we want to write, caller needs to retry */
22033 		return kr;
22034 	}
22035 	offset += pager_info_size;
22036 	/* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22037 	 * is just for sanity's sake */
22038 	out_entry->vmei_is_compressor_pager = is_compressor;
22039 	out_entry->vmei_slot_mapping_count = slot_mapping_count;
22040 	*size = offset;
22041 	return KERN_SUCCESS;
22042 }
22043 
22044 
22045 #endif
22046 
22047 
22048 #if CONFIG_FREEZE
22049 
22050 
22051 extern struct freezer_context freezer_context_global;
22052 AbsoluteTime c_freezer_last_yield_ts = 0;
22053 
22054 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22055 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22056 
22057 kern_return_t
22058 vm_map_freeze(
22059 	task_t       task,
22060 	unsigned int *purgeable_count,
22061 	unsigned int *wired_count,
22062 	unsigned int *clean_count,
22063 	unsigned int *dirty_count,
22064 	unsigned int dirty_budget,
22065 	unsigned int *shared_count,
22066 	int          *freezer_error_code,
22067 	boolean_t    eval_only)
22068 {
22069 	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
22070 	kern_return_t   kr = KERN_SUCCESS;
22071 	boolean_t       evaluation_phase = TRUE;
22072 	vm_object_t     cur_shared_object = NULL;
22073 	int             cur_shared_obj_ref_cnt = 0;
22074 	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
22075 
22076 	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
22077 
22078 	/*
22079 	 * We need the exclusive lock here so that we can
22080 	 * block any page faults or lookups while we are
22081 	 * in the middle of freezing this vm map.
22082 	 */
22083 	vm_map_t map = task->map;
22084 
22085 	vm_map_lock(map);
22086 
22087 	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
22088 
22089 	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22090 		if (vm_compressor_low_on_space()) {
22091 			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22092 		}
22093 
22094 		if (vm_swap_low_on_space()) {
22095 			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22096 		}
22097 
22098 		kr = KERN_NO_SPACE;
22099 		goto done;
22100 	}
22101 
22102 	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
22103 		/*
22104 		 * In-memory compressor backing the freezer. No disk.
22105 		 * So no need to do the evaluation phase.
22106 		 */
22107 		evaluation_phase = FALSE;
22108 
22109 		if (eval_only == TRUE) {
22110 			/*
22111 			 * We don't support 'eval_only' mode
22112 			 * in this non-swap config.
22113 			 */
22114 			*freezer_error_code = FREEZER_ERROR_GENERIC;
22115 			kr = KERN_INVALID_ARGUMENT;
22116 			goto done;
22117 		}
22118 
22119 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22120 		clock_get_uptime(&c_freezer_last_yield_ts);
22121 	}
22122 again:
22123 
22124 	for (entry2 = vm_map_first_entry(map);
22125 	    entry2 != vm_map_to_entry(map);
22126 	    entry2 = entry2->vme_next) {
22127 		vm_object_t src_object;
22128 
22129 		if (entry2->is_sub_map) {
22130 			continue;
22131 		}
22132 
22133 		src_object = VME_OBJECT(entry2);
22134 		if (!src_object ||
22135 		    src_object->phys_contiguous ||
22136 		    !src_object->internal) {
22137 			continue;
22138 		}
22139 
22140 		/* If eligible, scan the entry, moving eligible pages over to our parent object */
22141 
22142 		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
22143 			/*
22144 			 * We skip purgeable objects during evaluation phase only.
22145 			 * If we decide to freeze this process, we'll explicitly
22146 			 * purge these objects before we go around again with
22147 			 * 'evaluation_phase' set to FALSE.
22148 			 */
22149 
22150 			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
22151 				/*
22152 				 * We want to purge objects that may not belong to this task but are mapped
22153 				 * in this task alone. Since we already purged this task's purgeable memory
22154 				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
22155 				 * on this task's purgeable objects. Hence the check for only volatile objects.
22156 				 */
22157 				if (evaluation_phase ||
22158 				    src_object->purgable != VM_PURGABLE_VOLATILE ||
22159 				    os_ref_get_count_raw(&src_object->ref_count) != 1) {
22160 					continue;
22161 				}
22162 				vm_object_lock(src_object);
22163 				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
22164 				    os_ref_get_count_raw(&src_object->ref_count) == 1) {
22165 					purgeable_q_t old_queue;
22166 
22167 					/* object should be on a purgeable queue */
22168 					assert(src_object->objq.next != NULL &&
22169 					    src_object->objq.prev != NULL);
22170 					/* move object from its volatile queue to the nonvolatile queue */
22171 					old_queue = vm_purgeable_object_remove(src_object);
22172 					assert(old_queue);
22173 					if (src_object->purgeable_when_ripe) {
22174 						/* remove a token from that volatile queue */
22175 						vm_page_lock_queues();
22176 						vm_purgeable_token_delete_first(old_queue);
22177 						vm_page_unlock_queues();
22178 					}
22179 					/* purge the object */
22180 					vm_object_purge(src_object, 0);
22181 				}
22182 				vm_object_unlock(src_object);
22183 				continue;
22184 			}
22185 
22186 			/*
22187 			 * Pages belonging to this object could be swapped to disk.
22188 			 * Make sure it's not a shared object because we could end
22189 			 * up just bringing it back in again.
22190 			 *
22191 			 * We try to optimize somewhat by checking for objects that are mapped
22192 			 * more than once within our own map. But we don't do full searches,
22193 			 * we just look at the entries following our current entry.
22194 			 */
22195 
22196 			if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
22197 				if (src_object != cur_shared_object) {
22198 					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22199 					dirty_shared_count += obj_pages_snapshot;
22200 
22201 					cur_shared_object = src_object;
22202 					cur_shared_obj_ref_cnt = 1;
22203 					continue;
22204 				} else {
22205 					cur_shared_obj_ref_cnt++;
22206 					if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
22207 						/*
22208 						 * Fall through to below and treat this object as private.
22209 						 * So deduct its pages from our shared total and add it to the
22210 						 * private total.
22211 						 */
22212 
22213 						dirty_shared_count -= obj_pages_snapshot;
22214 						dirty_private_count += obj_pages_snapshot;
22215 					} else {
22216 						continue;
22217 					}
22218 				}
22219 			}
22220 
22221 
22222 			if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
22223 				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22224 			}
22225 
22226 			if (evaluation_phase == TRUE) {
22227 				continue;
22228 			}
22229 		}
22230 
22231 		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
22232 		*wired_count += src_object->wired_page_count;
22233 
22234 		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22235 			if (vm_compressor_low_on_space()) {
22236 				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22237 			}
22238 
22239 			if (vm_swap_low_on_space()) {
22240 				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22241 			}
22242 
22243 			kr = KERN_NO_SPACE;
22244 			break;
22245 		}
22246 		if (paged_out_count >= dirty_budget) {
22247 			break;
22248 		}
22249 		dirty_budget -= paged_out_count;
22250 	}
22251 
22252 	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
22253 	if (evaluation_phase) {
22254 		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
22255 
22256 		if (dirty_shared_count > shared_pages_threshold) {
22257 			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
22258 			kr = KERN_FAILURE;
22259 			goto done;
22260 		}
22261 
22262 		if (dirty_shared_count &&
22263 		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
22264 			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
22265 			kr = KERN_FAILURE;
22266 			goto done;
22267 		}
22268 
22269 		evaluation_phase = FALSE;
22270 		dirty_shared_count = dirty_private_count = 0;
22271 
22272 		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22273 		clock_get_uptime(&c_freezer_last_yield_ts);
22274 
22275 		if (eval_only) {
22276 			kr = KERN_SUCCESS;
22277 			goto done;
22278 		}
22279 
22280 		vm_purgeable_purge_task_owned(task);
22281 
22282 		goto again;
22283 	} else {
22284 		kr = KERN_SUCCESS;
22285 	}
22286 
22287 done:
22288 	vm_map_unlock(map);
22289 
22290 	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
22291 		vm_object_compressed_freezer_done();
22292 	}
22293 	return kr;
22294 }
22295 
22296 #endif
22297 
22298 /*
22299  * vm_map_entry_should_cow_for_true_share:
22300  *
22301  * Determines if the map entry should be clipped and setup for copy-on-write
22302  * to avoid applying "true_share" to a large VM object when only a subset is
22303  * targeted.
22304  *
22305  * For now, we target only the map entries created for the Objective C
22306  * Garbage Collector, which initially have the following properties:
22307  *	- alias == VM_MEMORY_MALLOC
22308  *      - wired_count == 0
22309  *      - !needs_copy
22310  * and a VM object with:
22311  *      - internal
22312  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22313  *      - !true_share
22314  *      - vo_size == ANON_CHUNK_SIZE
22315  *
22316  * Only non-kernel map entries.
22317  */
22318 boolean_t
22319 vm_map_entry_should_cow_for_true_share(
22320 	vm_map_entry_t  entry)
22321 {
22322 	vm_object_t     object;
22323 
22324 	if (entry->is_sub_map) {
22325 		/* entry does not point at a VM object */
22326 		return FALSE;
22327 	}
22328 
22329 	if (entry->needs_copy) {
22330 		/* already set for copy_on_write: done! */
22331 		return FALSE;
22332 	}
22333 
22334 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22335 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22336 		/* not a malloc heap or Obj-C Garbage Collector heap */
22337 		return FALSE;
22338 	}
22339 
22340 	if (entry->wired_count) {
22341 		/* wired: can't change the map entry... */
22342 		vm_counters.should_cow_but_wired++;
22343 		return FALSE;
22344 	}
22345 
22346 	object = VME_OBJECT(entry);
22347 
22348 	if (object == VM_OBJECT_NULL) {
22349 		/* no object yet... */
22350 		return FALSE;
22351 	}
22352 
22353 	if (!object->internal) {
22354 		/* not an internal object */
22355 		return FALSE;
22356 	}
22357 
22358 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22359 		/* not the default copy strategy */
22360 		return FALSE;
22361 	}
22362 
22363 	if (object->true_share) {
22364 		/* already true_share: too late to avoid it */
22365 		return FALSE;
22366 	}
22367 
22368 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22369 	    object->vo_size != ANON_CHUNK_SIZE) {
22370 		/* ... not an object created for the ObjC Garbage Collector */
22371 		return FALSE;
22372 	}
22373 
22374 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22375 	    object->vo_size != 2048 * 4096) {
22376 		/* ... not a "MALLOC_SMALL" heap */
22377 		return FALSE;
22378 	}
22379 
22380 	/*
22381 	 * All the criteria match: we have a large object being targeted for "true_share".
22382 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
22383 	 * try and avoid setting up the entire object for "true_share" by clipping the
22384 	 * targeted range and setting it up for copy-on-write.
22385 	 */
22386 	return TRUE;
22387 }
22388 
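
/*
 * Illustrative sketch (not compiled) of the caller pattern described above:
 * clip the map entry down to just the targeted range and mark it for
 * copy-on-write instead of letting the whole object become "true_share".
 * Assumes the map is locked for writing; `start` and `end` are hypothetical
 * page-aligned bounds of the targeted range.
 */
#if 0
if (vm_map_entry_should_cow_for_true_share(entry) &&
    VME_OBJECT(entry)->vo_size > end - start) {
	vm_map_clip_start(map, entry, start);
	vm_map_clip_end(map, entry, end);
	entry->needs_copy = TRUE;
}
#endif
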
22389 uint64_t vm_map_range_overflows_count = 0;
22390 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22391 bool
22392 vm_map_range_overflows(
22393 	vm_map_t map,
22394 	vm_map_offset_t addr,
22395 	vm_map_size_t size)
22396 {
22397 	vm_map_offset_t start, end, sum;
22398 	vm_map_offset_t pgmask;
22399 
22400 	if (size == 0) {
22401 		/* empty range -> no overflow */
22402 		return false;
22403 	}
22404 	pgmask = vm_map_page_mask(map);
22405 	start = vm_map_trunc_page_mask(addr, pgmask);
22406 	end = vm_map_round_page_mask(addr + size, pgmask);
22407 	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22408 		vm_map_range_overflows_count++;
22409 		if (vm_map_range_overflows_log) {
22410 			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22411 			    proc_selfpid(),
22412 			    proc_best_name(current_proc()),
22413 			    (uint64_t)addr,
22414 			    (uint64_t)size,
22415 			    (uint64_t)pgmask);
22416 		}
22417 		DTRACE_VM4(vm_map_range_overflows,
22418 		    vm_map_t, map,
22419 		    uint32_t, pgmask,
22420 		    uint64_t, (uint64_t)addr,
22421 		    uint64_t, (uint64_t)size);
22422 		return true;
22423 	}
22424 	return false;
22425 }
22426 
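/*
 * Typical defensive use (illustrative, mirroring the check already made in
 * vm_map_sign() above): reject a caller-supplied (addr, size) range whose
 * page-rounded bounds wrap around before doing anything with it.
 */
#if 0
if (__improbable(vm_map_range_overflows(map, addr, size))) {
	return KERN_INVALID_ADDRESS;
}
#endif
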
22427 vm_map_offset_t
22428 vm_map_round_page_mask(
22429 	vm_map_offset_t offset,
22430 	vm_map_offset_t mask)
22431 {
22432 	return VM_MAP_ROUND_PAGE(offset, mask);
22433 }
22434 
22435 vm_map_offset_t
22436 vm_map_trunc_page_mask(
22437 	vm_map_offset_t offset,
22438 	vm_map_offset_t mask)
22439 {
22440 	return VM_MAP_TRUNC_PAGE(offset, mask);
22441 }
22442 
22443 boolean_t
22444 vm_map_page_aligned(
22445 	vm_map_offset_t offset,
22446 	vm_map_offset_t mask)
22447 {
22448 	return ((offset) & mask) == 0;
22449 }
22450 
22451 int
22452 vm_map_page_shift(
22453 	vm_map_t map)
22454 {
22455 	return VM_MAP_PAGE_SHIFT(map);
22456 }
22457 
22458 int
22459 vm_map_page_size(
22460 	vm_map_t map)
22461 {
22462 	return VM_MAP_PAGE_SIZE(map);
22463 }
22464 
22465 vm_map_offset_t
22466 vm_map_page_mask(
22467 	vm_map_t map)
22468 {
22469 	return VM_MAP_PAGE_MASK(map);
22470 }
22471 
22472 kern_return_t
22473 vm_map_set_page_shift(
22474 	vm_map_t        map,
22475 	int             pageshift)
22476 {
22477 	if (map->hdr.nentries != 0) {
22478 		/* too late to change page size */
22479 		return KERN_FAILURE;
22480 	}
22481 
22482 	map->hdr.page_shift = (uint16_t)pageshift;
22483 
22484 	return KERN_SUCCESS;
22485 }
22486 
22487 kern_return_t
22488 vm_map_query_volatile(
22489 	vm_map_t        map,
22490 	mach_vm_size_t  *volatile_virtual_size_p,
22491 	mach_vm_size_t  *volatile_resident_size_p,
22492 	mach_vm_size_t  *volatile_compressed_size_p,
22493 	mach_vm_size_t  *volatile_pmap_size_p,
22494 	mach_vm_size_t  *volatile_compressed_pmap_size_p)
22495 {
22496 	mach_vm_size_t  volatile_virtual_size;
22497 	mach_vm_size_t  volatile_resident_count;
22498 	mach_vm_size_t  volatile_compressed_count;
22499 	mach_vm_size_t  volatile_pmap_count;
22500 	mach_vm_size_t  volatile_compressed_pmap_count;
22501 	mach_vm_size_t  resident_count;
22502 	vm_map_entry_t  entry;
22503 	vm_object_t     object;
22504 
22505 	/* map should be locked by caller */
22506 
22507 	volatile_virtual_size = 0;
22508 	volatile_resident_count = 0;
22509 	volatile_compressed_count = 0;
22510 	volatile_pmap_count = 0;
22511 	volatile_compressed_pmap_count = 0;
22512 
22513 	for (entry = vm_map_first_entry(map);
22514 	    entry != vm_map_to_entry(map);
22515 	    entry = entry->vme_next) {
22516 		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;
22517 
22518 		if (entry->is_sub_map) {
22519 			continue;
22520 		}
22521 		if (!(entry->protection & VM_PROT_WRITE)) {
22522 			continue;
22523 		}
22524 		object = VME_OBJECT(entry);
22525 		if (object == VM_OBJECT_NULL) {
22526 			continue;
22527 		}
22528 		if (object->purgable != VM_PURGABLE_VOLATILE &&
22529 		    object->purgable != VM_PURGABLE_EMPTY) {
22530 			continue;
22531 		}
22532 		if (VME_OFFSET(entry)) {
22533 			/*
22534 			 * If the map entry has been split and the object now
22535 			 * appears several times in the VM map, we don't want
22536 			 * to count the object's resident_page_count more than
22537 			 * once.  We count it only for the first one, starting
22538 			 * at offset 0 and ignore the other VM map entries.
22539 			 */
22540 			continue;
22541 		}
22542 		resident_count = object->resident_page_count;
22543 		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22544 			resident_count = 0;
22545 		} else {
22546 			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22547 		}
22548 
22549 		volatile_virtual_size += entry->vme_end - entry->vme_start;
22550 		volatile_resident_count += resident_count;
22551 		if (object->pager) {
22552 			volatile_compressed_count +=
22553 			    vm_compressor_pager_get_count(object->pager);
22554 		}
22555 		pmap_compressed_bytes = 0;
22556 		pmap_resident_bytes =
22557 		    pmap_query_resident(map->pmap,
22558 		    entry->vme_start,
22559 		    entry->vme_end,
22560 		    &pmap_compressed_bytes);
22561 		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22562 		volatile_compressed_pmap_count += (pmap_compressed_bytes
22563 		    / PAGE_SIZE);
22564 	}
22565 
22566 	/* map is still locked on return */
22567 
22568 	*volatile_virtual_size_p = volatile_virtual_size;
22569 	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22570 	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22571 	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22572 	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22573 
22574 	return KERN_SUCCESS;
22575 }
22576 
22577 void
22578 vm_map_sizes(vm_map_t map,
22579     vm_map_size_t * psize,
22580     vm_map_size_t * pfree,
22581     vm_map_size_t * plargest_free)
22582 {
22583 	vm_map_entry_t  entry;
22584 	vm_map_offset_t prev;
22585 	vm_map_size_t   free, total_free, largest_free;
22586 	boolean_t       end;
22587 
22588 	if (!map) {
22589 		*psize = *pfree = *plargest_free = 0;
22590 		return;
22591 	}
22592 	total_free = largest_free = 0;
22593 
22594 	vm_map_lock_read(map);
22595 	if (psize) {
22596 		*psize = map->max_offset - map->min_offset;
22597 	}
22598 
22599 	prev = map->min_offset;
22600 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22601 		end = (entry == vm_map_to_entry(map));
22602 
22603 		if (end) {
22604 			free = entry->vme_end   - prev;
22605 		} else {
22606 			free = entry->vme_start - prev;
22607 		}
22608 
22609 		total_free += free;
22610 		if (free > largest_free) {
22611 			largest_free = free;
22612 		}
22613 
22614 		if (end) {
22615 			break;
22616 		}
22617 		prev = entry->vme_end;
22618 	}
22619 	vm_map_unlock_read(map);
22620 	if (pfree) {
22621 		*pfree = total_free;
22622 	}
22623 	if (plargest_free) {
22624 		*plargest_free = largest_free;
22625 	}
22626 }
22627 
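/*
 * Illustrative sketch (not compiled): querying a map's virtual-size
 * statistics with vm_map_sizes().  The variable names are hypothetical.
 */
#if 0
vm_map_size_t total_va, free_va, largest_free_va;

vm_map_sizes(map, &total_va, &free_va, &largest_free_va);
/*
 * total_va        == map->max_offset - map->min_offset
 * free_va         == total of all unallocated gaps in the map
 * largest_free_va == biggest single gap available for a new allocation
 */
#endif
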
22628 #if VM_SCAN_FOR_SHADOW_CHAIN
22629 int
22630 vm_map_shadow_max(
22631 	vm_map_t map)
22632 {
22633 	int             shadows, shadows_max;
22634 	vm_map_entry_t  entry;
22635 	vm_object_t     object, next_object;
22636 
22637 	if (map == NULL) {
22638 		return 0;
22639 	}
22640 
22641 	shadows_max = 0;
22642 
22643 	vm_map_lock_read(map);
22644 
22645 	for (entry = vm_map_first_entry(map);
22646 	    entry != vm_map_to_entry(map);
22647 	    entry = entry->vme_next) {
22648 		if (entry->is_sub_map) {
22649 			continue;
22650 		}
22651 		object = VME_OBJECT(entry);
22652 		if (object == NULL) {
22653 			continue;
22654 		}
22655 		vm_object_lock_shared(object);
22656 		for (shadows = 0;
22657 		    object->shadow != NULL;
22658 		    shadows++, object = next_object) {
22659 			next_object = object->shadow;
22660 			vm_object_lock_shared(next_object);
22661 			vm_object_unlock(object);
22662 		}
22663 		vm_object_unlock(object);
22664 		if (shadows > shadows_max) {
22665 			shadows_max = shadows;
22666 		}
22667 	}
22668 
22669 	vm_map_unlock_read(map);
22670 
22671 	return shadows_max;
22672 }
22673 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22674 
22675 void
22676 vm_commit_pagezero_status(vm_map_t lmap)
22677 {
22678 	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22679 }
22680 
22681 #if __x86_64__
22682 void
22683 vm_map_set_high_start(
22684 	vm_map_t        map,
22685 	vm_map_offset_t high_start)
22686 {
22687 	map->vmmap_high_start = high_start;
22688 }
22689 #endif /* __x86_64__ */
22690 
22691 #if CODE_SIGNING_MONITOR
22692 
22693 kern_return_t
22694 vm_map_entry_cs_associate(
22695 	vm_map_t                map,
22696 	vm_map_entry_t          entry,
22697 	vm_map_kernel_flags_t   vmk_flags)
22698 {
22699 	vm_object_t cs_object, cs_shadow, backing_object;
22700 	vm_object_offset_t cs_offset, backing_offset;
22701 	void *cs_blobs;
22702 	struct vnode *cs_vnode;
22703 	kern_return_t cs_ret;
22704 
22705 	if (map->pmap == NULL ||
22706 	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22707 	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22708 	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
22709 		return KERN_SUCCESS;
22710 	}
22711 
22712 	if (!(entry->protection & VM_PROT_EXECUTE)) {
22713 		/*
22714 		 * This memory region is not executable, so the code-signing
22715 		 * monitor would usually not care about it...
22716 		 */
22717 		if (vmk_flags.vmkf_remap_prot_copy &&
22718 		    (entry->max_protection & VM_PROT_EXECUTE)) {
22719 			/*
22720 			 * ... except if the memory region is being remapped
22721 			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22722 			 * which is what a debugger or dtrace would be doing
22723 			 * to prepare to modify an executable page to insert
22724 			 * a breakpoint or activate a probe.
22725 			 * In that case, fall through so that we can mark
22726 			 * this region as being "debugged" and no longer
22727 			 * strictly code-signed.
22728 			 */
22729 		} else {
22730 			/*
22731 			 * Really not executable, so no need to tell the
22732 			 * code-signing monitor.
22733 			 */
22734 			return KERN_SUCCESS;
22735 		}
22736 	}
22737 
22738 	vm_map_lock_assert_exclusive(map);
22739 
22740 	/*
22741 	 * Check for a debug association mapping before we check for used_for_jit. This
22742 	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22743 	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22744 	 * since they are mapped with RW or RX permissions, which the page table monitor
22745 	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22746 	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22747 	 * violation when those USER_EXEC pages are mapped as RW.
22748 	 *
22749 	 * Since these pages switch between RW and RX through mprotect, they mimic what
22750 	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22751 	 * on macOS systems, this works in our favor here and allows us to continue to
22752 	 * support these legacy-programmed applications without sacrificing security on
22753 	 * the page table or the code signing monitor. We don't need to explicitly check
22754 	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22755 	 * created with RX, then the application must map it as RW in order to first write
22756 	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22757 	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22758 	 * Similarly, if the mapping was created as RW, and then switched to RX,
22759 	 * vm_map_protect will again mark the entry as a copy, and both these cases
22760 	 * lead to this if-statement being entered.
22761 	 *
22762 	 * For more information: rdar://115313336.
22763 	 */
22764 	if (vmk_flags.vmkf_remap_prot_copy) {
22765 		cs_ret = csm_associate_debug_region(
22766 			map->pmap,
22767 			entry->vme_start,
22768 			entry->vme_end - entry->vme_start);
22769 
22770 		/*
22771 		 * csm_associate_debug_region returns not supported when the code signing
22772 		 * monitor is disabled. This is intentional, since cs_ret is checked towards
22773 		 * the end of the function, and if it is not supported, then we still want the
22774 		 * VM to perform code-signing enforcement on this entry. That said, if we don't
22775 		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22776 		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22777 		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22778 		 * cases, which will cause a violation when attempted to be mapped as writable).
22779 		 */
22780 		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22781 			entry->vme_xnu_user_debug = TRUE;
22782 		}
22783 #if DEVELOPMENT || DEBUG
22784 		if (vm_log_xnu_user_debug) {
22785 			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
22786 			    proc_selfpid(),
22787 			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22788 			    __FUNCTION__, __LINE__,
22789 			    map, entry,
22790 			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22791 			    entry->vme_xnu_user_debug,
22792 			    cs_ret);
22793 		}
22794 #endif /* DEVELOPMENT || DEBUG */
22795 		goto done;
22796 	}
22797 
22798 	if (entry->used_for_jit) {
22799 		cs_ret = csm_associate_jit_region(
22800 			map->pmap,
22801 			entry->vme_start,
22802 			entry->vme_end - entry->vme_start);
22803 		goto done;
22804 	}
22805 
22806 	cs_object = VME_OBJECT(entry);
22807 	vm_object_lock_shared(cs_object);
22808 	cs_offset = VME_OFFSET(entry);
22809 
22810 	/* find the VM object backed by the code-signed vnode */
22811 	for (;;) {
22812 		/* go to the bottom of cs_object's shadow chain */
22813 		for (;
22814 		    cs_object->shadow != VM_OBJECT_NULL;
22815 		    cs_object = cs_shadow) {
22816 			cs_shadow = cs_object->shadow;
22817 			cs_offset += cs_object->vo_shadow_offset;
22818 			vm_object_lock_shared(cs_shadow);
22819 			vm_object_unlock(cs_object);
22820 		}
22821 		if (cs_object->internal ||
22822 		    cs_object->pager == MEMORY_OBJECT_NULL) {
22823 			vm_object_unlock(cs_object);
22824 			return KERN_SUCCESS;
22825 		}
22826 
22827 		cs_offset += cs_object->paging_offset;
22828 
22829 		/*
22830 		 * cs_object could be backed by a:
22831 		 *      vnode_pager
22832 		 *	apple_protect_pager
22833 		 *      shared_region_pager
22834 		 *	fourk_pager (multiple backing objects -> fail?)
22835 		 * ask the pager if it has a backing VM object
22836 		 */
22837 		if (!memory_object_backing_object(cs_object->pager,
22838 		    cs_offset,
22839 		    &backing_object,
22840 		    &backing_offset)) {
22841 			/* no backing object: cs_object is it */
22842 			break;
22843 		}
22844 
22845 		/* look down the backing object's shadow chain */
22846 		vm_object_lock_shared(backing_object);
22847 		vm_object_unlock(cs_object);
22848 		cs_object = backing_object;
22849 		cs_offset = backing_offset;
22850 	}
22851 
22852 	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
22853 	if (cs_vnode == NULL) {
22854 		/* no vnode, no code signatures to associate */
22855 		cs_ret = KERN_SUCCESS;
22856 	} else {
22857 		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
22858 		    &cs_blobs);
22859 		assert(cs_ret == KERN_SUCCESS);
22860 		cs_ret = cs_associate_blob_with_mapping(map->pmap,
22861 		    entry->vme_start,
22862 		    (entry->vme_end - entry->vme_start),
22863 		    cs_offset,
22864 		    cs_blobs);
22865 	}
22866 	vm_object_unlock(cs_object);
22867 	cs_object = VM_OBJECT_NULL;
22868 
22869 done:
22870 	if (cs_ret == KERN_SUCCESS) {
22871 		DTRACE_VM2(vm_map_entry_cs_associate_success,
22872 		    vm_map_offset_t, entry->vme_start,
22873 		    vm_map_offset_t, entry->vme_end);
22874 		if (vm_map_executable_immutable) {
22875 			/*
22876 			 * Prevent this executable
22877 			 * mapping from being unmapped
22878 			 * or modified.
22879 			 */
22880 			entry->vme_permanent = TRUE;
22881 		}
22882 		/*
22883 		 * pmap says it will validate the
22884 		 * code-signing validity of pages
22885 		 * faulted in via this mapping, so
22886 		 * this map entry should be marked so
22887 		 * that vm_fault() bypasses code-signing
22888 		 * validation for faults coming through
22889 		 * this mapping.
22890 		 */
22891 		entry->csm_associated = TRUE;
22892 	} else if (cs_ret == KERN_NOT_SUPPORTED) {
22893 		/*
22894 		 * pmap won't check the code-signing
22895 		 * validity of pages faulted in via
22896 		 * this mapping, so VM should keep
22897 		 * doing it.
22898 		 */
22899 		DTRACE_VM3(vm_map_entry_cs_associate_off,
22900 		    vm_map_offset_t, entry->vme_start,
22901 		    vm_map_offset_t, entry->vme_end,
22902 		    int, cs_ret);
22903 	} else {
22904 		/*
22905 		 * A real error: do not allow
22906 		 * execution in this mapping.
22907 		 */
22908 		DTRACE_VM3(vm_map_entry_cs_associate_failure,
22909 		    vm_map_offset_t, entry->vme_start,
22910 		    vm_map_offset_t, entry->vme_end,
22911 		    int, cs_ret);
22912 		if (vmk_flags.vmkf_overwrite_immutable) {
22913 			/*
22914 			 * We can get here when we remap an apple_protect pager
22915 			 * on top of an already cs_associated executable mapping
22916 			 * with the same code signatures, so we don't want to
22917 			 * lose VM_PROT_EXECUTE in that case...
22918 			 */
22919 		} else {
22920 			entry->protection &= ~VM_PROT_ALLEXEC;
22921 			entry->max_protection &= ~VM_PROT_ALLEXEC;
22922 		}
22923 	}
22924 
22925 	return cs_ret;
22926 }
22927 
22928 #endif /* CODE_SIGNING_MONITOR */
22929 
22930 inline bool
22931 vm_map_is_corpse_source(vm_map_t map)
22932 {
22933 	bool status = false;
22934 	if (map) {
22935 		vm_map_lock_read(map);
22936 		status = map->corpse_source;
22937 		vm_map_unlock_read(map);
22938 	}
22939 	return status;
22940 }
22941 
22942 inline void
22943 vm_map_set_corpse_source(vm_map_t map)
22944 {
22945 	if (map) {
22946 		vm_map_lock(map);
22947 		map->corpse_source = true;
22948 		vm_map_unlock(map);
22949 	}
22950 }
22951 
22952 inline void
22953 vm_map_unset_corpse_source(vm_map_t map)
22954 {
22955 	if (map) {
22956 		vm_map_lock(map);
22957 		map->corpse_source = false;
22958 		vm_map_unlock(map);
22959 	}
22960 }
22961 /*
22962  * FORKED CORPSE FOOTPRINT
22963  *
22964  * A forked corpse gets a copy of the original VM map but its pmap is mostly
22965  * empty since it never ran and never got to fault in any pages.
22966  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22967  * a forked corpse would therefore return very little information.
22968  *
22969  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22970  * to vm_map_fork() to collect footprint information from the original VM map
22971  * and its pmap, and store it in the forked corpse's VM map.  That information
22972  * is stored in place of the VM map's "hole list" since we'll never need to
22973  * lookup for holes in the corpse's map.
22974  *
22975  * The corpse's footprint info looks like this:
22976  *
22977  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22978  * as follows:
22979  *                     +---------------------------------------+
22980  *            header-> | cf_size                               |
22981  *                     +-------------------+-------------------+
22982  *                     | cf_last_region    | cf_last_zeroes    |
22983  *                     +-------------------+-------------------+
22984  *           region1-> | cfr_vaddr                             |
22985  *                     +-------------------+-------------------+
22986  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
22987  *                     +---------------------------------------+
22988  *                     | d4 | d5 | ...                         |
22989  *                     +---------------------------------------+
22990  *                     | ...                                   |
22991  *                     +-------------------+-------------------+
22992  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
22993  *                     +-------------------+-------------------+
22994  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
22995  *                     +---------------------------------------+
22996  *                     | d0 | d1 ...                           |
22997  *                     +---------------------------------------+
22998  *                       ...
22999  *                     +---------------------------------------+
23000  *       last region-> | cfr_vaddr                             |
23001  *                     +---------------------------------------+
23002  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
23003  *                     +---------------------------------------+
23004  *                       ...
23005  *                     +---------------------------------------+
23006  *                     | dx | dy | dz | na | na | na | na | na |
23007  *                     +---------------------------------------+
23008  *
23009  * where:
23010  *      cf_size:	total size of the buffer (rounded to page size)
23011  *      cf_last_region:	offset in the buffer of the last "region" sub-header
23012  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
23013  *			of last region
23014  *	cfr_vaddr:	virtual address of the start of the covered "region"
23015  *	cfr_num_pages:	number of pages in the covered "region"
23016  *	d*:		disposition of the page at that virtual address
23017  * Regions in the buffer are word-aligned.
23018  *
23019  * We estimate the size of the buffer based on the number of memory regions
23020  * and the virtual size of the address space.  While copying each memory region
23021  * during vm_map_fork(), we also collect the footprint info for that region
23022  * and store it in the buffer, packing it as much as possible (coalescing
23023  * contiguous memory regions to avoid having too many region headers and
23024  * avoiding long streaks of "zero" page dispositions by splitting footprint
23025  * "regions", so the number of regions in the footprint buffer might not match
23026  * the number of memory regions in the address space.
23027  *
23028  * We also have to copy the original task's "nonvolatile" ledgers since that's
23029  * part of the footprint and will need to be reported to any tool asking for
23030  * the footprint information of the forked corpse.
23031  */
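/*
 * Illustrative call flow (a sketch, not the actual call sites): the pieces
 * below are expected to be driven roughly like this by vm_map_fork() when
 * VM_MAP_FORK_CORPSE_FOOTPRINT is passed, and later by whoever queries the
 * corpse's footprint:
 *
 *	for each old_entry copied from old_map into new_map:
 *		vm_map_corpse_footprint_collect(old_map, old_entry, new_map);
 *	vm_map_corpse_footprint_collect_done(new_map);
 *	...
 *	vm_map_corpse_footprint_query_page_info(new_map, va, &disposition);
 *	...
 *	vm_map_corpse_footprint_destroy(new_map);
 */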
23032 
23033 uint64_t vm_map_corpse_footprint_count = 0;
23034 uint64_t vm_map_corpse_footprint_size_avg = 0;
23035 uint64_t vm_map_corpse_footprint_size_max = 0;
23036 uint64_t vm_map_corpse_footprint_full = 0;
23037 uint64_t vm_map_corpse_footprint_no_buf = 0;
23038 
23039 struct vm_map_corpse_footprint_header {
23040 	vm_size_t       cf_size;        /* allocated buffer size */
23041 	uint32_t        cf_last_region; /* offset of last region in buffer */
23042 	union {
23043 		uint32_t cfu_last_zeroes; /* during creation:
23044 		                           * number of "zero" dispositions at
23045 		                           * end of last region */
23046 		uint32_t cfu_hint_region; /* during lookup:
23047 		                           * offset of last looked up region */
23048 #define cf_last_zeroes cfu.cfu_last_zeroes
23049 #define cf_hint_region cfu.cfu_hint_region
23050 	} cfu;
23051 };
23052 typedef uint8_t cf_disp_t;
23053 struct vm_map_corpse_footprint_region {
23054 	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
23055 	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
23056 	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
23057 } __attribute__((packed));
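/*
 * A minimal sketch (compiled out, illustrative only) of how the buffer
 * described above can be walked region by region.  It assumes the buffer
 * was fully populated by vm_map_corpse_footprint_collect() and mirrors the
 * traversal done in vm_map_corpse_footprint_query_page_info() below.
 */
#if 0
static void
vm_map_corpse_footprint_walk_sketch(
	struct vm_map_corpse_footprint_header *footprint_header,
	int effective_page_size)
{
	uint32_t offset;
	struct vm_map_corpse_footprint_region *region;

	for (offset = (uint32_t)sizeof(*footprint_header); ; ) {
		region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + offset);
		for (uint32_t i = 0; i < region->cfr_num_pages; i++) {
			vm_map_offset_t va;
			cf_disp_t cf_disp;

			va = region->cfr_vaddr +
			    ((vm_map_offset_t)i * effective_page_size);
			cf_disp = region->cfr_disposition[i];
			/* "va" has the disposition encoded in "cf_disp" */
			(void)va;
			(void)cf_disp;
		}
		if (offset >= footprint_header->cf_last_region) {
			break;  /* that was the last region */
		}
		/* skip this region's header and dispositions, word-aligned */
		offset += sizeof(*region);
		offset += region->cfr_num_pages * sizeof(cf_disp_t);
		offset = roundup(offset, sizeof(int));
	}
}
#endif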
23058 
23059 static cf_disp_t
23060 vm_page_disposition_to_cf_disp(
23061 	int disposition)
23062 {
23063 	assert(sizeof(cf_disp_t) == 1);
23064 	/* relocate bits that don't fit in a "uint8_t" */
23065 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23066 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23067 	}
23068 	/* cast gets rid of extra bits */
23069 	return (cf_disp_t) disposition;
23070 }
23071 
23072 static int
23073 vm_page_cf_disp_to_disposition(
23074 	cf_disp_t cf_disp)
23075 {
23076 	int disposition;
23077 
23078 	assert(sizeof(cf_disp_t) == 1);
23079 	disposition = (int) cf_disp;
23080 	/* move relocated bits back in place */
23081 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23082 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23083 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23084 	}
23085 	return disposition;
23086 }
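/*
 * Round-trip sketch (compiled out, illustrative only), assuming that
 * VM_PAGE_QUERY_PAGE_REUSABLE is the only relevant disposition bit that
 * does not fit in 8 bits and that VM_PAGE_QUERY_PAGE_FICTITIOUS is never
 * set on the dispositions being compressed here:
 */
#if 0
static void
vm_page_cf_disp_roundtrip_sketch(void)
{
	int disp = VM_PAGE_QUERY_PAGE_PRESENT | VM_PAGE_QUERY_PAGE_REUSABLE;
	cf_disp_t cf_disp = vm_page_disposition_to_cf_disp(disp);

	/* REUSABLE was parked in the FICTITIOUS bit so it fits in a uint8_t */
	assert(vm_page_cf_disp_to_disposition(cf_disp) == disp);
}
#endif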
23087 
23088 /*
23089  * vm_map_corpse_footprint_new_region:
23090  *      closes the current footprint "region" and creates a new one
23091  *
23092  * Returns NULL if there's not enough space in the buffer for a new region.
23093  */
23094 static struct vm_map_corpse_footprint_region *
23095 vm_map_corpse_footprint_new_region(
23096 	struct vm_map_corpse_footprint_header *footprint_header)
23097 {
23098 	uintptr_t       footprint_edge;
23099 	uint32_t        new_region_offset;
23100 	struct vm_map_corpse_footprint_region *footprint_region;
23101 	struct vm_map_corpse_footprint_region *new_footprint_region;
23102 
23103 	footprint_edge = ((uintptr_t)footprint_header +
23104 	    footprint_header->cf_size);
23105 	footprint_region = ((struct vm_map_corpse_footprint_region *)
23106 	    ((char *)footprint_header +
23107 	    footprint_header->cf_last_region));
23108 	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
23109 	    footprint_edge);
23110 
23111 	/* get rid of trailing zeroes in the last region */
23112 	assert(footprint_region->cfr_num_pages >=
23113 	    footprint_header->cf_last_zeroes);
23114 	footprint_region->cfr_num_pages -=
23115 	    footprint_header->cf_last_zeroes;
23116 	footprint_header->cf_last_zeroes = 0;
23117 
23118 	/* reuse this region if it's now empty */
23119 	if (footprint_region->cfr_num_pages == 0) {
23120 		return footprint_region;
23121 	}
23122 
23123 	/* compute offset of new region */
23124 	new_region_offset = footprint_header->cf_last_region;
23125 	new_region_offset += sizeof(*footprint_region);
23126 	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23127 	new_region_offset = roundup(new_region_offset, sizeof(int));
23128 
23129 	/* check if we're going over the edge */
23130 	if (((uintptr_t)footprint_header +
23131 	    new_region_offset +
23132 	    sizeof(*footprint_region)) >=
23133 	    footprint_edge) {
23134 		/* over the edge: no new region */
23135 		return NULL;
23136 	}
23137 
23138 	/* adjust offset of last region in header */
23139 	footprint_header->cf_last_region = new_region_offset;
23140 
23141 	new_footprint_region = (struct vm_map_corpse_footprint_region *)
23142 	    ((char *)footprint_header +
23143 	    footprint_header->cf_last_region);
23144 	new_footprint_region->cfr_vaddr = 0;
23145 	new_footprint_region->cfr_num_pages = 0;
23146 	/* caller needs to initialize new region */
23147 
23148 	return new_footprint_region;
23149 }
23150 
23151 /*
23152  * vm_map_corpse_footprint_collect:
23153  *	collects footprint information for "old_entry" in "old_map" and
23154  *	stores it in "new_map"'s vmmap_corpse_footprint.
23155  */
23156 kern_return_t
23157 vm_map_corpse_footprint_collect(
23158 	vm_map_t        old_map,
23159 	vm_map_entry_t  old_entry,
23160 	vm_map_t        new_map)
23161 {
23162 	vm_map_offset_t va;
23163 	kern_return_t   kr;
23164 	struct vm_map_corpse_footprint_header *footprint_header;
23165 	struct vm_map_corpse_footprint_region *footprint_region;
23166 	struct vm_map_corpse_footprint_region *new_footprint_region;
23167 	cf_disp_t       *next_disp_p;
23168 	uintptr_t       footprint_edge;
23169 	uint32_t        num_pages_tmp;
23170 	int             effective_page_size;
23171 
23172 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23173 
23174 	va = old_entry->vme_start;
23175 
23176 	vm_map_lock_assert_exclusive(old_map);
23177 	vm_map_lock_assert_exclusive(new_map);
23178 
23179 	assert(new_map->has_corpse_footprint);
23180 	assert(!old_map->has_corpse_footprint);
23181 	if (!new_map->has_corpse_footprint ||
23182 	    old_map->has_corpse_footprint) {
23183 		/*
23184 		 * This can only transfer footprint info from a
23185 		 * map with a live pmap to a map with a corpse footprint.
23186 		 */
23187 		return KERN_NOT_SUPPORTED;
23188 	}
23189 
23190 	if (new_map->vmmap_corpse_footprint == NULL) {
23191 		vm_offset_t     buf;
23192 		vm_size_t       buf_size;
23193 
23194 		buf = 0;
23195 		buf_size = (sizeof(*footprint_header) +
23196 		    (old_map->hdr.nentries
23197 		    *
23198 		    (sizeof(*footprint_region) +
23199 		    +3))            /* potential alignment for each region */
23200 		    +
23201 		    ((old_map->size / effective_page_size)
23202 		    *
23203 		    sizeof(cf_disp_t)));      /* disposition for each page */
23204 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23205 		buf_size = round_page(buf_size);
23206 
23207 		/* limit buffer to 1 page to validate overflow detection */
23208 //		buf_size = PAGE_SIZE;
23209 
23210 		/* limit size to a somewhat sane amount */
23211 #if XNU_TARGET_OS_OSX
23212 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
23213 #else /* XNU_TARGET_OS_OSX */
23214 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
23215 #endif /* XNU_TARGET_OS_OSX */
23216 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23217 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23218 		}
23219 
23220 		/*
23221 		 * Allocate the pageable buffer (with a trailing guard page).
23222 		 * It will be zero-filled on demand.
23223 		 */
23224 		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
23225 		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
23226 		    VM_KERN_MEMORY_DIAG);
23227 		if (kr != KERN_SUCCESS) {
23228 			vm_map_corpse_footprint_no_buf++;
23229 			return kr;
23230 		}
23231 
23232 		/* initialize header and 1st region */
23233 		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
23234 		new_map->vmmap_corpse_footprint = footprint_header;
23235 
23236 		footprint_header->cf_size = buf_size;
23237 		footprint_header->cf_last_region =
23238 		    sizeof(*footprint_header);
23239 		footprint_header->cf_last_zeroes = 0;
23240 
23241 		footprint_region = (struct vm_map_corpse_footprint_region *)
23242 		    ((char *)footprint_header +
23243 		    footprint_header->cf_last_region);
23244 		footprint_region->cfr_vaddr = 0;
23245 		footprint_region->cfr_num_pages = 0;
23246 	} else {
23247 		/* retrieve header and last region */
23248 		footprint_header = (struct vm_map_corpse_footprint_header *)
23249 		    new_map->vmmap_corpse_footprint;
23250 		footprint_region = (struct vm_map_corpse_footprint_region *)
23251 		    ((char *)footprint_header +
23252 		    footprint_header->cf_last_region);
23253 	}
23254 	footprint_edge = ((uintptr_t)footprint_header +
23255 	    footprint_header->cf_size);
23256 
23257 	if ((footprint_region->cfr_vaddr +
23258 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23259 	    effective_page_size))
23260 	    != old_entry->vme_start) {
23261 		uint64_t num_pages_delta, num_pages_delta_size;
23262 		uint32_t region_offset_delta_size;
23263 
23264 		/*
23265 		 * Not the next contiguous virtual address:
23266 		 * start a new region or store "zero" dispositions for
23267 		 * the missing pages?
23268 		 */
23269 		/* size of gap in actual page dispositions */
23270 		num_pages_delta = ((old_entry->vme_start -
23271 		    footprint_region->cfr_vaddr) / effective_page_size)
23272 		    - footprint_region->cfr_num_pages;
23273 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23274 		/* size of gap as a new footprint region header */
23275 		region_offset_delta_size =
23276 		    (sizeof(*footprint_region) +
23277 		    roundup(((footprint_region->cfr_num_pages -
23278 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23279 		    sizeof(int)) -
23280 		    ((footprint_region->cfr_num_pages -
23281 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23282 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23283 		if (region_offset_delta_size < num_pages_delta_size ||
23284 		    os_add3_overflow(footprint_region->cfr_num_pages,
23285 		    (uint32_t) num_pages_delta,
23286 		    1,
23287 		    &num_pages_tmp)) {
23288 			/*
23289 			 * Storing data for this gap would take more space
23290 			 * than inserting a new footprint region header:
23291 			 * let's start a new region and save space. If it's a
23292 			 * tie, let's avoid using a new region, since that
23293 			 * would require more region hops to find the right
23294 			 * range during lookups.
23295 			 *
23296 			 * If the current region's cfr_num_pages would overflow
23297 			 * if we added "zero" page dispositions for the gap,
23298 			 * no choice but to start a new region.
23299 			 */
23300 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23301 			new_footprint_region =
23302 			    vm_map_corpse_footprint_new_region(footprint_header);
23303 			/* check that we're not going over the edge */
23304 			if (new_footprint_region == NULL) {
23305 				goto over_the_edge;
23306 			}
23307 			footprint_region = new_footprint_region;
23308 			/* initialize new region as empty */
23309 			footprint_region->cfr_vaddr = old_entry->vme_start;
23310 			footprint_region->cfr_num_pages = 0;
23311 		} else {
23312 			/*
23313 			 * Store "zero" page dispositions for the missing
23314 			 * pages.
23315 			 */
23316 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23317 			for (; num_pages_delta > 0; num_pages_delta--) {
23318 				next_disp_p = (cf_disp_t *)
23319 				    ((uintptr_t) footprint_region +
23320 				    sizeof(*footprint_region));
23321 				next_disp_p += footprint_region->cfr_num_pages;
23322 				/* check that we're not going over the edge */
23323 				if ((uintptr_t)next_disp_p >= footprint_edge) {
23324 					goto over_the_edge;
23325 				}
23326 				/* store "zero" disposition for this gap page */
23327 				footprint_region->cfr_num_pages++;
23328 				*next_disp_p = (cf_disp_t) 0;
23329 				footprint_header->cf_last_zeroes++;
23330 			}
23331 		}
23332 	}
23333 
23334 	for (va = old_entry->vme_start;
23335 	    va < old_entry->vme_end;
23336 	    va += effective_page_size) {
23337 		int             disposition;
23338 		cf_disp_t       cf_disp;
23339 
23340 		vm_map_footprint_query_page_info(old_map,
23341 		    old_entry,
23342 		    va,
23343 		    &disposition);
23344 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
23345 
23346 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23347 
23348 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23349 			/*
23350 			 * Ignore "zero" dispositions at start of
23351 			 * region: just move start of region.
23352 			 */
23353 			footprint_region->cfr_vaddr += effective_page_size;
23354 			continue;
23355 		}
23356 
23357 		/* would region's cfr_num_pages overflow? */
23358 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23359 		    &num_pages_tmp)) {
23360 			/* overflow: create a new region */
23361 			new_footprint_region =
23362 			    vm_map_corpse_footprint_new_region(
23363 				footprint_header);
23364 			if (new_footprint_region == NULL) {
23365 				goto over_the_edge;
23366 			}
23367 			footprint_region = new_footprint_region;
23368 			footprint_region->cfr_vaddr = va;
23369 			footprint_region->cfr_num_pages = 0;
23370 		}
23371 
23372 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23373 		    sizeof(*footprint_region));
23374 		next_disp_p += footprint_region->cfr_num_pages;
23375 		/* check that we're not going over the edge */
23376 		if ((uintptr_t)next_disp_p >= footprint_edge) {
23377 			goto over_the_edge;
23378 		}
23379 		/* store this disposition */
23380 		*next_disp_p = cf_disp;
23381 		footprint_region->cfr_num_pages++;
23382 
23383 		if (cf_disp != 0) {
23384 			/* non-zero disp: break the current zero streak */
23385 			footprint_header->cf_last_zeroes = 0;
23386 			/* done */
23387 			continue;
23388 		}
23389 
23390 		/* zero disp: add to the current streak of zeroes */
23391 		footprint_header->cf_last_zeroes++;
23392 		if ((footprint_header->cf_last_zeroes +
23393 		    roundup(((footprint_region->cfr_num_pages -
23394 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23395 		    (sizeof(int) - 1),
23396 		    sizeof(int))) <
23397 		    (sizeof(*footprint_header))) {
23398 			/*
23399 			 * There are not enough trailing "zero" dispositions
23400 			 * (+ the extra padding we would need for the previous
23401 			 * region); creating a new region would not save space
23402 			 * at this point, so let's keep this "zero" disposition
23403 			 * in this region and reconsider later.
23404 			 */
23405 			continue;
23406 		}
23407 		/*
23408 		 * Create a new region to avoid having too many consecutive
23409 		 * "zero" dispositions.
23410 		 */
23411 		new_footprint_region =
23412 		    vm_map_corpse_footprint_new_region(footprint_header);
23413 		if (new_footprint_region == NULL) {
23414 			goto over_the_edge;
23415 		}
23416 		footprint_region = new_footprint_region;
23417 		/* initialize the new region as empty ... */
23418 		footprint_region->cfr_num_pages = 0;
23419 		/* ... and skip this "zero" disp */
23420 		footprint_region->cfr_vaddr = va + effective_page_size;
23421 	}
23422 
23423 	return KERN_SUCCESS;
23424 
23425 over_the_edge:
23426 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23427 	vm_map_corpse_footprint_full++;
23428 	return KERN_RESOURCE_SHORTAGE;
23429 }
23430 
23431 /*
23432  * vm_map_corpse_footprint_collect_done:
23433  *	completes the footprint collection by getting rid of any remaining
23434  *	trailing "zero" dispositions and trimming the unused part of the
23435  *	kernel buffer
23436  */
23437 void
23438 vm_map_corpse_footprint_collect_done(
23439 	vm_map_t        new_map)
23440 {
23441 	struct vm_map_corpse_footprint_header *footprint_header;
23442 	struct vm_map_corpse_footprint_region *footprint_region;
23443 	vm_size_t       buf_size, actual_size;
23444 	kern_return_t   kr;
23445 
23446 	assert(new_map->has_corpse_footprint);
23447 	if (!new_map->has_corpse_footprint ||
23448 	    new_map->vmmap_corpse_footprint == NULL) {
23449 		return;
23450 	}
23451 
23452 	footprint_header = (struct vm_map_corpse_footprint_header *)
23453 	    new_map->vmmap_corpse_footprint;
23454 	buf_size = footprint_header->cf_size;
23455 
23456 	footprint_region = (struct vm_map_corpse_footprint_region *)
23457 	    ((char *)footprint_header +
23458 	    footprint_header->cf_last_region);
23459 
23460 	/* get rid of trailing zeroes in last region */
23461 	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23462 	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23463 	footprint_header->cf_last_zeroes = 0;
23464 
23465 	actual_size = (vm_size_t)(footprint_header->cf_last_region +
23466 	    sizeof(*footprint_region) +
23467 	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23468 
23469 //	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23470 	vm_map_corpse_footprint_size_avg =
23471 	    (((vm_map_corpse_footprint_size_avg *
23472 	    vm_map_corpse_footprint_count) +
23473 	    actual_size) /
23474 	    (vm_map_corpse_footprint_count + 1));
23475 	vm_map_corpse_footprint_count++;
23476 	if (actual_size > vm_map_corpse_footprint_size_max) {
23477 		vm_map_corpse_footprint_size_max = actual_size;
23478 	}
23479 
23480 	actual_size = round_page(actual_size);
23481 	if (buf_size > actual_size) {
23482 		kr = vm_deallocate(kernel_map,
23483 		    vm_sanitize_wrap_addr((vm_address_t)footprint_header +
23484 		    actual_size + PAGE_SIZE), /* trailing guard page */
23485 		    vm_sanitize_wrap_size(buf_size - actual_size));
23486 		assertf(kr == KERN_SUCCESS,
23487 		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23488 		    footprint_header,
23489 		    (uint64_t) buf_size,
23490 		    (uint64_t) actual_size,
23491 		    kr);
23492 		kr = vm_protect(kernel_map,
23493 		    (vm_address_t)footprint_header + actual_size,
23494 		    PAGE_SIZE,
23495 		    FALSE,             /* set_maximum */
23496 		    vm_sanitize_wrap_prot(VM_PROT_NONE));
23497 		assertf(kr == KERN_SUCCESS,
23498 		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23499 		    footprint_header,
23500 		    (uint64_t) buf_size,
23501 		    (uint64_t) actual_size,
23502 		    kr);
23503 	}
23504 
23505 	footprint_header->cf_size = actual_size;
23506 }
23507 
23508 /*
23509  * vm_map_corpse_footprint_query_page_info:
23510  *	retrieves the disposition of the page at virtual address "vaddr"
23511  *	in the forked corpse's VM map
23512  *
23513  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23514  */
23515 kern_return_t
23516 vm_map_corpse_footprint_query_page_info(
23517 	vm_map_t        map,
23518 	vm_map_offset_t va,
23519 	int             *disposition_p)
23520 {
23521 	struct vm_map_corpse_footprint_header *footprint_header;
23522 	struct vm_map_corpse_footprint_region *footprint_region;
23523 	uint32_t        footprint_region_offset;
23524 	vm_map_offset_t region_start, region_end;
23525 	int             disp_idx;
23526 	kern_return_t   kr;
23527 	int             effective_page_size;
23528 	cf_disp_t       cf_disp;
23529 
23530 	if (!map->has_corpse_footprint) {
23531 		*disposition_p = 0;
23532 		kr = KERN_INVALID_ARGUMENT;
23533 		goto done;
23534 	}
23535 
23536 	footprint_header = map->vmmap_corpse_footprint;
23537 	if (footprint_header == NULL) {
23538 		*disposition_p = 0;
23539 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23540 		kr = KERN_INVALID_ARGUMENT;
23541 		goto done;
23542 	}
23543 
23544 	/* start looking at the hint ("cf_hint_region") */
23545 	footprint_region_offset = footprint_header->cf_hint_region;
23546 
23547 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23548 
23549 lookup_again:
23550 	if (footprint_region_offset < sizeof(*footprint_header)) {
23551 		/* hint too low: start from 1st region */
23552 		footprint_region_offset = sizeof(*footprint_header);
23553 	}
23554 	if (footprint_region_offset > footprint_header->cf_last_region) {
23555 		/* hint too high: re-start from 1st region */
23556 		footprint_region_offset = sizeof(*footprint_header);
23557 	}
23558 	footprint_region = (struct vm_map_corpse_footprint_region *)
23559 	    ((char *)footprint_header + footprint_region_offset);
23560 	region_start = footprint_region->cfr_vaddr;
23561 	region_end = (region_start +
23562 	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23563 	    effective_page_size));
23564 	if (va < region_start &&
23565 	    footprint_region_offset != sizeof(*footprint_header)) {
23566 		/* our range starts before the hint region */
23567 
23568 		/* reset the hint (in a racy way...) */
23569 		footprint_header->cf_hint_region = sizeof(*footprint_header);
23570 		/* lookup "va" again from 1st region */
23571 		footprint_region_offset = sizeof(*footprint_header);
23572 		goto lookup_again;
23573 	}
23574 
23575 	while (va >= region_end) {
23576 		if (footprint_region_offset >= footprint_header->cf_last_region) {
23577 			break;
23578 		}
23579 		/* skip the region's header */
23580 		footprint_region_offset += sizeof(*footprint_region);
23581 		/* skip the region's page dispositions */
23582 		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23583 		/* align to next word boundary */
23584 		footprint_region_offset =
23585 		    roundup(footprint_region_offset,
23586 		    sizeof(int));
23587 		footprint_region = (struct vm_map_corpse_footprint_region *)
23588 		    ((char *)footprint_header + footprint_region_offset);
23589 		region_start = footprint_region->cfr_vaddr;
23590 		region_end = (region_start +
23591 		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23592 		    effective_page_size));
23593 	}
23594 	if (va < region_start || va >= region_end) {
23595 		/* page not found */
23596 		*disposition_p = 0;
23597 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23598 		kr = KERN_SUCCESS;
23599 		goto done;
23600 	}
23601 
23602 	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
23603 	footprint_header->cf_hint_region = footprint_region_offset;
23604 
23605 	/* get page disposition for "va" in this region */
23606 	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23607 	cf_disp = footprint_region->cfr_disposition[disp_idx];
23608 	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23609 	kr = KERN_SUCCESS;
23610 done:
23611 //	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23612 	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23613 	DTRACE_VM4(footprint_query_page_info,
23614 	    vm_map_t, map,
23615 	    vm_map_offset_t, va,
23616 	    int, *disposition_p,
23617 	    kern_return_t, kr);
23618 
23619 	return kr;
23620 }
23621 
23622 void
23623 vm_map_corpse_footprint_destroy(
23624 	vm_map_t        map)
23625 {
23626 	if (map->has_corpse_footprint &&
23627 	    map->vmmap_corpse_footprint != 0) {
23628 		struct vm_map_corpse_footprint_header *footprint_header;
23629 		vm_size_t buf_size;
23630 		kern_return_t kr;
23631 
23632 		footprint_header = map->vmmap_corpse_footprint;
23633 		buf_size = footprint_header->cf_size;
23634 		kr = vm_deallocate(kernel_map,
23635 		    vm_sanitize_wrap_addr((vm_offset_t) map->vmmap_corpse_footprint),
23636 		    vm_sanitize_wrap_size(buf_size + PAGE_SIZE)); /* trailing guard page */
23637 		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
23638 		map->vmmap_corpse_footprint = 0;
23639 		map->has_corpse_footprint = FALSE;
23640 	}
23641 }
23642 
23643 /*
23644  * vm_map_copy_footprint_ledgers:
23645  *	copies any ledger that's relevant to the memory footprint of "old_task"
23646  *	into the forked corpse's task ("new_task")
23647  */
23648 void
23649 vm_map_copy_footprint_ledgers(
23650 	task_t  old_task,
23651 	task_t  new_task)
23652 {
23653 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23654 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23655 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23656 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23657 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23658 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23659 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23660 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23661 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23662 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23663 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23664 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23665 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23666 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23667 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23668 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23669 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23670 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23671 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23672 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23673 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23674 }
23675 
23676 /*
23677  * vm_map_copy_ledger:
23678  *	copy a single ledger from "old_task" to "new_task"
23679  */
23680 void
23681 vm_map_copy_ledger(
23682 	task_t  old_task,
23683 	task_t  new_task,
23684 	int     ledger_entry)
23685 {
23686 	ledger_amount_t old_balance, new_balance, delta;
23687 
23688 	assert(new_task->map->has_corpse_footprint);
23689 	if (!new_task->map->has_corpse_footprint) {
23690 		return;
23691 	}
23692 
23693 	/* turn off sanity checks for the ledger we're about to mess with */
23694 	ledger_disable_panic_on_negative(new_task->ledger,
23695 	    ledger_entry);
23696 
23697 	/* adjust "new_task" to match "old_task" */
23698 	ledger_get_balance(old_task->ledger,
23699 	    ledger_entry,
23700 	    &old_balance);
23701 	ledger_get_balance(new_task->ledger,
23702 	    ledger_entry,
23703 	    &new_balance);
23704 	if (new_balance == old_balance) {
23705 		/* new == old: done */
23706 	} else if (new_balance > old_balance) {
23707 		/* new > old ==> new -= new - old */
23708 		delta = new_balance - old_balance;
23709 		ledger_debit(new_task->ledger,
23710 		    ledger_entry,
23711 		    delta);
23712 	} else {
23713 		/* new < old ==> new += old - new */
23714 		delta = old_balance - new_balance;
23715 		ledger_credit(new_task->ledger,
23716 		    ledger_entry,
23717 		    delta);
23718 	}
23719 }
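/*
 * Example of the adjustment above (illustrative numbers): if the original
 * task's balance for a ledger entry is 100 units and the corpse's is 30,
 * the corpse is credited 70; if the original is 30 and the corpse is 100,
 * the corpse is debited 70.  Either way the corpse ends up reporting the
 * original task's balance for that entry.
 */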
23720 
23721 /*
23722  * vm_map_get_pmap:
23723  * returns the pmap associated with the vm_map
23724  */
23725 pmap_t
23726 vm_map_get_pmap(vm_map_t map)
23727 {
23728 	return vm_map_pmap(map);
23729 }
23730 
23731 ppnum_t
23732 vm_map_get_phys_page(
23733 	vm_map_t                map,
23734 	vm_offset_t             addr)
23735 {
23736 	vm_object_offset_t      offset;
23737 	vm_object_t             object;
23738 	vm_map_offset_t         map_offset;
23739 	vm_map_entry_t          entry;
23740 	ppnum_t                 phys_page = 0;
23741 
23742 	map_offset = vm_map_trunc_page(addr, PAGE_MASK);
23743 
23744 	vm_map_lock(map);
23745 	while (vm_map_lookup_entry(map, map_offset, &entry)) {
23746 		if (entry->is_sub_map) {
23747 			vm_map_t        old_map;
23748 			vm_map_lock(VME_SUBMAP(entry));
23749 			old_map = map;
23750 			map = VME_SUBMAP(entry);
23751 			map_offset = (VME_OFFSET(entry) +
23752 			    (map_offset - entry->vme_start));
23753 			vm_map_unlock(old_map);
23754 			continue;
23755 		}
23756 		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
23757 			vm_map_unlock(map);
23758 			return (ppnum_t) 0;
23759 		}
23760 		if (VME_OBJECT(entry)->phys_contiguous) {
23761 			/* These are not standard pageable memory mappings */
23762 			/* If they are not present in the object they will  */
23763 			/* have to be picked up from the pager through the  */
23764 			/* fault mechanism.  */
23765 			if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
23766 				/* need to call vm_fault */
23767 				vm_map_unlock(map);
23768 				vm_fault(map, map_offset, VM_PROT_NONE,
23769 				    FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
23770 				    THREAD_UNINT, NULL, 0);
23771 				vm_map_lock(map);
23772 				continue;
23773 			}
23774 			offset = (VME_OFFSET(entry) +
23775 			    (map_offset - entry->vme_start));
23776 			phys_page = (ppnum_t)
23777 			    ((VME_OBJECT(entry)->vo_shadow_offset
23778 			    + offset) >> PAGE_SHIFT);
23779 			break;
23780 		}
23781 		offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
23782 		object = VME_OBJECT(entry);
23783 		vm_object_lock(object);
23784 		while (TRUE) {
23785 			vm_page_t dst_page = vm_page_lookup(object, offset);
23786 			if (dst_page == VM_PAGE_NULL) {
23787 				if (object->shadow) {
23788 					vm_object_t old_object;
23789 					vm_object_lock(object->shadow);
23790 					old_object = object;
23791 					offset = offset + object->vo_shadow_offset;
23792 					object = object->shadow;
23793 					vm_object_unlock(old_object);
23794 				} else {
23795 					vm_object_unlock(object);
23796 					break;
23797 				}
23798 			} else {
23799 				phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
23800 				vm_object_unlock(object);
23801 				break;
23802 			}
23803 		}
23804 		break;
23805 	}
23806 
23807 	vm_map_unlock(map);
23808 	return phys_page;
23809 }
23810 
23811 #if CONFIG_MAP_RANGES
23812 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23813 static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23814 
23815 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23816 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23817 
23818 /*
23819  * vm_map_range_map_init:
23820  *  initializes the VM range ID map to enable index lookup
23821  *  of user VM ranges based on VM tag from userspace.
23822  */
23823 static void
23824 vm_map_range_map_init(void)
23825 {
23826 	/*
23827 	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23828 	 * - the former is malloc metadata which should be kept separate
23829 	 * - the latter has its own ranges
23830 	 */
23831 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23832 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23833 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23834 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23835 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23836 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23837 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23838 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23839 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23840 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23841 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23842 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23843 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23844 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23845 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23846 	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23847 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
23848 	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
23849 }
23850 
23851 static struct mach_vm_range
23852 vm_map_range_random_uniform(
23853 	vm_map_size_t           req_size,
23854 	vm_map_offset_t         min_addr,
23855 	vm_map_offset_t         max_addr,
23856 	vm_map_offset_t         offmask)
23857 {
23858 	vm_map_offset_t random_addr;
23859 	struct mach_vm_range alloc;
23860 
23861 	req_size = (req_size + offmask) & ~offmask;
23862 	min_addr = (min_addr + offmask) & ~offmask;
23863 	max_addr = max_addr & ~offmask;
23864 
23865 	read_random(&random_addr, sizeof(random_addr));
23866 	random_addr %= (max_addr - req_size - min_addr);
23867 	random_addr &= ~offmask;
23868 
23869 	alloc.min_address = min_addr + random_addr;
23870 	alloc.max_address = min_addr + random_addr + req_size;
23871 	return alloc;
23872 }
23873 
23874 static vm_map_offset_t
23875 vm_map_range_offmask(void)
23876 {
23877 	uint32_t pte_depth;
23878 
23879 	/*
23880 	 * PTE optimizations
23881 	 *
23882 	 *
23883 	 * 16k pages systems
23884 	 * ~~~~~~~~~~~~~~~~~
23885 	 *
23886 	 * A single L1 (sub-)page covers the address space.
23887 	 * - L2 pages cover 64G,
23888 	 * - L3 pages cover 32M.
23889 	 *
23890 	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
23891 	 * As a result, we really only need to align the ranges to 32M to avoid
23892 	 * partial L3 pages.
23893 	 *
23894 	 * On macOS, the usage of L2 pages will increase, so as a result we will
23895 	 * want to align ranges to 64G in order to utilize them fully.
23896 	 *
23897 	 *
23898 	 * 4k pages systems
23899 	 * ~~~~~~~~~~~~~~~~
23900 	 *
23901 	 * A single L0 (sub-)page covers the address space.
23902 	 * - L1 pages cover 512G,
23903 	 * - L2 pages cover 1G,
23904 	 * - L3 pages cover 2M.
23905 	 *
23906 	 * The long tail of processes on a system will tend to have a VA usage
23907 	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
23908 	 * This is achievable with a single L1 and a few L2s without
23909 	 * randomization.
23910 	 *
23911 	 * However once randomization is introduced, the system will immediately
23912 	 * need several L1s and many more L2s. As a result:
23913 	 *
23914 	 * - on embedded devices, the cost of these extra pages isn't
23915 	 *   sustainable, and we just disable the feature entirely,
23916 	 *
23917 	 * - on macOS we align ranges to a 512G boundary so that the extra L1
23918 	 *   pages can be used to their full potential.
23919 	 */
23920 
23921 	/*
23922 	 * note, this function assumes _non exotic mappings_
23923 	 * which is why it uses the native kernel's PAGE_SHIFT.
23924 	 */
23925 #if XNU_PLATFORM_MacOSX
23926 	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
23927 #else /* !XNU_PLATFORM_MacOSX */
23928 	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
23929 #endif /* !XNU_PLATFORM_MacOSX */
23930 
23931 	if (pte_depth == 0) {
23932 		return 0;
23933 	}
23934 
23935 	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
23936 }
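/*
 * Worked examples for the return expression above (illustrative only):
 *   16k pages (PAGE_SHIFT == 14), pte_depth == 1:
 *       (1ull << (11 * 1 + 14)) - 1  ==  32M - 1   (embedded: L3 coverage)
 *   16k pages (PAGE_SHIFT == 14), pte_depth == 2:
 *       (1ull << (11 * 2 + 14)) - 1  ==  64G - 1   (macOS: L2 coverage)
 *   4k pages (PAGE_SHIFT == 12), pte_depth == 3:
 *       (1ull << (9 * 3 + 12)) - 1   ==  512G - 1  (macOS: L1 coverage)
 */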
23937 
23938 /*
23939  * vm_map_range_configure:
23940  *	configures the user vm_map ranges by increasing the maximum VA range of
23941  *  the map and carving out a range at the end of VA space (searching backwards
23942  *  in the newly expanded map).
23943  */
23944 kern_return_t
23945 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
23946 {
23947 	const vm_map_offset_t offmask = vm_map_range_offmask();
23948 	struct mach_vm_range data_range;
23949 	vm_map_offset_t default_end;
23950 	kern_return_t kr;
23951 
23952 	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23953 		/*
23954 		 * No point doing vm ranges in a 32bit address space.
23955 		 */
23956 		return KERN_NOT_SUPPORTED;
23957 	}
23958 
23959 	/* Should not be applying ranges to kernel map or kernel map submaps */
23960 	assert(vm_map_pmap(map) != kernel_pmap);
23961 
23962 #if XNU_PLATFORM_MacOSX
23963 
23964 	/*
23965 	 * on macOS, the address space is a massive 47 bits (128T),
23966 	 * with several carve outs that processes can't use:
23967 	 * - the shared region
23968 	 * - the commpage region
23969 	 * - the GPU carve out (if applicable)
23970 	 *
23971 	 * and when nano-malloc is in use it desires memory at the 96T mark.
23972 	 *
23973 	 * However, their location is architecture dependent:
23974 	 * - On intel, the shared region and commpage are
23975 	 *   at the very end of the usable address space (above +127T),
23976 	 *   and there is no GPU carve out, and pthread wants to place
23977 	 *   threads at the 112T mark (0x70T).
23978 	 *
23979 	 * - On arm64, these are in the same spot as on embedded devices:
23980 	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
23981 	 *   o commpage region: [63G,  64G)
23982 	 *   o GPU carve out:   [64G, 448G)
23983 	 *
23984 	 * This is convenient because the mappings at the end of the address
23985 	 * space (when they exist) are made by the kernel.
23986 	 *
23987 	 * The policy is to allocate a random 1T for the data heap
23988 	 * in the end of the address-space in the:
23989 	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23990 	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
23991 	 */
23992 
23993 	/* see NANOZONE_SIGNATURE in libmalloc */
23994 #if __x86_64__
23995 	default_end = 0x71ull << 40;
23996 #else
23997 	default_end = 0x61ull << 40;
23998 #endif
23999 	data_range  = vm_map_range_random_uniform(1ull << 40,
24000 	        default_end, 0x7full << 40, offmask);
24001 
24002 #else /* !XNU_PLATFORM_MacOSX */
24003 
24004 	/*
24005 	 * Embedded devices:
24006 	 *
24007 	 *   The default VA Size scales with the device physical memory.
24008 	 *
24009 	 *   Out of that:
24010 	 *   - the "zero" page typically uses 4G + some slide
24011 	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
24012 	 *
24013 	 *   Without the use of jumbo or any adjustment to the address space,
24014 	 *   a default VM map typically looks like this:
24015 	 *
24016 	 *       0G -->╒════════════╕
24017 	 *             │  pagezero  │
24018 	 *             │  + slide   │
24019 	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
24020 	 *             │            │
24021 	 *       6G -->├────────────┤
24022 	 *             │   shared   │
24023 	 *             │   region   │
24024 	 *      10G -->├────────────┤
24025 	 *             │            │
24026 	 *   max_va -->├────────────┤<-- vm_map_max(map)
24027 	 *             │            │
24028 	 *             ╎   jumbo    ╎
24029 	 *             ╎            ╎
24030 	 *             │            │
24031 	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24032 	 *             │  commpage  │
24033 	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24034 	 *             │            │
24035 	 *             ╎    GPU     ╎
24036 	 *             ╎  carveout  ╎
24037 	 *             │            │
24038 	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24039 	 *             │            │
24040 	 *             ╎            ╎
24041 	 *             ╎            ╎
24042 	 *             │            │
24043 	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24044 	 *
24045 	 *   When this drawing was made, "max_va" was smaller than
24046 	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24047 	 *   12G of address space for the zero-page, slide, files,
24048 	 *   binaries, heap ...
24049 	 *
24050 	 *   We will want to make a "heap/data" carve out inside
24051 	 *   the jumbo range of half of that usable space, assuming
24052 	 *   that this is less than a fourth of the jumbo range.
24053 	 *
24054 	 *   The assert below intends to catch when max_va grows
24055 	 *   too large for this heuristic.
24056 	 */
24057 
24058 	vm_map_lock_read(map);
24059 	default_end = vm_map_max(map);
24060 	vm_map_unlock_read(map);
24061 
24062 	/*
24063 	 * Check that we're not already jumbo'd,
24064 	 * or our address space was somehow modified.
24065 	 *
24066 	 * If so we cannot guarantee that we can set up the ranges
24067 	 * safely without interfering with the existing map.
24068 	 */
24069 	if (default_end > vm_compute_max_offset(true)) {
24070 		return KERN_NO_SPACE;
24071 	}
24072 
24073 	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24074 		/*
24075 		 * an override boot-arg was set, disable user-ranges
24076 		 *
24077 		 * XXX: this is problematic because it means these boot-args
24078 		 *      no longer test the behavior changing the value
24079 		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
24080 		 */
24081 		return KERN_NOT_SUPPORTED;
24082 	}
24083 
24084 	/* expand the default VM space to 64GB */
24085 	vm_map_set_jumbo(map);
24086 
24087 	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24088 	data_range = vm_map_range_random_uniform(GiB(10),
24089 	    default_end + PAGE_SIZE, vm_map_max(map), offmask);
24090 
24091 #endif /* !XNU_PLATFORM_MacOSX */
24092 
24093 	/*
24094 	 * Poke holes so that ASAN or people listing regions
24095 	 * do not think this space is free.
24096 	 */
24097 
24098 	if (default_end != data_range.min_address) {
24099 		kr = vm_map_enter(map, &default_end,
24100 		    data_range.min_address - default_end,
24101 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24102 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24103 		assert(kr == KERN_SUCCESS);
24104 	}
24105 
24106 	if (data_range.max_address != vm_map_max(map)) {
24107 		vm_map_entry_t entry;
24108 		vm_size_t size;
24109 
24110 		/*
24111 		 * Extend the end of the hole to the next VM entry or the end of the map,
24112 		 * whichever comes first.
24113 		 */
24114 		vm_map_lock_read(map);
24115 		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24116 		if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24117 			size = vm_map_max(map) - data_range.max_address;
24118 		} else {
24119 			size = entry->vme_start - data_range.max_address;
24120 		}
24121 		vm_map_unlock_read(map);
24122 
24123 		kr = vm_map_enter(map, &data_range.max_address, size,
24124 		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24125 		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24126 		assert(kr == KERN_SUCCESS);
24127 	}
24128 
24129 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24130 	if (needs_extra_jumbo_va) {
24131 		/* This will grow the address space to MACH_VM_MAX_ADDRESS */
24132 		vm_map_set_extra_jumbo(map);
24133 	}
24134 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24135 
24136 	vm_map_lock(map);
24137 	map->default_range.min_address = vm_map_min(map);
24138 	map->default_range.max_address = default_end;
24139 	map->data_range = data_range;
24140 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24141 	/* If process has "extra jumbo" entitlement, enable large file range */
24142 	if (needs_extra_jumbo_va) {
24143 		map->large_file_range = vm_map_range_random_uniform(TiB(1),
24144 		    MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24145 	}
24146 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24147 	map->uses_user_ranges = true;
24148 	vm_map_unlock(map);
24149 
24150 	return KERN_SUCCESS;
24151 }
24152 
24153 /*
24154  * vm_map_range_fork:
24155  *	clones the array of ranges from old_map to new_map in support
24156  *  of a VM map fork.
24157  */
24158 void
24159 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24160 {
24161 	if (!old_map->uses_user_ranges) {
24162 		/* nothing to do */
24163 		return;
24164 	}
24165 
24166 	new_map->default_range = old_map->default_range;
24167 	new_map->data_range = old_map->data_range;
24168 
24169 	if (old_map->extra_ranges_count) {
24170 		vm_map_user_range_t otable, ntable;
24171 		uint16_t count;
24172 
24173 		otable = old_map->extra_ranges;
24174 		count  = old_map->extra_ranges_count;
24175 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24176 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
24177 		memcpy(ntable, otable,
24178 		    count * sizeof(struct vm_map_user_range));
24179 
24180 		new_map->extra_ranges_count = count;
24181 		new_map->extra_ranges = ntable;
24182 	}
24183 
24184 	new_map->uses_user_ranges = true;
24185 }
24186 
24187 /*
24188  * vm_map_get_user_range:
24189  *	copy the VM user range for the given VM map and range ID.
24190  */
24191 kern_return_t
24192 vm_map_get_user_range(
24193 	vm_map_t                map,
24194 	vm_map_range_id_t       range_id,
24195 	mach_vm_range_t         range)
24196 {
24197 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
24198 		return KERN_INVALID_ARGUMENT;
24199 	}
24200 
24201 	switch (range_id) {
24202 	case UMEM_RANGE_ID_DEFAULT:
24203 		*range = map->default_range;
24204 		return KERN_SUCCESS;
24205 
24206 	case UMEM_RANGE_ID_HEAP:
24207 		*range = map->data_range;
24208 		return KERN_SUCCESS;
24209 
24210 	case UMEM_RANGE_ID_LARGE_FILE:
24211 		/*
24212 		 * Because this function tells a user-space process about the user
24213 		 * ranges in its VM map, this case communicates whether the large file
24214 		 * range is in use. Note that this is different from how the large file
24215 		 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24216 		 * VA policy and return either the large file range or data range,
24217 		 * depending on whether the large file range is enabled.
24218 		 */
24219 		if (map->large_file_range.min_address != map->large_file_range.max_address) {
24220 			/* large file range is configured and should be used */
24221 			*range = map->large_file_range;
24222 		} else {
24223 			return KERN_INVALID_ARGUMENT;
24224 		}
24225 		return KERN_SUCCESS;
24226 
24227 	default:
24228 		return KERN_INVALID_ARGUMENT;
24229 	}
24230 }
24231 
24232 static vm_map_range_id_t
24233 vm_map_user_range_resolve(
24234 	vm_map_t                map,
24235 	mach_vm_address_t       addr,
24236 	mach_vm_size_t          size,
24237 	mach_vm_range_t         range)
24238 {
24239 	struct mach_vm_range tmp;
24240 
24241 	vm_map_lock_assert_held(map);
24242 
24243 	static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24244 	static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24245 
24246 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
24247 		if (range) {
24248 			*range = map->default_range;
24249 		}
24250 		return UMEM_RANGE_ID_DEFAULT;
24251 	}
24252 
24253 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
24254 		if (range) {
24255 			*range = map->data_range;
24256 		}
24257 		return UMEM_RANGE_ID_HEAP;
24258 	}
24259 
24260 	if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24261 		if (range) {
24262 			*range = map->large_file_range;
24263 		}
24264 		return UMEM_RANGE_ID_LARGE_FILE;
24265 	}
24266 
24267 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
24268 		vm_map_user_range_t r = &map->extra_ranges[i];
24269 
24270 		tmp.min_address = r->vmur_min_address;
24271 		tmp.max_address = r->vmur_max_address;
24272 
24273 		if (mach_vm_range_contains(&tmp, addr, size)) {
24274 			if (range) {
24275 				*range = tmp;
24276 			}
24277 			return r->vmur_range_id;
24278 		}
24279 	}
24280 
24281 	if (range) {
24282 		range->min_address = range->max_address = 0;
24283 	}
24284 	return UMEM_RANGE_ID_DEFAULT;
24285 }
24286 #endif /* CONFIG_MAP_RANGES */
24287 
24288 void
24289 vm_map_kernel_flags_update_range_id(
24290 	vm_map_kernel_flags_t *vmkf,
24291 	vm_map_t map,
24292 	__unused vm_map_size_t size)
24293 {
24294 	if (map == kernel_map) {
24295 		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24296 			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24297 		}
24298 #if CONFIG_MAP_RANGES
24299 	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24300 	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
24301 		if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
24302 		    || size >= VM_LARGE_FILE_THRESHOLD) {
24303 			/*
24304 			 * if the map doesn't have the large file range configured,
24305 			 * the range will get resolved to the heap range in `vm_map_get_range`
24306 			 */
24307 			vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
24308 		} else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24309 			vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24310 		}
24311 #endif /* CONFIG_MAP_RANGES */
24312 	}
24313 }
24314 
24315 /*
24316  * vm_map_entry_has_device_pager:
24317  * Check if the vm map entry specified by the virtual address has a device pager.
24318  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24319  */
24320 boolean_t
24321 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24322 {
24323 	vm_map_entry_t entry;
24324 	vm_object_t object;
24325 	boolean_t result;
24326 
24327 	if (map == NULL) {
24328 		return FALSE;
24329 	}
24330 
24331 	vm_map_lock(map);
24332 	while (TRUE) {
24333 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24334 			result = FALSE;
24335 			break;
24336 		}
24337 		if (entry->is_sub_map) {
24338 			// Check the submap
24339 			vm_map_t submap = VME_SUBMAP(entry);
24340 			assert(submap != NULL);
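			/*
			 * Lock the submap before unlocking the parent map so the
			 * submap cannot go away in between (hand-over-hand locking).
			 */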
24341 			vm_map_lock(submap);
24342 			vm_map_unlock(map);
24343 			map = submap;
24344 			continue;
24345 		}
24346 		object = VME_OBJECT(entry);
24347 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24348 			result = TRUE;
24349 			break;
24350 		}
24351 		result = FALSE;
24352 		break;
24353 	}
24354 
24355 	vm_map_unlock(map);
24356 	return result;
24357 }
24358 
24359 
24360 #if MACH_ASSERT
24361 
24362 extern int pmap_ledgers_panic;
24363 extern int pmap_ledgers_panic_leeway;
24364 
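/*
 * Per-ledger drift statistics: for each ledger, count how many pmaps were
 * found with a positive ("over") or negative ("under") balance, and track
 * the cumulative and worst-case drift observed.
 */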
24365 #define LEDGER_DRIFT(__LEDGER)                    \
24366 	int             __LEDGER##_over;          \
24367 	ledger_amount_t __LEDGER##_over_total;    \
24368 	ledger_amount_t __LEDGER##_over_max;      \
24369 	int             __LEDGER##_under;         \
24370 	ledger_amount_t __LEDGER##_under_total;   \
24371 	ledger_amount_t __LEDGER##_under_max
24372 
24373 struct {
24374 	uint64_t        num_pmaps_checked;
24375 
24376 	LEDGER_DRIFT(phys_footprint);
24377 	LEDGER_DRIFT(internal);
24378 	LEDGER_DRIFT(internal_compressed);
24379 	LEDGER_DRIFT(external);
24380 	LEDGER_DRIFT(reusable);
24381 	LEDGER_DRIFT(iokit_mapped);
24382 	LEDGER_DRIFT(alternate_accounting);
24383 	LEDGER_DRIFT(alternate_accounting_compressed);
24384 	LEDGER_DRIFT(page_table);
24385 	LEDGER_DRIFT(purgeable_volatile);
24386 	LEDGER_DRIFT(purgeable_nonvolatile);
24387 	LEDGER_DRIFT(purgeable_volatile_compressed);
24388 	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24389 	LEDGER_DRIFT(tagged_nofootprint);
24390 	LEDGER_DRIFT(tagged_footprint);
24391 	LEDGER_DRIFT(tagged_nofootprint_compressed);
24392 	LEDGER_DRIFT(tagged_footprint_compressed);
24393 	LEDGER_DRIFT(network_volatile);
24394 	LEDGER_DRIFT(network_nonvolatile);
24395 	LEDGER_DRIFT(network_volatile_compressed);
24396 	LEDGER_DRIFT(network_nonvolatile_compressed);
24397 	LEDGER_DRIFT(media_nofootprint);
24398 	LEDGER_DRIFT(media_footprint);
24399 	LEDGER_DRIFT(media_nofootprint_compressed);
24400 	LEDGER_DRIFT(media_footprint_compressed);
24401 	LEDGER_DRIFT(graphics_nofootprint);
24402 	LEDGER_DRIFT(graphics_footprint);
24403 	LEDGER_DRIFT(graphics_nofootprint_compressed);
24404 	LEDGER_DRIFT(graphics_footprint_compressed);
24405 	LEDGER_DRIFT(neural_nofootprint);
24406 	LEDGER_DRIFT(neural_footprint);
24407 	LEDGER_DRIFT(neural_nofootprint_compressed);
24408 	LEDGER_DRIFT(neural_footprint_compressed);
24409 	LEDGER_DRIFT(neural_nofootprint_total);
24410 } pmap_ledgers_drift;
24411 
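/*
 * vm_map_pmap_check_ledgers:
 * Called when a pmap is torn down (see the pmap_destroy messages below):
 * every task ledger should balance back to zero by then.  Any non-zero
 * balance is logged and folded into pmap_ledgers_drift; depending on
 * pmap_ledgers_panic and the configured leeway, an imbalance either panics
 * or is merely reported.
 */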
24412 void
24413 vm_map_pmap_check_ledgers(
24414 	pmap_t          pmap,
24415 	ledger_t        ledger,
24416 	int             pid,
24417 	char            *procname)
24418 {
24419 	ledger_amount_t bal;
24420 	boolean_t       do_panic;
24421 
24422 	do_panic = FALSE;
24423 
24424 	pmap_ledgers_drift.num_pmaps_checked++;
24425 
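/*
 * Check a single ledger: report any non-zero balance, accumulate the
 * over/under drift statistics, and request a panic when the ledger is
 * marked panic-on-negative or the drift exceeds the configured leeway.
 */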
24426 #define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
24427 MACRO_BEGIN                                                             \
24428 	int panic_on_negative = TRUE;                                   \
24429 	ledger_get_balance(ledger,                                      \
24430 	                   task_ledgers.__LEDGER,                       \
24431 	                   &bal);                                       \
24432 	ledger_get_panic_on_negative(ledger,                            \
24433 	                             task_ledgers.__LEDGER,             \
24434 	                             &panic_on_negative);               \
24435 	if (bal != 0) {                                                 \
24436 	        if (panic_on_negative ||                                \
24437 	            (pmap_ledgers_panic &&                              \
24438 	             pmap_ledgers_panic_leeway > 0 &&                   \
24439 	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
24440 	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24441 	                do_panic = TRUE;                                \
24442 	        }                                                       \
24443 	        printf("LEDGER BALANCE proc %d (%s) "                   \
24444 	               "\"%s\" = %lld\n",                               \
24445 	               pid, procname, #__LEDGER, bal);                  \
24446 	        if (bal > 0) {                                          \
24447 	                pmap_ledgers_drift.__LEDGER##_over++;           \
24448 	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24449 	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24450 	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24451 	                }                                               \
24452 	        } else if (bal < 0) {                                   \
24453 	                pmap_ledgers_drift.__LEDGER##_under++;          \
24454 	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24455 	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24456 	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24457 	                }                                               \
24458 	        }                                                       \
24459 	}                                                               \
24460 MACRO_END
24461 
24462 	LEDGER_CHECK_BALANCE(phys_footprint);
24463 	LEDGER_CHECK_BALANCE(internal);
24464 	LEDGER_CHECK_BALANCE(internal_compressed);
24465 	LEDGER_CHECK_BALANCE(external);
24466 	LEDGER_CHECK_BALANCE(reusable);
24467 	LEDGER_CHECK_BALANCE(iokit_mapped);
24468 	LEDGER_CHECK_BALANCE(alternate_accounting);
24469 	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24470 	LEDGER_CHECK_BALANCE(page_table);
24471 	LEDGER_CHECK_BALANCE(purgeable_volatile);
24472 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24473 	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24474 	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24475 	LEDGER_CHECK_BALANCE(tagged_nofootprint);
24476 	LEDGER_CHECK_BALANCE(tagged_footprint);
24477 	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24478 	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24479 	LEDGER_CHECK_BALANCE(network_volatile);
24480 	LEDGER_CHECK_BALANCE(network_nonvolatile);
24481 	LEDGER_CHECK_BALANCE(network_volatile_compressed);
24482 	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24483 	LEDGER_CHECK_BALANCE(media_nofootprint);
24484 	LEDGER_CHECK_BALANCE(media_footprint);
24485 	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24486 	LEDGER_CHECK_BALANCE(media_footprint_compressed);
24487 	LEDGER_CHECK_BALANCE(graphics_nofootprint);
24488 	LEDGER_CHECK_BALANCE(graphics_footprint);
24489 	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24490 	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24491 	LEDGER_CHECK_BALANCE(neural_nofootprint);
24492 	LEDGER_CHECK_BALANCE(neural_footprint);
24493 	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24494 	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24495 	LEDGER_CHECK_BALANCE(neural_nofootprint_total);
24496 
24497 	if (do_panic) {
24498 		if (pmap_ledgers_panic) {
24499 			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24500 			    pmap, pid, procname);
24501 		} else {
24502 			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24503 			    pmap, pid, procname);
24504 		}
24505 	}
24506 }
24507 
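/*
 * vm_map_pmap_set_process:
 * Record the owning pid and process name on the map's pmap (MACH_ASSERT
 * builds only), so the ledger diagnostics above can identify the process.
 */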
24508 void
24509 vm_map_pmap_set_process(
24510 	vm_map_t map,
24511 	int pid,
24512 	char *procname)
24513 {
24514 	pmap_set_process(vm_map_pmap(map), pid, procname);
24515 }
24516 
24517 #endif /* MACH_ASSERT */
24518