xref: /xnu-11215.1.10/osfmk/vm/vm_map.c (revision 8d741a5de7ff4191bf97d57b9f54c2f6d4a15585)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_map.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Virtual memory mapping module.
64  */
65 
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68 
69 #include <vm/vm_options.h>
70 
71 #include <libkern/OSAtomic.h>
72 
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83 
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90 
91 #include <vm/cpm_internal.h>
92 #include <vm/memory_types.h>
93 #include <vm/vm_compressor_xnu.h>
94 #include <vm/vm_compressor_pager_internal.h>
95 #include <vm/vm_init_xnu.h>
96 #include <vm/vm_fault_internal.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object_internal.h>
99 #include <vm/vm_page_internal.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern_internal.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106 
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_memtag.h>
110 #include <vm/vm_protos_internal.h>
111 #include <vm/vm_purgeable_internal.h>
112 
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_shared_region_internal.h>
115 #include <vm/vm_map_store_internal.h>
116 #include <vm/vm_memory_entry_xnu.h>
117 #include <vm/memory_object_internal.h>
118 #include <vm/vm_memory_entry.h>
119 #include <vm/vm_sanitize_internal.h>
120 #if DEVELOPMENT || DEBUG
121 #include <vm/vm_compressor_info.h>
122 #endif /* DEVELOPMENT || DEBUG */
123 #include <san/kasan.h>
124 
125 #include <sys/resource.h>
126 #include <sys/random.h>
127 #include <sys/codesign.h>
128 #include <sys/code_signing.h>
129 #include <sys/mman.h>
130 #include <sys/reboot.h>
131 #include <sys/kdebug_triage.h>
132 #include <sys/reason.h>
133 
134 #include <libkern/section_keywords.h>
135 
136 #if DEVELOPMENT || DEBUG
137 extern int proc_selfcsflags(void);
138 int vm_log_xnu_user_debug = 0;
139 int panic_on_unsigned_execute = 0;
140 int panic_on_mlock_failure = 0;
141 #endif /* DEVELOPMENT || DEBUG */
142 
143 #if DEVELOPMENT || DEBUG
144 int debug4k_filter = 0;
145 char debug4k_proc_name[1024] = "";
146 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
147 int debug4k_panic_on_misaligned_sharing = 0;
148 const char *debug4k_category_name[] = {
149 	"error",        /* 0 */
150 	"life",         /* 1 */
151 	"load",         /* 2 */
152 	"fault",        /* 3 */
153 	"copy",         /* 4 */
154 	"share",        /* 5 */
155 	"adjust",       /* 6 */
156 	"pmap",         /* 7 */
157 	"mementry",     /* 8 */
158 	"iokit",        /* 9 */
159 	"upl",          /* 10 */
160 	"exc",          /* 11 */
161 	"vfs"           /* 12 */
162 };
163 #endif /* DEVELOPMENT || DEBUG */
164 int debug4k_no_cow_copyin = 0;
165 
166 
167 #if __arm64__
168 extern const int fourk_binary_compatibility_unsafe;
169 #endif /* __arm64__ */
170 extern int proc_selfpid(void);
171 extern char *proc_name_address(void *p);
172 extern const char *proc_best_name(struct proc *p);
173 
174 #if VM_MAP_DEBUG_APPLE_PROTECT
175 int vm_map_debug_apple_protect = 0;
176 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
177 #if VM_MAP_DEBUG_FOURK
178 int vm_map_debug_fourk = 0;
179 #endif /* VM_MAP_DEBUG_FOURK */
180 
181 #if DEBUG || DEVELOPMENT
182 static TUNABLE(bool, vm_map_executable_immutable,
183     "vm_map_executable_immutable", true);
184 #else
185 #define vm_map_executable_immutable true
186 #endif
187 
188 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
189 
190 extern u_int32_t random(void);  /* from <libkern/libkern.h> */
191 /* Internal prototypes
192  */
193 
/*
 * Accumulator for map entries being removed from a map ("zapped").
 * vmz_head points at the first entry on the chain and vmz_tail at the
 * link slot where the next entry should be appended, allowing O(1)
 * appends.  VM_MAP_ZAP_DECLARE declares an empty chain whose tail
 * points back at its own head slot.
 */
typedef struct vm_map_zap {
	vm_map_entry_t          vmz_head;
	vm_map_entry_t         *vmz_tail;
} *vm_map_zap_t;

#define VM_MAP_ZAP_DECLARE(zap) \
	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
201 
202 extern kern_return_t vm_map_wire_external(
203 	vm_map_t                map,
204 	vm_map_offset_ut        start_u,
205 	vm_map_offset_ut        end_u,
206 	vm_prot_ut              prot_u,
207 	boolean_t               user_wire) __exported;
208 
209 #if XNU_PLATFORM_MacOSX
210 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
211 #else
212 static
213 #endif
214 kern_return_t vm_map_copyin_common(
215 	vm_map_t                src_map,
216 	vm_map_address_ut       src_addr,
217 	vm_map_size_ut          len,
218 	boolean_t               src_destroy,
219 	boolean_t               src_volatile,
220 	vm_map_copy_t          *copy_result,                           /* OUT */
221 	boolean_t               use_maxprot);
222 
223 static vm_map_entry_t   vm_map_entry_insert(
224 	vm_map_t                map,
225 	vm_map_entry_t          insp_entry,
226 	vm_map_offset_t         start,
227 	vm_map_offset_t         end,
228 	vm_object_t             object,
229 	vm_object_offset_t      offset,
230 	vm_map_kernel_flags_t   vmk_flags,
231 	boolean_t               needs_copy,
232 	vm_prot_t               cur_protection,
233 	vm_prot_t               max_protection,
234 	vm_inherit_t            inheritance,
235 	boolean_t               clear_map_aligned);
236 
237 static void vm_map_simplify_range(
238 	vm_map_t        map,
239 	vm_map_offset_t start,
240 	vm_map_offset_t end);   /* forward */
241 
242 static boolean_t        vm_map_range_check(
243 	vm_map_t        map,
244 	vm_map_offset_t start,
245 	vm_map_offset_t end,
246 	vm_map_entry_t  *entry);
247 
248 static void vm_map_submap_pmap_clean(
249 	vm_map_t        map,
250 	vm_map_offset_t start,
251 	vm_map_offset_t end,
252 	vm_map_t        sub_map,
253 	vm_map_offset_t offset);
254 
255 static void             vm_map_pmap_enter(
256 	vm_map_t                map,
257 	vm_map_offset_t         addr,
258 	vm_map_offset_t         end_addr,
259 	vm_object_t             object,
260 	vm_object_offset_t      offset,
261 	vm_prot_t               protection);
262 
263 static void             _vm_map_clip_end(
264 	struct vm_map_header    *map_header,
265 	vm_map_entry_t          entry,
266 	vm_map_offset_t         end);
267 
268 static void             _vm_map_clip_start(
269 	struct vm_map_header    *map_header,
270 	vm_map_entry_t          entry,
271 	vm_map_offset_t         start);
272 
273 static kmem_return_t vm_map_delete(
274 	vm_map_t        map,
275 	vm_map_offset_t start,
276 	vm_map_offset_t end,
277 	vmr_flags_t     flags,
278 	kmem_guard_t    guard,
279 	vm_map_zap_t    zap);
280 
281 static void             vm_map_copy_insert(
282 	vm_map_t        map,
283 	vm_map_entry_t  after_where,
284 	vm_map_copy_t   copy);
285 
286 static kern_return_t    vm_map_copy_overwrite_unaligned(
287 	vm_map_t        dst_map,
288 	vm_map_entry_t  entry,
289 	vm_map_copy_t   copy,
290 	vm_map_address_t start,
291 	boolean_t       discard_on_success);
292 
293 static kern_return_t    vm_map_copy_overwrite_aligned(
294 	vm_map_t        dst_map,
295 	vm_map_entry_t  tmp_entry,
296 	vm_map_copy_t   copy,
297 	vm_map_offset_t start,
298 	pmap_t          pmap);
299 
300 static kern_return_t    vm_map_copyin_kernel_buffer(
301 	vm_map_t        src_map,
302 	vm_map_address_t src_addr,
303 	vm_map_size_t   len,
304 	boolean_t       src_destroy,
305 	vm_map_copy_t   *copy_result);  /* OUT */
306 
307 static kern_return_t    vm_map_copyout_kernel_buffer(
308 	vm_map_t        map,
309 	vm_map_address_t *addr, /* IN/OUT */
310 	vm_map_copy_t   copy,
311 	vm_map_size_t   copy_size,
312 	boolean_t       overwrite,
313 	boolean_t       consume_on_success);
314 
315 static void             vm_map_fork_share(
316 	vm_map_t        old_map,
317 	vm_map_entry_t  old_entry,
318 	vm_map_t        new_map);
319 
320 static boolean_t        vm_map_fork_copy(
321 	vm_map_t        old_map,
322 	vm_map_entry_t  *old_entry_p,
323 	vm_map_t        new_map,
324 	int             vm_map_copyin_flags);
325 
326 static kern_return_t    vm_map_wire_nested(
327 	vm_map_t                   map,
328 	vm_map_offset_t            start,
329 	vm_map_offset_t            end,
330 	vm_prot_t                  caller_prot,
331 	vm_tag_t                   tag,
332 	boolean_t                  user_wire,
333 	pmap_t                     map_pmap,
334 	vm_map_offset_t            pmap_addr,
335 	ppnum_t                   *physpage_p);
336 
337 static kern_return_t    vm_map_unwire_nested(
338 	vm_map_t                   map,
339 	vm_map_offset_t            start,
340 	vm_map_offset_t            end,
341 	boolean_t                  user_wire,
342 	pmap_t                     map_pmap,
343 	vm_map_offset_t            pmap_addr);
344 
345 static kern_return_t    vm_map_overwrite_submap_recurse(
346 	vm_map_t                   dst_map,
347 	vm_map_offset_t            dst_addr,
348 	vm_map_size_t              dst_size);
349 
350 static kern_return_t    vm_map_copy_overwrite_nested(
351 	vm_map_t                   dst_map,
352 	vm_map_offset_t            dst_addr,
353 	vm_map_copy_t              copy,
354 	boolean_t                  interruptible,
355 	pmap_t                     pmap,
356 	boolean_t                  discard_on_success);
357 
358 static kern_return_t    vm_map_remap_extract(
359 	vm_map_t                map,
360 	vm_map_offset_t         addr,
361 	vm_map_size_t           size,
362 	boolean_t               copy,
363 	vm_map_copy_t           map_copy,
364 	vm_prot_t               *cur_protection,
365 	vm_prot_t               *max_protection,
366 	vm_inherit_t            inheritance,
367 	vm_map_kernel_flags_t   vmk_flags);
368 
369 static void             vm_map_region_look_for_page(
370 	vm_map_t                   map,
371 	vm_map_offset_t            va,
372 	vm_object_t                object,
373 	vm_object_offset_t         offset,
374 	int                        max_refcnt,
375 	unsigned short             depth,
376 	vm_region_extended_info_t  extended,
377 	mach_msg_type_number_t count);
378 
379 static boolean_t        vm_map_region_has_obj_ref(
380 	vm_map_entry_t             entry,
381 	vm_object_t                object);
382 
383 
384 static kern_return_t    vm_map_willneed(
385 	vm_map_t        map,
386 	vm_map_offset_t start,
387 	vm_map_offset_t end);
388 
389 static kern_return_t    vm_map_reuse_pages(
390 	vm_map_t        map,
391 	vm_map_offset_t start,
392 	vm_map_offset_t end);
393 
394 static kern_return_t    vm_map_reusable_pages(
395 	vm_map_t        map,
396 	vm_map_offset_t start,
397 	vm_map_offset_t end);
398 
399 static kern_return_t    vm_map_can_reuse(
400 	vm_map_t        map,
401 	vm_map_offset_t start,
402 	vm_map_offset_t end);
403 
404 static kern_return_t    vm_map_zero(
405 	vm_map_t        map,
406 	vm_map_offset_t start,
407 	vm_map_offset_t end);
408 
409 static kern_return_t    vm_map_random_address_for_size(
410 	vm_map_t                map,
411 	vm_map_offset_t        *address,
412 	vm_map_size_t           size,
413 	vm_map_kernel_flags_t   vmk_flags);
414 
415 
416 #if CONFIG_MAP_RANGES
417 
418 static vm_map_range_id_t vm_map_user_range_resolve(
419 	vm_map_t                map,
420 	mach_vm_address_t       addr,
421 	mach_vm_address_t       size,
422 	mach_vm_range_t         range);
423 
424 #endif /* CONFIG_MAP_RANGES */
425 #if MACH_ASSERT
426 static kern_return_t    vm_map_pageout(
427 	vm_map_t        map,
428 	vm_map_offset_t start,
429 	vm_map_offset_t end);
430 #endif /* MACH_ASSERT */
431 
432 kern_return_t vm_map_corpse_footprint_collect(
433 	vm_map_t        old_map,
434 	vm_map_entry_t  old_entry,
435 	vm_map_t        new_map);
436 void vm_map_corpse_footprint_collect_done(
437 	vm_map_t        new_map);
438 void vm_map_corpse_footprint_destroy(
439 	vm_map_t        map);
440 kern_return_t vm_map_corpse_footprint_query_page_info(
441 	vm_map_t        map,
442 	vm_map_offset_t va,
443 	int             *disposition_p);
444 void vm_map_footprint_query_page_info(
445 	vm_map_t        map,
446 	vm_map_entry_t  map_entry,
447 	vm_map_offset_t curr_s_offset,
448 	int             *disposition_p);
449 
450 #if CONFIG_MAP_RANGES
451 static void vm_map_range_map_init(void);
452 #endif /* CONFIG_MAP_RANGES */
453 
454 pid_t find_largest_process_vm_map_entries(void);
455 
456 __attribute__((always_inline))
457 int
vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)458 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
459 {
460 	int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
461 
462 	/* in vmk flags the meaning of fixed/anywhere is inverted */
463 	return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
464 }
465 
466 __attribute__((always_inline, overloadable))
467 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags,vm_tag_t vm_tag)468 vm_map_kernel_flags_set_vmflags(
469 	vm_map_kernel_flags_t  *vmk_flags,
470 	int                     vm_flags,
471 	vm_tag_t                vm_tag)
472 {
473 	vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
474 	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
475 	vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
476 	vmk_flags->vm_tag = vm_tag;
477 }
478 
__attribute__((always_inline, overloadable))
void
vm_map_kernel_flags_set_vmflags(
	vm_map_kernel_flags_t  *vmk_flags,
	int                     vm_flags_and_tag)
{
	/*
	 * Overload taking a combined "flags + tag" word: the VM_FLAGS_*
	 * bits are installed (with fixed/anywhere flipped, since their
	 * encoding is inverted inside vmk_flags), and the tag portion is
	 * extracted separately via VM_GET_FLAGS_ALIAS.
	 */
	vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
	vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
	vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
	VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
}
490 
__attribute__((always_inline))
void
vm_map_kernel_flags_and_vmflags(
	vm_map_kernel_flags_t  *vmk_flags,
	int                     vm_flags_mask)
{
	/*
	 * Clear every VM flag bit of vmk_flags that is not set in
	 * vm_flags_mask (a straight AND on the raw bits).
	 */
	/* this function doesn't handle the inverted FIXED/ANYWHERE */
	assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
	vmk_flags->__vm_flags &= vm_flags_mask;
}
501 
502 __attribute__((always_inline))
503 bool
vm_map_kernel_flags_check_vm_and_kflags(vm_map_kernel_flags_t vmk_flags,int vm_flags_mask)504 vm_map_kernel_flags_check_vm_and_kflags(
505 	vm_map_kernel_flags_t   vmk_flags,
506 	int                     vm_flags_mask)
507 {
508 	return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
509 }
510 
/*
 * Returns true when vmk_flags carries no user VM_FLAGS_* bit outside of
 * vm_flags_mask.  On DEBUG/DEVELOPMENT builds it additionally verifies
 * that each vmf_* bitfield lines up exactly with its corresponding
 * VM_FLAGS_* constant; all of those checks compile to nothing when the
 * layout is correct.
 */
bool
vm_map_kernel_flags_check_vmflags(
	vm_map_kernel_flags_t   vmk_flags,
	int                     vm_flags_mask)
{
	int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;

	/* Note: up to 16 still has good calling conventions */
	static_assert(sizeof(vm_map_kernel_flags_t) == 8);

#if DEBUG || DEVELOPMENT
	/*
	 * All of this compiles to nothing if all checks pass.
	 */
	/*
	 * check(field, value): set only the VM_FLAGS_* "value" bits, clear
	 * the bitfield "field", and assert nothing is left — i.e. "field"
	 * occupies exactly the bits of "value".
	 */
#define check(field, value)  ({ \
	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
	fl.__vm_flags = (value); \
	fl.field = 0; \
	assert(fl.__vm_flags == 0); \
})

	/* bits 0-7 */
	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
	check(vmf_purgeable, VM_FLAGS_PURGABLE);
	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
	check(vmf_permanent, VM_FLAGS_PERMANENT);

	/* bits 8-15 */
	check(vmf_tpro, VM_FLAGS_TPRO);
	check(vmf_overwrite, VM_FLAGS_OVERWRITE);

	/* bits 16-23 */
	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);

	{
		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;

		/* check user tags will never clip */
		fl.vm_tag = VM_MEMORY_COUNT - 1;
		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);

		/* check kernel tags will never clip */
		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
	}


#undef check
#endif /* DEBUG || DEVELOPMENT */

	return (vmflags & ~vm_flags_mask) == 0;
}
569 
570 /*
571  * Macros to copy a vm_map_entry. We must be careful to correctly
572  * manage the wired page count. vm_map_entry_copy() creates a new
573  * map entry to the same memory - the wired count in the new entry
574  * must be set to zero. vm_map_entry_copy_full() creates a new
575  * entry that is identical to the old entry.  This preserves the
576  * wire count; it's used for map splitting and zone changing in
577  * vm_map_copyout.
578  */
579 
/*
 * Reset the code-signing-monitor association and the xnu-user-debug
 * flag on a freshly copied map entry: these are per-mapping attributes
 * that must not be inherited by the copy.
 */
static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	/* optionally log when the user-debug flag is cleared on copy */
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
	new->vme_xnu_user_debug = FALSE;
}
606 
607 /*
608  * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
609  * But for security reasons on some platforms, we don't want the
610  * new mapping to be "used for jit", so we reset the flag here.
611  */
static inline void
vm_map_entry_copy_code_signing(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old __unused)
{
	if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
		/* policy allows JIT copies: the flag was carried over as-is */
		assert(new->used_for_jit == old->used_for_jit);
	} else {
		if (old->used_for_jit) {
			/*
			 * Copying a JIT region is not allowed on this map:
			 * trace the event, log, and strip execute rights so
			 * the copy cannot end up writable+executable.
			 */
			DTRACE_VM3(cs_wx,
			    uint64_t, new->vme_start,
			    uint64_t, new->vme_end,
			    vm_prot_t, new->protection);
			printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    "removing execute access");
			new->protection &= ~VM_PROT_EXECUTE;
			new->max_protection &= ~VM_PROT_EXECUTE;
		}
		new->used_for_jit = FALSE;
	}
}
639 
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
	/*
	 * Bit-for-bit copy of "old" into "new" (wired counts included).
	 * Before the structure assignment overwrites "new", adjust the
	 * reference counts of the backtrace refs involved: drop the refs
	 * "new" currently holds and take an extra ref on each of "old"'s,
	 * so that after the copy each entry owns its own reference.
	 */
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
#if VM_BTLOG_TAGS
	/* Discard the btref that might be in the new entry */
	if (new->vme_kernel_object) {
		btref_put(new->vme_tag_btref);
	}
	/* Retain the btref in the old entry to account for its copy */
	if (old->vme_kernel_object) {
		btref_retain(old->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
	*new = *old;
}
665 
666 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)667 vm_map_entry_copy(
668 	vm_map_t map,
669 	vm_map_entry_t new,
670 	vm_map_entry_t old)
671 {
672 	vm_map_entry_copy_full(new, old);
673 
674 	new->is_shared = FALSE;
675 	new->needs_wakeup = FALSE;
676 	new->in_transition = FALSE;
677 	new->wired_count = 0;
678 	new->user_wired_count = 0;
679 	new->vme_permanent = FALSE;
680 	vm_map_entry_copy_code_signing(map, new, old);
681 	vm_map_entry_copy_csm_assoc(map, new, old);
682 	if (new->iokit_acct) {
683 		assertf(!new->use_pmap, "old %p new %p\n", old, new);
684 		new->iokit_acct = FALSE;
685 		new->use_pmap = TRUE;
686 	}
687 	new->vme_resilient_codesign = FALSE;
688 	new->vme_resilient_media = FALSE;
689 	new->vme_atomic = FALSE;
690 	new->vme_no_copy_on_read = FALSE;
691 }
692 
693 /*
694  * Normal lock_read_to_write() returns FALSE/0 on failure.
695  * These functions evaluate to zero on success and non-zero value on failure.
696  */
697 __attribute__((always_inline))
698 int
vm_map_lock_read_to_write(vm_map_t map)699 vm_map_lock_read_to_write(vm_map_t map)
700 {
701 	if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
702 		DTRACE_VM(vm_map_lock_upgrade);
703 		return 0;
704 	}
705 	return 1;
706 }
707 
708 __attribute__((always_inline))
709 boolean_t
vm_map_try_lock(vm_map_t map)710 vm_map_try_lock(vm_map_t map)
711 {
712 	if (lck_rw_try_lock_exclusive(&(map)->lock)) {
713 		DTRACE_VM(vm_map_lock_w);
714 		return TRUE;
715 	}
716 	return FALSE;
717 }
718 
719 __attribute__((always_inline))
720 boolean_t
vm_map_try_lock_read(vm_map_t map)721 vm_map_try_lock_read(vm_map_t map)
722 {
723 	if (lck_rw_try_lock_shared(&(map)->lock)) {
724 		DTRACE_VM(vm_map_lock_r);
725 		return TRUE;
726 	}
727 	return FALSE;
728 }
729 
730 /*!
731  * @function kdp_vm_map_is_acquired_exclusive
732  *
733  * @abstract
734  * Checks if vm map is acquired exclusive.
735  *
736  * @discussion
737  * NOT SAFE: To be used only by kernel debugger.
738  *
739  * @param map map to check
740  *
741  * @returns TRUE if the map is acquired exclusively.
742  */
743 boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)744 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
745 {
746 	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
747 }
748 
749 /*
750  * Routines to get the page size the caller should
751  * use while inspecting the target address space.
752  * Use the "_safely" variant if the caller is dealing with a user-provided
753  * array whose size depends on the page size, to avoid any overflow or
754  * underflow of a user-allocated buffer.
755  */
756 int
vm_self_region_page_shift_safely(vm_map_t target_map)757 vm_self_region_page_shift_safely(
758 	vm_map_t target_map)
759 {
760 	int effective_page_shift = 0;
761 
762 	if (PAGE_SIZE == (4096)) {
763 		/* x86_64 and 4k watches: always use 4k */
764 		return PAGE_SHIFT;
765 	}
766 	/* did caller provide an explicit page size for this thread to use? */
767 	effective_page_shift = thread_self_region_page_shift();
768 	if (effective_page_shift) {
769 		/* use the explicitly-provided page size */
770 		return effective_page_shift;
771 	}
772 	/* no explicit page size: use the caller's page size... */
773 	effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
774 	if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
775 		/* page size match: safe to use */
776 		return effective_page_shift;
777 	}
778 	/* page size mismatch */
779 	return -1;
780 }
781 int
vm_self_region_page_shift(vm_map_t target_map)782 vm_self_region_page_shift(
783 	vm_map_t target_map)
784 {
785 	int effective_page_shift;
786 
787 	effective_page_shift = vm_self_region_page_shift_safely(target_map);
788 	if (effective_page_shift == -1) {
789 		/* no safe value but OK to guess for caller */
790 		effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
791 		    VM_MAP_PAGE_SHIFT(target_map));
792 	}
793 	return effective_page_shift;
794 }
795 
796 
797 /*
798  *	Decide if we want to allow processes to execute from their data or stack areas.
799  *	override_nx() returns true if we do.  Data/stack execution can be enabled independently
800  *	for 32 and 64 bit processes.  Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
801  *	or allow_stack_exec to enable data execution for that type of data area for that particular
802  *	ABI (or both by or'ing the flags together).  These are initialized in the architecture
803  *	specific pmap files since the default behavior varies according to architecture.  The
804  *	main reason it varies is because of the need to provide binary compatibility with old
805  *	applications that were written before these restrictions came into being.  In the old
806  *	days, an app could execute anything it could read, but this has slowly been tightened
807  *	up over time.  The default behavior is:
808  *
809  *	32-bit PPC apps		may execute from both stack and data areas
810  *	32-bit Intel apps	may execute from data areas but not stack
811  *	64-bit PPC/Intel apps	may not execute from either data or stack
812  *
813  *	An application on any architecture may override these defaults by explicitly
814  *	adding PROT_EXEC permission to the page in question with the mprotect(2)
815  *	system call.  This code here just determines what happens when an app tries to
816  *      execute from a page that lacks execute permission.
817  *
818  *	Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
819  *	default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
820  *	a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
821  *	execution from data areas for a particular binary even if the arch normally permits it. As
822  *	a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
823  *	to support some complicated use cases, notably browsers with out-of-process plugins that
824  *	are not all NX-safe.
825  */
826 
827 extern int allow_data_exec, allow_stack_exec;
828 
829 int
override_nx(vm_map_t map,uint32_t user_tag)830 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
831 {
832 	int current_abi;
833 
834 	if (map->pmap == kernel_pmap) {
835 		return FALSE;
836 	}
837 
838 	/*
839 	 * Determine if the app is running in 32 or 64 bit mode.
840 	 */
841 
842 	if (vm_map_is_64bit(map)) {
843 		current_abi = VM_ABI_64;
844 	} else {
845 		current_abi = VM_ABI_32;
846 	}
847 
848 	/*
849 	 * Determine if we should allow the execution based on whether it's a
850 	 * stack or data area and the current architecture.
851 	 */
852 
853 	if (user_tag == VM_MEMORY_STACK) {
854 		return allow_stack_exec & current_abi;
855 	}
856 
857 	return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
858 }
859 
860 
861 /*
862  *	Virtual memory maps provide for the mapping, protection,
863  *	and sharing of virtual memory objects.  In addition,
864  *	this module provides for an efficient virtual copy of
865  *	memory from one map to another.
866  *
867  *	Synchronization is required prior to most operations.
868  *
869  *	Maps consist of an ordered doubly-linked list of simple
870  *	entries; a single hint is used to speed up lookups.
871  *
872  *	Sharing maps have been deleted from this version of Mach.
873  *	All shared objects are now mapped directly into the respective
874  *	maps.  This requires a change in the copy on write strategy;
875  *	the asymmetric (delayed) strategy is used for shared temporary
876  *	objects instead of the symmetric (shadow) strategy.  All maps
877  *	are now "top level" maps (either task map, kernel map or submap
878  *	of the kernel map).
879  *
880  *	Since portions of maps are specified by start/end addresses,
881  *	which may not align with existing map entries, all
882  *	routines merely "clip" entries to these start/end values.
883  *	[That is, an entry is split into two, bordering at a
884  *	start or end value.]  Note that these clippings may not
885  *	always be necessary (as the two resulting entries are then
886  *	not changed); however, the clipping is done for convenience.
887  *	No attempt is currently made to "glue back together" two
888  *	abutting entries.
889  *
890  *	The symmetric (shadow) copy strategy implements virtual copy
891  *	by copying VM object references from one map to
892  *	another, and then marking both regions as copy-on-write.
893  *	It is important to note that only one writeable reference
894  *	to a VM object region exists in any map when this strategy
895  *	is used -- this means that shadow object creation can be
896  *	delayed until a write operation occurs.  The symmetric (delayed)
897  *	strategy allows multiple maps to have writeable references to
898  *	the same region of a vm object, and hence cannot delay creating
899  *	its copy objects.  See vm_object_copy_quickly() in vm_object.c.
900  *	Copying of permanent objects is completely different; see
901  *	vm_object_copy_strategically() in vm_object.c.
902  */
903 
904 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
905 
906 #define VM_MAP_ZONE_NAME        "maps"
907 #define VM_MAP_ZFLAGS           (ZC_NOENCRYPT | ZC_VM)
908 
909 #define VM_MAP_ENTRY_ZONE_NAME  "VM map entries"
910 #define VM_MAP_ENTRY_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
911 
912 #define VM_MAP_HOLES_ZONE_NAME  "VM map holes"
913 #define VM_MAP_HOLES_ZFLAGS     (ZC_NOENCRYPT | ZC_VM)
914 
915 /*
916  * Asserts that a vm_map_copy object is coming from the
917  * vm_map_copy_zone to ensure that it isn't a fake constructed
918  * anywhere else.
919  */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* panics if "copy" is not a genuine element of the vm_map_copy zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
925 
926 /*
927  *	vm_map_require:
928  *
929  *	Ensures that the argument is memory allocated from the genuine
930  *	vm map zone. (See zone_id_require_allow_foreign).
931  */
void
vm_map_require(vm_map_t map)
{
	/* panics if "map" is not a genuine element of the vm_map zone */
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
937 
/* max number of VM maps that can be created before zalloc is up */
#define VM_MAP_EARLY_COUNT_MAX         16
/* memory stolen at boot to cram into the maps / entries / holes zones */
static __startup_data vm_offset_t      map_data;
static __startup_data vm_size_t        map_data_size;
static __startup_data vm_offset_t      kentry_data;
static __startup_data vm_size_t        kentry_data_size;
static __startup_data vm_offset_t      map_holes_data;
static __startup_data vm_size_t        map_holes_data_size;
/* locations of early vm_map_t pointers, fixed up by vm_map_relocate_early_maps() */
static __startup_data vm_map_t        *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
static __startup_data uint32_t         early_map_count;

#if XNU_TARGET_OS_OSX
#define         NO_COALESCE_LIMIT  ((1024 * 128) - 1)
#else /* XNU_TARGET_OS_OSX */
#define         NO_COALESCE_LIMIT  0
#endif /* XNU_TARGET_OS_OSX */

/* Skip acquiring locks if we're in the midst of a kernel core dump */
unsigned int not_in_kdp = 1;
958 
959 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)960 vm_map_set_cache_attr(
961 	vm_map_t        map,
962 	vm_map_offset_t va)
963 {
964 	vm_map_entry_t  map_entry;
965 	vm_object_t     object;
966 	kern_return_t   kr = KERN_SUCCESS;
967 
968 	vm_map_lock_read(map);
969 
970 	if (!vm_map_lookup_entry(map, va, &map_entry) ||
971 	    map_entry->is_sub_map) {
972 		/*
973 		 * that memory is not properly mapped
974 		 */
975 		kr = KERN_INVALID_ARGUMENT;
976 		goto done;
977 	}
978 	object = VME_OBJECT(map_entry);
979 
980 	if (object == VM_OBJECT_NULL) {
981 		/*
982 		 * there should be a VM object here at this point
983 		 */
984 		kr = KERN_INVALID_ARGUMENT;
985 		goto done;
986 	}
987 	vm_object_lock(object);
988 	object->set_cache_attr = TRUE;
989 	vm_object_unlock(object);
990 
991 	vm_map_set_cache_attr_count++;
992 done:
993 	vm_map_unlock_read(map);
994 
995 	return kr;
996 }
997 
998 
999 #if CONFIG_CODE_DECRYPTION
1000 /*
1001  * vm_map_apple_protected:
1002  * This remaps the requested part of the object with an object backed by
1003  * the decrypting pager.
1004  * crypt_info contains entry points and session data for the crypt module.
1005  * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1006  * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1007  */
kern_return_t
vm_map_apple_protected(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_offset_t      crypto_backing_offset,
	struct pager_crypt_info *crypt_info,
	uint32_t                cryptid)
{
	boolean_t       map_locked;
	kern_return_t   kr;
	vm_map_entry_t  map_entry;
	struct vm_map_entry tmp_entry;      /* snapshot of the entry, valid after unlock */
	memory_object_t unprotected_mem_obj;
	vm_object_t     protected_object;
	vm_map_offset_t map_addr;
	vm_map_offset_t start_aligned, end_aligned;
	vm_object_offset_t      crypto_start, crypto_end;
	boolean_t       cache_pager;

	map_locked = FALSE;
	unprotected_mem_obj = MEMORY_OBJECT_NULL;

	/* reject ranges that wrap or fall outside the map */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}
	/*
	 * Align to the system page size first, then to the map's own page
	 * size (which may be coarser).
	 */
	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));

#if __arm64__
	/*
	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
	 * so we might have to loop and establish up to 3 mappings:
	 *
	 * + the first 16K-page, which might overlap with the previous
	 *   4K-aligned mapping,
	 * + the center,
	 * + the last 16K-page, which might overlap with the next
	 *   4K-aligned mapping.
	 * Each of these mapping might be backed by a vnode pager (if
	 * properly page-aligned) or a "fourk_pager", itself backed by a
	 * vnode pager (if 4K-aligned but not page-aligned).
	 */
#endif /* __arm64__ */

	map_addr = start_aligned;
	/* one iteration per map entry covering [start_aligned, end) */
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* ensure mapped memory is mapped as executable, except
		 * for the model decryption flow */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/* there should be a VM object here at this point */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			/* only the first entry may start before "start" */
			if (tmp_entry.vme_start != start_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			/* only the last entry may end past "end" */
			if (tmp_entry.vme_end != end_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key.  It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
		 */
		unprotected_mem_obj = apple_protect_pager_setup(
			protected_object,
			VME_OFFSET(&tmp_entry),
			crypto_backing_offset,
			crypt_info,
			crypto_start,
			crypto_end,
			cache_pager);

		/* release extra ref on protected object */
		vm_object_deallocate(protected_object);

		if (unprotected_mem_obj == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/* can overwrite an immutable mapping */
		vm_map_kernel_flags_t vmk_flags = {
			.vmf_fixed = true,
			.vmf_overwrite = true,
			.vmkf_overwrite_immutable = true,
		};
		/* make the new mapping as "permanent" as the one it replaces */
		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;

		/* map this memory object in place of the current one */
		map_addr = tmp_entry.vme_start;
		kr = mach_vm_map_kernel(map,
		    vm_sanitize_wrap_addr_ref(&map_addr),
		    (tmp_entry.vme_end -
		    tmp_entry.vme_start),
		    (mach_vm_offset_t) 0,
		    vmk_flags,
		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
		    0,
		    TRUE,
		    tmp_entry.protection,
		    tmp_entry.max_protection,
		    tmp_entry.inheritance);
		assertf(kr == KERN_SUCCESS,
		    "kr = 0x%x\n", kr);
		assertf(map_addr == tmp_entry.vme_start,
		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
		    (uint64_t)map_addr,
		    (uint64_t) tmp_entry.vme_start,
		    &tmp_entry);

#if VM_MAP_DEBUG_APPLE_PROTECT
		if (vm_map_debug_apple_protect) {
			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
			    " backing:[object:%p,offset:0x%llx,"
			    "crypto_backing_offset:0x%llx,"
			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
			    map,
			    (uint64_t) map_addr,
			    (uint64_t) (map_addr + (tmp_entry.vme_end -
			    tmp_entry.vme_start)),
			    unprotected_mem_obj,
			    protected_object,
			    VME_OFFSET(&tmp_entry),
			    crypto_backing_offset,
			    crypto_start,
			    crypto_end);
		}
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

		/*
		 * Release the reference obtained by
		 * apple_protect_pager_setup().
		 * The mapping (if it succeeded) is now holding a reference on
		 * the memory object.
		 */
		memory_object_deallocate(unprotected_mem_obj);
		unprotected_mem_obj = MEMORY_OBJECT_NULL;

		/* continue with next map entry */
		crypto_backing_offset += (tmp_entry.vme_end -
		    tmp_entry.vme_start);
		crypto_backing_offset -= crypto_start;
	}
	kr = KERN_SUCCESS;

done:
	if (map_locked) {
		vm_map_unlock(map);
	}
	return kr;
}
1235 #endif  /* CONFIG_CODE_DECRYPTION */
1236 
1237 
/* lock group and attributes used by all VM map locks */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

#if XNU_TARGET_OS_OSX
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
#else /* XNU_TARGET_OS_OSX */
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
#endif /* XNU_TARGET_OS_OSX */
TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
/* bitmask of VM_MEMORY_MALLOC* tags excluded from copy-on-write; built in vm_map_init() */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
int vm_check_map_sanity = 0;
#endif
1255 
1256 /*
1257  *	vm_map_init:
1258  *
1259  *	Initialize the vm_map module.  Must be called before
1260  *	any other vm_map routines.
1261  *
1262  *	Map and entry structures are allocated from zones -- we must
1263  *	initialize those zones.
1264  *
1265  *	There are three zones of interest:
1266  *
1267  *	vm_map_zone:		used to allocate maps.
1268  *	vm_map_entry_zone:	used to allocate map entries.
1269  *
1270  *	LP32:
1271  *	vm_map_entry_reserved_zone:     fallback zone for kernel map entries
1272  *
1273  *	The kernel allocates map entries from a special zone that is initially
1274  *	"crammed" with memory.  It would be difficult (perhaps impossible) for
1275  *	the kernel to allocate more memory to a entry zone when it became
1276  *	empty since the very act of allocating memory implies the creation
1277  *	of a new entry.
1278  */
1279 __startup_func
1280 void
vm_map_init(void)1281 vm_map_init(void)
1282 {
1283 
1284 #if MACH_ASSERT
1285 	PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1286 	    sizeof(debug4k_filter));
1287 #endif /* MACH_ASSERT */
1288 
1289 	zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1290 	    VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1291 
1292 	/*
1293 	 * Don't quarantine because we always need elements available
1294 	 * Disallow GC on this zone... to aid the GC.
1295 	 */
1296 	zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1297 	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1298 	    ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1299 		z->z_elems_rsv = (uint16_t)(32 *
1300 		(ml_early_cpu_max_number() + 1));
1301 	});
1302 
1303 	zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1304 	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1305 	    ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1306 		z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1307 	});
1308 
1309 	zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1310 	    ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1311 
1312 	/*
1313 	 * Add the stolen memory to zones, adjust zone size and stolen counts.
1314 	 */
1315 	zone_cram_early(vm_map_zone, map_data, map_data_size);
1316 	zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1317 	zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1318 	printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1319 	    zone_count_free(vm_map_zone),
1320 	    zone_count_free(vm_map_entry_zone),
1321 	    zone_count_free(vm_map_holes_zone));
1322 
1323 	/*
1324 	 * Since these are covered by zones, remove them from stolen page accounting.
1325 	 */
1326 	VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1327 
1328 #if VM_MAP_DEBUG_APPLE_PROTECT
1329 	PE_parse_boot_argn("vm_map_debug_apple_protect",
1330 	    &vm_map_debug_apple_protect,
1331 	    sizeof(vm_map_debug_apple_protect));
1332 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1333 #if VM_MAP_DEBUG_APPLE_FOURK
1334 	PE_parse_boot_argn("vm_map_debug_fourk",
1335 	    &vm_map_debug_fourk,
1336 	    sizeof(vm_map_debug_fourk));
1337 #endif /* VM_MAP_DEBUG_FOURK */
1338 
1339 	if (malloc_no_cow) {
1340 		vm_memory_malloc_no_cow_mask = 0ULL;
1341 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1342 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1343 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1344 #if XNU_TARGET_OS_OSX
1345 		/*
1346 		 * On macOS, keep copy-on-write for MALLOC_LARGE because
1347 		 * realloc() may use vm_copy() to transfer the old contents
1348 		 * to the new location.
1349 		 */
1350 #else /* XNU_TARGET_OS_OSX */
1351 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1352 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1353 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1354 #endif /* XNU_TARGET_OS_OSX */
1355 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1356 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1357 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1358 		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1359 //		vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1360 		PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1361 		    &vm_memory_malloc_no_cow_mask,
1362 		    sizeof(vm_memory_malloc_no_cow_mask));
1363 	}
1364 
1365 #if CONFIG_MAP_RANGES
1366 	vm_map_range_map_init();
1367 #endif /* CONFIG_MAP_RANGES */
1368 
1369 #if DEBUG
1370 	PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1371 	if (vm_check_map_sanity) {
1372 		kprintf("VM sanity checking enabled\n");
1373 	} else {
1374 		kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1375 	}
1376 #endif /* DEBUG */
1377 
1378 #if DEVELOPMENT || DEBUG
1379 	PE_parse_boot_argn("panic_on_unsigned_execute",
1380 	    &panic_on_unsigned_execute,
1381 	    sizeof(panic_on_unsigned_execute));
1382 	PE_parse_boot_argn("panic_on_mlock_failure",
1383 	    &panic_on_mlock_failure,
1384 	    sizeof(panic_on_mlock_failure));
1385 #endif /* DEVELOPMENT || DEBUG */
1386 }
1387 
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data       = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	/* carve the single stolen range into the three per-zone chunks */
	kentry_data    = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
1436 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1437 
__startup_func
static void
vm_kernel_boostraped(void)
{
	/*
	 * Runs at STARTUP(ZALLOC, RANK_SECOND): the zone allocator can now
	 * expand dynamically, so enable caching on the hot VM zones and
	 * report how much of the early crammed memory remains.
	 * NOTE(review): name keeps its historical "boostraped" spelling;
	 * it is referenced by the STARTUP registration below.
	 */
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
	zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);

	printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
	    zone_count_free(vm_map_zone),
	    zone_count_free(vm_map_entry_zone),
	    zone_count_free(vm_map_holes_zone));
}
1451 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1452 
1453 void
vm_map_disable_hole_optimization(vm_map_t map)1454 vm_map_disable_hole_optimization(vm_map_t map)
1455 {
1456 	vm_map_entry_t  head_entry, hole_entry, next_hole_entry;
1457 
1458 	if (map->holelistenabled) {
1459 		head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1460 
1461 		while (hole_entry != NULL) {
1462 			next_hole_entry = hole_entry->vme_next;
1463 
1464 			hole_entry->vme_next = NULL;
1465 			hole_entry->vme_prev = NULL;
1466 			zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1467 
1468 			if (next_hole_entry == head_entry) {
1469 				hole_entry = NULL;
1470 			} else {
1471 				hole_entry = next_hole_entry;
1472 			}
1473 		}
1474 
1475 		map->holes_list = NULL;
1476 		map->holelistenabled = FALSE;
1477 
1478 		map->first_free = vm_map_first_entry(map);
1479 		SAVE_HINT_HOLE_WRITE(map, NULL);
1480 	}
1481 }
1482 
1483 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1484 vm_kernel_map_is_kernel(vm_map_t map)
1485 {
1486 	return map->pmap == kernel_pmap;
1487 }
1488 
1489 /*
1490  *	vm_map_create:
1491  *
1492  *	Creates and returns a new empty VM map with
1493  *	the given physical map structure, and having
1494  *	the given lower and upper address bounds.
1495  */
1496 
1497 extern vm_map_t vm_map_create_external(
1498 	pmap_t                  pmap,
1499 	vm_map_offset_t         min_off,
1500 	vm_map_offset_t         max_off,
1501 	boolean_t               pageable);
1502 
1503 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1504 vm_map_create_external(
1505 	pmap_t                  pmap,
1506 	vm_map_offset_t         min,
1507 	vm_map_offset_t         max,
1508 	boolean_t               pageable)
1509 {
1510 	vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1511 
1512 	if (pageable) {
1513 		options |= VM_MAP_CREATE_PAGEABLE;
1514 	}
1515 	return vm_map_create_options(pmap, min, max, options);
1516 }
1517 
1518 __startup_func
1519 void
vm_map_will_allocate_early_map(vm_map_t * owner)1520 vm_map_will_allocate_early_map(vm_map_t *owner)
1521 {
1522 	if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1523 		panic("VM_MAP_EARLY_COUNT_MAX is too low");
1524 	}
1525 
1526 	early_map_owners[early_map_count++] = owner;
1527 }
1528 
1529 __startup_func
1530 void
vm_map_relocate_early_maps(vm_offset_t delta)1531 vm_map_relocate_early_maps(vm_offset_t delta)
1532 {
1533 	for (uint32_t i = 0; i < early_map_count; i++) {
1534 		vm_address_t addr = (vm_address_t)*early_map_owners[i];
1535 
1536 		*early_map_owners[i] = (vm_map_t)(addr + delta);
1537 	}
1538 
1539 	early_map_count = ~0u;
1540 }
1541 
1542 /*
1543  *	Routine:	vm_map_relocate_early_elem
1544  *
1545  *	Purpose:
1546  *		Early zone elements are allocated in a temporary part
1547  *		of the address space.
1548  *
1549  *		Once the zones live in their final place, the early
1550  *		VM maps, map entries and map holes need to be relocated.
1551  *
1552  *		It involves rewriting any vm_map_t, vm_map_entry_t or
1553  *		pointers to vm_map_links. Other pointers to other types
1554  *		are fine.
1555  *
1556  *		Fortunately, pointers to those types are self-contained
1557  *		in those zones, _except_ for pointers to VM maps,
1558  *		which are tracked during early boot and fixed with
1559  *		vm_map_relocate_early_maps().
1560  */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t                zone_id,
	vm_offset_t             new_addr,
	vm_offset_t             delta)
{
/*
 * relocate(type, field): if the given pointer field of the element now
 * living at "new_addr" is non-NULL, shift it by "delta".  NULL pointers
 * must stay NULL, hence the test.
 */
#define relocate(type_t, field)  ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field;   \
	if (*__field) {                                                        \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta);  \
	}                                                                      \
})

	/* only the three early VM zones contain relocatable elements */
	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* every early map uses the kernel pmap */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* map entries and holes both start with a struct vm_map_links */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1619 
/*
 *	vm_map_create_options:
 *
 *	Allocates and initializes a new VM map for "pmap" covering
 *	[min, max), configured per "options".  Returns the map with one
 *	reference held.
 */
vm_map_t
vm_map_create_options(
	pmap_t                  pmap,
	vm_map_offset_t         min,
	vm_map_offset_t         max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/*
	 * Before zalloc is fully up, every map allocation must have been
	 * announced via vm_map_will_allocate_early_map() and must use the
	 * kernel pmap.
	 */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit      = RLIM_INFINITY;        /* default unlimited */
	result->data_limit      = RLIM_INFINITY;        /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	/* empty map: hints point back at the map header */
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		/* start with a single hole covering the whole map */
		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
		/*
		 * Holes can be used to track ranges all the way up to
		 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
		 */
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
		result->holes_list = result->hole_hint = hole_entry;
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1686 
1687 /*
1688  * Adjusts a submap that was made by kmem_suballoc()
1689  * before it knew where it would be mapped,
1690  * so that it has the right min/max offsets.
1691  *
1692  * We do not need to hold any locks:
1693  * only the caller knows about this map,
1694  * and it is not published on any entry yet.
1695  */
1696 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1697 vm_map_adjust_offsets(
1698 	vm_map_t                map,
1699 	vm_map_offset_t         min_off,
1700 	vm_map_offset_t         max_off)
1701 {
1702 	assert(map->min_offset == 0);
1703 	assert(map->max_offset == max_off - min_off);
1704 	assert(map->hdr.nentries == 0);
1705 	assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1706 
1707 	map->min_offset = min_off;
1708 	map->max_offset = max_off;
1709 
1710 	if (map->holelistenabled) {
1711 		struct vm_map_links *hole = map->holes_list;
1712 
1713 		hole->start = min_off;
1714 #if defined(__arm64__)
1715 		hole->end = max_off;
1716 #else
1717 		hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1718 #endif
1719 	}
1720 }
1721 
1722 
1723 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1724 vm_map_adjusted_size(vm_map_t map)
1725 {
1726 	const struct vm_reserved_region *regions = NULL;
1727 	size_t num_regions = 0;
1728 	mach_vm_size_t  reserved_size = 0, map_size = 0;
1729 
1730 	if (map == NULL || (map->size == 0)) {
1731 		return 0;
1732 	}
1733 
1734 	map_size = map->size;
1735 
1736 	if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1737 		/*
1738 		 * No special reserved regions or not an exotic map or the task
1739 		 * is terminating and these special regions might have already
1740 		 * been deallocated.
1741 		 */
1742 		return map_size;
1743 	}
1744 
1745 	num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1746 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1747 
1748 	while (num_regions) {
1749 		reserved_size += regions[--num_regions].vmrr_size;
1750 	}
1751 
1752 	/*
1753 	 * There are a few places where the map is being switched out due to
1754 	 * 'termination' without that bit being set (e.g. exec and corpse purging).
1755 	 * In those cases, we could have the map's regions being deallocated on
1756 	 * a core while some accounting process is trying to get the map's size.
1757 	 * So this assert can't be enabled till all those places are uniform in
1758 	 * their use of the 'map->terminated' bit.
1759 	 *
1760 	 * assert(map_size >= reserved_size);
1761 	 */
1762 
1763 	return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1764 }
1765 
1766 /*
1767  *	vm_map_entry_create:	[ internal use only ]
1768  *
1769  *	Allocates a VM map entry for insertion in the
1770  *	given map (or map copy).  No fields are filled.
1771  *
1772  *	The VM entry will be zero initialized, except for:
1773  *	- behavior set to VM_BEHAVIOR_DEFAULT
1774  *	- inheritance set to VM_INHERIT_DEFAULT
1775  */
/* convenience wrappers: entries for maps and map copies come from the same zone */
#define vm_map_entry_create(map)    _vm_map_entry_create(&(map)->hdr)

#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)

static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header    *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	/* Z_ZERO: the entry comes back fully zeroed */
	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	/* zero already means VM_BEHAVIOR_DEFAULT; only inheritance needs a store */
	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

#if MAP_ENTRY_CREATION_DEBUG
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1812 
/*
 *	vm_map_entry_dispose:	[ internal use only ]
 *
 *	Inverse of vm_map_entry_create: drops any debug backtrace
 *	references held by the entry and returns it to its zone.
 *
 *      write map lock held so no need to
 *	do anything special to insure correctness
 *      of the stores
 */
static void
vm_map_entry_dispose(
	vm_map_entry_t          entry)
{
#if VM_BTLOG_TAGS
	/* kernel-object entries carry a tag backtrace ref; release it */
	if (entry->vme_kernel_object) {
		btref_put(entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}
1839 
/* Dispose of a vm_map_copy entry; identical to vm_map_entry_dispose(). */
#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1842 
1843 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1844 vm_map_zap_first_entry(
1845 	vm_map_zap_t            list)
1846 {
1847 	return list->vmz_head;
1848 }
1849 
/*
 * Return the last entry on a zap list.  The list must be non-empty
 * (asserted): vmz_tail points at the last entry's vme_next field,
 * so the entry itself is recovered with __container_of.
 */
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t            list)
{
	assert(vm_map_zap_first_entry(list));
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1857 
1858 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1859 vm_map_zap_append(
1860 	vm_map_zap_t            list,
1861 	vm_map_entry_t          entry)
1862 {
1863 	entry->vme_next = VM_MAP_ENTRY_NULL;
1864 	*list->vmz_tail = entry;
1865 	list->vmz_tail = &entry->vme_next;
1866 }
1867 
1868 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1869 vm_map_zap_pop(
1870 	vm_map_zap_t            list)
1871 {
1872 	vm_map_entry_t head = list->vmz_head;
1873 
1874 	if (head != VM_MAP_ENTRY_NULL &&
1875 	    (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1876 		list->vmz_tail = &list->vmz_head;
1877 	}
1878 
1879 	return head;
1880 }
1881 
/*
 * Free every entry accumulated on a zap list, dropping the reference
 * each entry holds on its submap or VM object before disposing of the
 * entry itself.
 *
 * NOTE(review): callers in this file (e.g. vm_map_destroy) invoke this
 * after dropping the map lock, presumably because the deallocations can
 * take other locks — confirm before calling with a map lock held.
 */
static void
vm_map_zap_dispose(
	vm_map_zap_t            list)
{
	vm_map_entry_t          entry;

	while ((entry = vm_map_zap_pop(list))) {
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}

		vm_map_entry_dispose(entry);
	}
}
1898 
#if MACH_ASSERT
/* Gate for the (expensive) first-free validation; off by default. */
static boolean_t first_free_check = FALSE;

/*
 * Validate the map's cached "first free" hint against the store layer.
 * Trivially succeeds unless first_free_check has been enabled.
 */
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	return first_free_check ? first_free_is_valid_store(map) : TRUE;
}
#endif /* MACH_ASSERT */
1912 
1913 
/*
 * Link/unlink an entry into a vm_map_copy's entry list — the copy
 * variants of the map entry link/unlink store operations.
 */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1919 
/*
 *	vm_map_destroy:
 *
 *	Actually destroy a map: delete every entry (both the regular
 *	range and any special mappings beyond it), tear down the pmap,
 *	and free the map structure.  This is final cleanup and is not
 *	allowed to fail.
 */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	/* deleted entries collect here; freed after the lock is dropped */
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	/* flag teardown so concurrent observers know the map is dying */
	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* drop submap/object references outside the map lock */
	vm_map_zap_dispose(&zap);

	/* both delete passes above must have emptied the map */
	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

#if CONFIG_MAP_RANGES
	kfree_data(map->extra_ranges,
	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
#endif

	zfree_id(ZONE_ID_VM_MAP, map);
}
1966 
/*
 * Returns pid of the task with the largest number of VM map entries.
 * Used in the zone-map-exhaustion jetsam path.
 *
 * Returns -1 if no eligible task was found.
 */
pid_t
find_largest_process_vm_map_entries(void)
{
	pid_t victim_pid = -1;
	int max_vm_map_entries = 0;
	task_t task = TASK_NULL;
	queue_head_t *task_list = &tasks;

	/* hold tasks_threads_lock so the task list is stable while iterating */
	lck_mtx_lock(&tasks_threads_lock);
	queue_iterate(task_list, task, task_t, tasks) {
		/* skip the kernel itself and tasks being torn down */
		if (task == kernel_task || !task->active) {
			continue;
		}

		vm_map_t task_map = task->map;
		if (task_map != VM_MAP_NULL) {
			/*
			 * NOTE(review): nentries is read without the map lock;
			 * presumably a slightly stale count is acceptable for
			 * victim selection — confirm.
			 */
			int task_vm_map_entries = task_map->hdr.nentries;
			if (task_vm_map_entries > max_vm_map_entries) {
				max_vm_map_entries = task_vm_map_entries;
				victim_pid = pid_from_task(task);
			}
		}
	}
	lck_mtx_unlock(&tasks_threads_lock);

	printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
	return victim_pid;
}
1999 
2000 
2001 /*
2002  *	vm_map_lookup_entry:	[ internal use only ]
2003  *
2004  *	Calls into the vm map store layer to find the map
2005  *	entry containing (or immediately preceding) the
2006  *	specified address in the given map; the entry is returned
2007  *	in the "entry" parameter.  The boolean
2008  *	result indicates whether the address is
2009  *	actually contained in the map.
2010  */
2011 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2012 vm_map_lookup_entry(
2013 	vm_map_t        map,
2014 	vm_map_offset_t address,
2015 	vm_map_entry_t  *entry)         /* OUT */
2016 {
2017 	bool result = false;
2018 	if (VM_KERNEL_ADDRESS(address)) {
2019 		address = VM_KERNEL_STRIP_UPTR(address);
2020 	}
2021 
2022 #if CONFIG_PROB_GZALLOC
2023 	if (map->pmap == kernel_pmap) {
2024 		assertf(!pgz_owned(address),
2025 		    "it is the responsibility of callers to unguard PGZ addresses");
2026 	}
2027 #endif /* CONFIG_PROB_GZALLOC */
2028 	result = vm_map_store_lookup_entry( map, address, entry );
2029 
2030 	return result;
2031 }
2032 
2033 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2034 vm_map_lookup_entry_or_next(
2035 	vm_map_t        map,
2036 	vm_map_offset_t address,
2037 	vm_map_entry_t  *entry)         /* OUT */
2038 {
2039 	if (vm_map_lookup_entry(map, address, entry)) {
2040 		return true;
2041 	}
2042 
2043 	*entry = (*entry)->vme_next;
2044 	return false;
2045 }
2046 
#if CONFIG_PROB_GZALLOC
/*
 * Variant of vm_map_lookup_entry() that skips the PGZ-ownership assert,
 * for callers that legitimately look up PGZ-guarded addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
	vm_map_offset_t addr = address;

	if (VM_KERNEL_ADDRESS(addr)) {
		addr = VM_KERNEL_STRIP_UPTR(addr);
	}
	return vm_map_store_lookup_entry(map, addr, entry);
}
#endif /* CONFIG_PROB_GZALLOC */
2060 
/*
 *	Routine:	vm_map_range_invalid_panic
 *	Purpose:
 *			Panic on detection of an invalid range id.
 *			__abortlike tells the compiler this never returns,
 *			keeping the panic path out of the callers' hot code.
 */
__abortlike
static void
vm_map_range_invalid_panic(
	vm_map_t                map,
	vm_map_range_id_t       range_id)
{
	panic("invalid range ID (%u) for map %p", range_id, map);
}
2074 
/*
 *	Routine:	vm_map_get_range
 *	Purpose:
 *			Adjust bounds based on security policy.
 *
 *			Returns the [min_address, max_address] window that an
 *			"anywhere" allocation of "size" bytes in "map" may use,
 *			selected by the range id in *vmk_flags.
 *
 *			For kernel_map (after kmem bootstrap) the caller's hint
 *			in *address is zeroed, and *is_ptr is set when the id
 *			names a kmem pointer range (those go through
 *			kmem_locate_space() instead of the generic path).
 */
static struct mach_vm_range
vm_map_get_range(
	vm_map_t                map,
	vm_map_address_t       *address,
	vm_map_kernel_flags_t  *vmk_flags,
	vm_map_size_t           size,
	bool                   *is_ptr)
{
	struct mach_vm_range effective_range = {};
	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;

	if (map == kernel_map) {
		effective_range = kmem_ranges[range_id];

		if (startup_phase >= STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
			/*
			 * Ensure that range_id passed in by the caller is within meaningful
			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
			 * to fail as the corresponding range is invalid. Range id larger than
			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
			 */
			if ((range_id == KMEM_RANGE_ID_NONE) ||
			    (range_id > KMEM_RANGE_ID_MAX)) {
				vm_map_range_invalid_panic(map, range_id);
			}

			/*
			 * Pointer ranges use kmem_locate_space to do allocations.
			 *
			 * Non pointer fronts look like [ Small | Large | Permanent ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range.
			 */
			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
				*is_ptr = true;
			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
		}
#if CONFIG_MAP_RANGES
	} else if (map->uses_user_ranges) {
		/* user maps with VM ranges enabled: pick the configured sub-range */
		switch (range_id) {
		case UMEM_RANGE_ID_DEFAULT:
			effective_range = map->default_range;
			break;
		case UMEM_RANGE_ID_HEAP:
			effective_range = map->data_range;
			break;
		case UMEM_RANGE_ID_LARGE_FILE:
			if (map->large_file_range.min_address != map->large_file_range.max_address) {
				/* large file range is configured and should be used */
				effective_range = map->large_file_range;
			} else {
				/*
				 * the user asking for this user range might not have the
				 * permissions to use the large file range (i.e., it doesn't
				 * hold the correct entitlement), so we give it the data range
				 * instead
				 */
				effective_range = map->data_range;
			}
			break;
		case UMEM_RANGE_ID_FIXED:
			/*
			 * anywhere allocations with an address in "FIXED"
			 * makes no sense, leave the range empty
			 */
			break;

		default:
			vm_map_range_invalid_panic(map, range_id);
		}
#endif /* CONFIG_MAP_RANGES */
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
2175 
/*
 *	Routine:	vm_map_locate_space_anywhere
 *	Purpose:
 *		Find an available VA range of "size" bytes (plus an optional
 *		guard page) in "map", honoring the "mask" alignment and the
 *		placement policy carried in "vmk_flags" (top-down search,
 *		randomized/JIT placement, range restrictions, ...).
 *
 *		The map must be locked by the caller.  On success,
 *		*start_inout holds the chosen address and *entry_out
 *		(if non-NULL) the entry preceding the hole.
 *
 *		For maps with "wait_for_space" set, the routine may drop and
 *		retake the map lock while blocking for space to free up.
 */
kern_return_t
vm_map_locate_space_anywhere(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t        *start_inout,
	vm_map_entry_t         *entry_out)
{
	struct mach_vm_range effective_range = {};
	vm_map_size_t   guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t  entry;
	bool            is_kmem_ptr_range = false;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmf_fixed);
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr);
	}

	if (vmk_flags.vmkf_guard_before) {
		/* one page in front of the allocation is reserved as a guard */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/*
	 * Validate range_id from flags and get associated range
	 */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
	    &is_kmem_ptr_range);

	if (is_kmem_ptr_range) {
		/* kmem pointer ranges have a dedicated allocator */
		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
		           vmk_flags.vmkf_last_free, start_inout, entry_out);
	}

#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		/* constrain a 64-bit map to 32-bit addressable VA */
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down: search from the hint (or range max) down to range min */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmf_random_addr = true;
			}
		}

		if (vmk_flags.vmf_random_addr) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if __x86_64__
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* __x86_64__ */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			/*
			 * The request could fit eventually: block until space
			 * is freed, then retry.  The map lock is dropped and
			 * retaken across the wait.
			 */
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2312 
2313 /*!
2314  * @function vm_map_locate_space_fixed()
2315  *
2316  * @brief
2317  * Locate (no reservation) a range in the specified VM map at a fixed address.
2318  *
2319  * @param map           the map to scan for memory, must be locked.
2320  * @param start         the fixed address trying to be reserved
2321  * @param size          the size of the allocation to make.
2322  * @param mask          an alignment mask the allocation must respect,
2323  * @param vmk_flags     the vm map kernel flags to influence this call.
2324  *                      vmk_flags.vmf_anywhere must not be set.
2325  * @param entry_out     the entry right before the hole.
2326  * @param zap_list      a zap list of entries to clean up after the call.
2327  *
2328  * @returns
2329  * - KERN_SUCCESS in case of success and no conflicting entry is found,
2330  *   in which case entry_out is set to the entry before the hole.
2331  *
2332  * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2333  *   in which case entry_out is set the conflicting entry,
2334  *   the callers MUST handle this error explicitly.
2335  *
2336  * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2337  *   would result in a mapping outside of the map.
2338  *
2339  * - KERN_NO_SPACE for various cases of unrecoverable failures.
2340  */
static kern_return_t
vm_map_locate_space_fixed(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t         *entry_out,
	vm_map_zap_t            zap_list)
{
	vm_map_offset_t effective_min_offset, effective_max_offset;
	vm_map_entry_t  entry;
	vm_map_offset_t end;

	assert(vmk_flags.vmf_fixed);

	effective_min_offset = map->min_offset;
	effective_max_offset = map->max_offset;

	if (vmk_flags.vmkf_beyond_max) {
		/*
		 * Allow an insertion beyond the map's max offset.
		 */
		effective_max_offset = 0x00000000FFFFF000ULL;
		if (vm_map_is_64bit(map)) {
			effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
		}
#if XNU_TARGET_OS_OSX
	} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		/* constrain to 32-bit addressable VA */
		effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
#endif /* XNU_TARGET_OS_OSX */
	}

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
	    !vmk_flags.vmf_overwrite &&
	    map->pmap == kernel_pmap &&
	    vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
		/*
		 * Force realloc() to switch to a new allocation,
		 * to prevent 4k-fragmented virtual ranges.
		 */
//		DEBUG4K_ERROR("no realloc in place");
		return KERN_NO_SPACE;
	}

	/*
	 *	Verify that:
	 *		the address doesn't itself violate
	 *		the mask requirement.
	 */

	if ((start & mask) != 0) {
		return KERN_NO_SPACE;
	}

#if CONFIG_MAP_RANGES
	if (map->uses_user_ranges) {
		/* tighten the bounds to the user range containing "start" */
		struct mach_vm_range r;

		vm_map_user_range_resolve(map, start, 1, &r);
		if (r.max_address == 0) {
			return KERN_INVALID_ADDRESS;
		}
		effective_min_offset = r.min_address;
		effective_max_offset = r.max_address;
	}
#endif /* CONFIG_MAP_RANGES */

	if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
	    (map == kernel_map)) {
		/* kernel_map overwrites must stay within one kmem range */
		mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
		effective_min_offset = r->min_address;
		effective_max_offset = r->max_address;
	}

	/*
	 *	...	the address is within bounds
	 */

	end = start + size;

	/* the (start >= end) test also catches start + size overflow */
	if ((start < effective_min_offset) ||
	    (end > effective_max_offset) ||
	    (start >= end)) {
		return KERN_INVALID_ADDRESS;
	}

	if (vmk_flags.vmf_overwrite) {
		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
		kern_return_t remove_kr;

		/*
		 * Fixed mapping and "overwrite" flag: attempt to
		 * remove all existing mappings in the specified
		 * address range, saving them in our "zap_list".
		 *
		 * This avoids releasing the VM map lock in
		 * vm_map_entry_delete() and allows atomicity
		 * when we want to replace some mappings with a new one.
		 * It also allows us to restore the old VM mappings if the
		 * new mapping fails.
		 */
		remove_flags |= VM_MAP_REMOVE_NO_YIELD;

		if (vmk_flags.vmkf_overwrite_immutable) {
			/* we can overwrite immutable mappings */
			remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
		}
		if (vmk_flags.vmkf_remap_prot_copy) {
			remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
		}
		remove_kr = vm_map_delete(map, start, end, remove_flags,
		    KMEM_GUARD_NONE, zap_list).kmr_return;
		if (remove_kr) {
			/* XXX FBDP restore zap_list? */
			return remove_kr;
		}
	}

	/*
	 *	...	the starting address isn't allocated
	 */

	if (vm_map_lookup_entry(map, start, &entry)) {
		/* conflicting entry: caller must handle KERN_MEMORY_PRESENT */
		*entry_out = entry;
		return KERN_MEMORY_PRESENT;
	}

	/*
	 *	...	the next region doesn't overlap the
	 *		end point.
	 */

	if ((entry->vme_next != vm_map_to_entry(map)) &&
	    (entry->vme_next->vme_start < end)) {
		return KERN_NO_SPACE;
	}

	*entry_out = entry;
	return KERN_SUCCESS;
}
2482 
/*
 *	Routine:	vm_map_find_space
 *	Purpose:
 *		Allocate a range in the specified virtual address map,
 *		returning the entry allocated for that range.
 *		Used by kmem_alloc, etc.
 *
 *		The map must be NOT be locked. It will be returned locked
 *		on KERN_SUCCESS, unlocked on failure.
 *
 *		If an entry is allocated, the object/offset fields
 *		are initialized to zero.
 */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t          *o_entry)       /* OUT */
{
	vm_map_entry_t          new_entry, entry;
	kern_return_t           kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* create and pre-initialize the entry before taking the map lock */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmf_permanent) {
		new_entry->vme_permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* failure path: unlock and discard the pre-created entry */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 *	At this point,
	 *
	 *	- new_entry's "vme_start" and "vme_end" should define
	 *	  the endpoints of the available new range,
	 *
	 *	- and "entry" should refer to the region before
	 *	  the new range,
	 *
	 *	- and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 *	Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 *	Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* success: map is returned LOCKED, per the contract above */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2569 
/* Debug knobs for vm_map_pmap_enter(): trace entries / enable the path. */
int vm_map_pmap_enter_print = FALSE;
int vm_map_pmap_enter_enable = FALSE;
2572 
/*
 *	Routine:	vm_map_pmap_enter [internal only]
 *
 *	Description:
 *		Force pages from the specified object to be entered into
 *		the pmap at the specified address if they are present.
 *		As soon as a page not found in the object the scan ends.
 *
 *	Returns:
 *		Nothing.
 *
 *	In/out conditions:
 *		The source map should not be locked on entry.
 */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	uint8_t                 object_lock_type = 0;
	struct vm_object_fault_info fault_info = {};

	if (map->pmap == 0) {
		return;
	}

	/* only supported on maps using the native page size */
	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	while (addr < end_addr) {
		vm_page_t       m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/* stop at the first page that is absent, busy, or otherwise unusable */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		type_of_fault = DBG_CACHE_HIT_FAULT;
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                 /* change_wiring */
		    VM_KERN_MEMORY_NONE,                 /* tag - not wiring */
		    &fault_info,
		    NULL,                  /* need_retry */
		    &type_of_fault,
		    &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/

		vm_object_unlock(object);

		/* advance to the next page in both the object and the map */
		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2656 
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 *	Routine:	vm_map_random_address_for_size
 *	Purpose:
 *		Pick a random page-aligned address inside the map's effective
 *		range such that at least "size" bytes of VA after it are free.
 *		Tries up to MAX_TRIES_TO_GET_RANDOM_ADDRESS candidates, then
 *		gives up with KERN_NO_SPACE.
 *
 *		On success *address holds the chosen address.
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	bool            is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	/* shrink the candidate window so the allocation always fits in range */
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() is the only entropy source before zalloc is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/*
		 * NOTE(review): "random_addr % addr_space_size" has a slight
		 * modulo bias; presumably acceptable for address layout
		 * randomization — confirm.
		 */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/* never hand out addresses owned by the probabilistic guard allocator */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* candidate must be unmapped and start a hole of at least "size" */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2730 
2731 static boolean_t
vm_memory_malloc_no_cow(int alias)2732 vm_memory_malloc_no_cow(
2733 	int alias)
2734 {
2735 	uint64_t alias_mask;
2736 
2737 	if (!malloc_no_cow) {
2738 		return FALSE;
2739 	}
2740 	if (alias > 63) {
2741 		return FALSE;
2742 	}
2743 	alias_mask = 1ULL << alias;
2744 	if (alias_mask & vm_memory_malloc_no_cow_mask) {
2745 		return TRUE;
2746 	}
2747 	return FALSE;
2748 }
2749 
/* Counters: times vm_map_enter hit the RLIMIT_AS / RLIMIT_DATA limits. */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2752 /*
2753  *	Routine:	vm_map_enter
2754  *
2755  *	Description:
2756  *		Allocate a range in the specified virtual address map.
2757  *		The resulting range will refer to memory defined by
2758  *		the given memory object and offset into that object.
2759  *
2760  *		Arguments are as defined in the vm_map call.
2761  */
2762 static unsigned int vm_map_enter_restore_successes = 0;
2763 static unsigned int vm_map_enter_restore_failures = 0;
2764 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2765 vm_map_enter(
2766 	vm_map_t                map,
2767 	vm_map_offset_t         *address,       /* IN/OUT */
2768 	vm_map_size_t           size,
2769 	vm_map_offset_t         mask,
2770 	vm_map_kernel_flags_t   vmk_flags,
2771 	vm_object_t             object,
2772 	vm_object_offset_t      offset,
2773 	boolean_t               needs_copy,
2774 	vm_prot_t               cur_protection,
2775 	vm_prot_t               max_protection,
2776 	vm_inherit_t            inheritance)
2777 {
2778 	vm_map_entry_t          entry, new_entry;
2779 	vm_map_offset_t         start, tmp_start, tmp_offset;
2780 	vm_map_offset_t         end, tmp_end;
2781 	vm_map_offset_t         tmp2_start, tmp2_end;
2782 	vm_map_offset_t         step;
2783 	kern_return_t           result = KERN_SUCCESS;
2784 	bool                    map_locked = FALSE;
2785 	bool                    pmap_empty = TRUE;
2786 	bool                    new_mapping_established = FALSE;
2787 	const bool              keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2788 	const bool              anywhere = !vmk_flags.vmf_fixed;
2789 	const bool              purgable = vmk_flags.vmf_purgeable;
2790 	const bool              no_cache = vmk_flags.vmf_no_cache;
2791 	const bool              is_submap = vmk_flags.vmkf_submap;
2792 	const bool              permanent = vmk_flags.vmf_permanent;
2793 	const bool              no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2794 	const bool              entry_for_jit = vmk_flags.vmkf_map_jit;
2795 	const bool              iokit_acct = vmk_flags.vmkf_iokit_acct;
2796 	const bool              resilient_codesign = vmk_flags.vmf_resilient_codesign;
2797 	const bool              resilient_media = vmk_flags.vmf_resilient_media;
2798 	const bool              entry_for_tpro = vmk_flags.vmf_tpro;
2799 	const unsigned int      superpage_size = vmk_flags.vmf_superpage_size;
2800 	const vm_tag_t          alias = vmk_flags.vm_tag;
2801 	vm_tag_t                user_alias;
2802 	kern_return_t           kr;
2803 	bool                    clear_map_aligned = FALSE;
2804 	vm_map_size_t           chunk_size = 0;
2805 	vm_object_t             caller_object;
2806 	VM_MAP_ZAP_DECLARE(zap_old_list);
2807 	VM_MAP_ZAP_DECLARE(zap_new_list);
2808 
2809 	caller_object = object;
2810 
2811 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2812 
2813 	if (vmk_flags.vmf_4gb_chunk) {
2814 #if defined(__LP64__)
2815 		chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2816 #else /* __LP64__ */
2817 		chunk_size = ANON_CHUNK_SIZE;
2818 #endif /* __LP64__ */
2819 	} else {
2820 		chunk_size = ANON_CHUNK_SIZE;
2821 	}
2822 
2823 
2824 
2825 	if (superpage_size) {
2826 		if (object != VM_OBJECT_NULL) {
2827 			/* caller can't provide their own VM object */
2828 			return KERN_INVALID_ARGUMENT;
2829 		}
2830 		switch (superpage_size) {
2831 			/*
2832 			 * Note that the current implementation only supports
2833 			 * a single size for superpages, SUPERPAGE_SIZE, per
2834 			 * architecture. As soon as more sizes are supposed
2835 			 * to be supported, SUPERPAGE_SIZE has to be replaced
2836 			 * with a lookup of the size depending on superpage_size.
2837 			 */
2838 #ifdef __x86_64__
2839 		case SUPERPAGE_SIZE_ANY:
2840 			/* handle it like 2 MB and round up to page size */
2841 			size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2842 			OS_FALLTHROUGH;
2843 		case SUPERPAGE_SIZE_2MB:
2844 			break;
2845 #endif
2846 		default:
2847 			return KERN_INVALID_ARGUMENT;
2848 		}
2849 		mask = SUPERPAGE_SIZE - 1;
2850 		if (size & (SUPERPAGE_SIZE - 1)) {
2851 			return KERN_INVALID_ARGUMENT;
2852 		}
2853 		inheritance = VM_INHERIT_NONE;  /* fork() children won't inherit superpages */
2854 	}
2855 
2856 
2857 	if ((cur_protection & VM_PROT_WRITE) &&
2858 	    (cur_protection & VM_PROT_EXECUTE) &&
2859 #if XNU_TARGET_OS_OSX
2860 	    map->pmap != kernel_pmap &&
2861 	    (cs_process_global_enforcement() ||
2862 	    (vmk_flags.vmkf_cs_enforcement_override
2863 	    ? vmk_flags.vmkf_cs_enforcement
2864 	    : (vm_map_cs_enforcement(map)
2865 #if __arm64__
2866 	    || !VM_MAP_IS_EXOTIC(map)
2867 #endif /* __arm64__ */
2868 	    ))) &&
2869 #endif /* XNU_TARGET_OS_OSX */
2870 #if CODE_SIGNING_MONITOR
2871 	    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2872 #endif
2873 	    (VM_MAP_POLICY_WX_FAIL(map) ||
2874 	    VM_MAP_POLICY_WX_STRIP_X(map)) &&
2875 	    !entry_for_jit) {
2876 		boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2877 
2878 		DTRACE_VM3(cs_wx,
2879 		    uint64_t, 0,
2880 		    uint64_t, 0,
2881 		    vm_prot_t, cur_protection);
2882 		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2883 		    proc_selfpid(),
2884 		    (get_bsdtask_info(current_task())
2885 		    ? proc_name_address(get_bsdtask_info(current_task()))
2886 		    : "?"),
2887 		    __FUNCTION__,
2888 		    (vm_protect_wx_fail ? "failing" : "turning off execute"));
2889 		cur_protection &= ~VM_PROT_EXECUTE;
2890 		if (vm_protect_wx_fail) {
2891 			return KERN_PROTECTION_FAILURE;
2892 		}
2893 	}
2894 
2895 	if (entry_for_jit
2896 	    && cur_protection != VM_PROT_ALL) {
2897 		/*
2898 		 * Native macOS processes and all non-macOS processes are
2899 		 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2900 		 * the RWX requirement was not enforced, and thus, we must live
2901 		 * with our sins. We are now dealing with a JIT mapping without
2902 		 * RWX.
2903 		 *
2904 		 * We deal with these by letting the MAP_JIT stick in order
2905 		 * to avoid CS violations when these pages are mapped executable
2906 		 * down the line. In order to appease the page table monitor (you
2907 		 * know what I'm talking about), these pages will end up being
2908 		 * marked as XNU_USER_DEBUG, which will be allowed because we
2909 		 * don't enforce the code signing monitor on macOS systems. If
2910 		 * the user-space application ever changes permissions to RWX,
2911 		 * which they are allowed to since the mapping was originally
2912 		 * created with MAP_JIT, then they'll switch over to using the
2913 		 * XNU_USER_JIT type, and won't be allowed to downgrade any
2914 		 * more after that.
2915 		 *
2916 		 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2917 		 * strictly disallowed.
2918 		 */
2919 
2920 #if XNU_TARGET_OS_OSX
2921 		/*
2922 		 * Continue to allow non-RWX JIT
2923 		 */
2924 #else
2925 		/* non-macOS: reject JIT regions without RWX */
2926 		DTRACE_VM3(cs_wx,
2927 		    uint64_t, 0,
2928 		    uint64_t, 0,
2929 		    vm_prot_t, cur_protection);
2930 		printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2931 		    proc_selfpid(),
2932 		    (get_bsdtask_info(current_task())
2933 		    ? proc_name_address(get_bsdtask_info(current_task()))
2934 		    : "?"),
2935 		    __FUNCTION__,
2936 		    cur_protection);
2937 		return KERN_PROTECTION_FAILURE;
2938 #endif
2939 	}
2940 
2941 	/*
2942 	 * If the task has requested executable lockdown,
2943 	 * deny any new executable mapping.
2944 	 */
2945 	if (map->map_disallow_new_exec == TRUE) {
2946 		if (cur_protection & VM_PROT_EXECUTE) {
2947 			return KERN_PROTECTION_FAILURE;
2948 		}
2949 	}
2950 
2951 	if (resilient_codesign) {
2952 		assert(!is_submap);
2953 		int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2954 		if ((cur_protection | max_protection) & reject_prot) {
2955 			return KERN_PROTECTION_FAILURE;
2956 		}
2957 	}
2958 
2959 	if (resilient_media) {
2960 		assert(!is_submap);
2961 //		assert(!needs_copy);
2962 		if (object != VM_OBJECT_NULL &&
2963 		    !object->internal) {
2964 			/*
2965 			 * This mapping is directly backed by an external
2966 			 * memory manager (e.g. a vnode pager for a file):
2967 			 * we would not have any safe place to inject
2968 			 * a zero-filled page if an actual page is not
2969 			 * available, without possibly impacting the actual
2970 			 * contents of the mapped object (e.g. the file),
2971 			 * so we can't provide any media resiliency here.
2972 			 */
2973 			return KERN_INVALID_ARGUMENT;
2974 		}
2975 	}
2976 
2977 	if (entry_for_tpro) {
2978 		/*
2979 		 * TPRO overrides the effective permissions of the region
2980 		 * and explicitly maps as RW. Ensure we have been passed
2981 		 * the expected permissions. We accept `cur_protections`
2982 		 * RO as that will be handled on fault.
2983 		 */
2984 		if (!(max_protection & VM_PROT_READ) ||
2985 		    !(max_protection & VM_PROT_WRITE) ||
2986 		    !(cur_protection & VM_PROT_READ)) {
2987 			return KERN_PROTECTION_FAILURE;
2988 		}
2989 
2990 		/*
2991 		 * We can now downgrade the cur_protection to RO. This is a mild lie
2992 		 * to the VM layer. But TPRO will be responsible for toggling the
2993 		 * protections between RO/RW
2994 		 */
2995 		cur_protection = VM_PROT_READ;
2996 	}
2997 
2998 	if (is_submap) {
2999 		vm_map_t submap;
3000 		if (purgable) {
3001 			/* submaps can not be purgeable */
3002 			return KERN_INVALID_ARGUMENT;
3003 		}
3004 		if (object == VM_OBJECT_NULL) {
3005 			/* submaps can not be created lazily */
3006 			return KERN_INVALID_ARGUMENT;
3007 		}
3008 		submap = (vm_map_t) object;
3009 		if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3010 			/* page size mismatch */
3011 			return KERN_INVALID_ARGUMENT;
3012 		}
3013 	}
3014 	if (vmk_flags.vmkf_already) {
3015 		/*
3016 		 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3017 		 * is already present.  For it to be meaningful, the requested
3018 		 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3019 		 * we shouldn't try and remove what was mapped there first
3020 		 * (!VM_FLAGS_OVERWRITE).
3021 		 */
3022 		if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3023 			return KERN_INVALID_ARGUMENT;
3024 		}
3025 	}
3026 
3027 	if (size == 0 ||
3028 	    (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3029 		*address = 0;
3030 		return KERN_INVALID_ARGUMENT;
3031 	}
3032 
3033 	if (map->pmap == kernel_pmap) {
3034 		user_alias = VM_KERN_MEMORY_NONE;
3035 	} else {
3036 		user_alias = alias;
3037 	}
3038 
3039 	if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3040 		chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3041 	}
3042 
3043 #define RETURN(value)   { result = value; goto BailOut; }
3044 
3045 	assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3046 	assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3047 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3048 		assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3049 		assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3050 	}
3051 
3052 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3053 	    !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3054 		/*
3055 		 * In most cases, the caller rounds the size up to the
3056 		 * map's page size.
3057 		 * If we get a size that is explicitly not map-aligned here,
3058 		 * we'll have to respect the caller's wish and mark the
3059 		 * mapping as "not map-aligned" to avoid tripping the
3060 		 * map alignment checks later.
3061 		 */
3062 		clear_map_aligned = TRUE;
3063 	}
3064 	if (!anywhere &&
3065 	    VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3066 	    !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3067 		/*
3068 		 * We've been asked to map at a fixed address and that
3069 		 * address is not aligned to the map's specific alignment.
3070 		 * The caller should know what it's doing (i.e. most likely
3071 		 * mapping some fragmented copy map, transferring memory from
3072 		 * a VM map with a different alignment), so clear map_aligned
3073 		 * for this new VM map entry and proceed.
3074 		 */
3075 		clear_map_aligned = TRUE;
3076 	}
3077 
3078 	/*
3079 	 * Only zero-fill objects are allowed to be purgable.
3080 	 * LP64todo - limit purgable objects to 32-bits for now
3081 	 */
3082 	if (purgable &&
3083 	    (offset != 0 ||
3084 	    (object != VM_OBJECT_NULL &&
3085 	    (object->vo_size != size ||
3086 	    object->purgable == VM_PURGABLE_DENY))
3087 #if __LP64__
3088 	    || size > ANON_MAX_SIZE
3089 #endif
3090 	    )) {
3091 		return KERN_INVALID_ARGUMENT;
3092 	}
3093 
3094 	vm_map_lock(map);
3095 	map_locked = TRUE;
3096 
3097 	if (anywhere) {
3098 		result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3099 		    address, &entry);
3100 		start = *address;
3101 	} else {
3102 		start = *address;
3103 		result = vm_map_locate_space_fixed(map, start, size, mask,
3104 		    vmk_flags, &entry, &zap_old_list);
3105 	}
3106 
3107 	end = start + size;
3108 
3109 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3110 
3111 	/*
3112 	 * Check if what's already there is what we want.
3113 	 */
3114 	if (result == KERN_MEMORY_PRESENT) {
3115 		assert(!anywhere);
3116 		if (!(vmk_flags.vmkf_already)) {
3117 			RETURN(KERN_NO_SPACE);
3118 		}
3119 		tmp_start = start;
3120 		tmp_offset = offset;
3121 		if (entry->vme_start < start) {
3122 			tmp_start -= start - entry->vme_start;
3123 			tmp_offset -= start - entry->vme_start;
3124 		}
3125 		for (; entry->vme_start < end;
3126 		    entry = entry->vme_next) {
3127 			/*
3128 			 * Check if the mapping's attributes
3129 			 * match the existing map entry.
3130 			 */
3131 			if (entry == vm_map_to_entry(map) ||
3132 			    entry->vme_start != tmp_start ||
3133 			    entry->is_sub_map != is_submap ||
3134 			    VME_OFFSET(entry) != tmp_offset ||
3135 			    entry->needs_copy != needs_copy ||
3136 			    entry->protection != cur_protection ||
3137 			    entry->max_protection != max_protection ||
3138 			    entry->inheritance != inheritance ||
3139 			    entry->iokit_acct != iokit_acct ||
3140 			    VME_ALIAS(entry) != alias) {
3141 				/* not the same mapping ! */
3142 				RETURN(KERN_NO_SPACE);
3143 			}
3144 			/*
3145 			 * Check if the same object is being mapped.
3146 			 */
3147 			if (is_submap) {
3148 				if (VME_SUBMAP(entry) !=
3149 				    (vm_map_t) object) {
3150 					/* not the same submap */
3151 					RETURN(KERN_NO_SPACE);
3152 				}
3153 			} else {
3154 				if (VME_OBJECT(entry) != object) {
3155 					/* not the same VM object... */
3156 					vm_object_t obj2;
3157 
3158 					obj2 = VME_OBJECT(entry);
3159 					if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3160 					    (object == VM_OBJECT_NULL || object->internal)) {
3161 						/*
3162 						 * ... but both are
3163 						 * anonymous memory,
3164 						 * so equivalent.
3165 						 */
3166 					} else {
3167 						RETURN(KERN_NO_SPACE);
3168 					}
3169 				}
3170 			}
3171 
3172 			tmp_offset += entry->vme_end - entry->vme_start;
3173 			tmp_start += entry->vme_end - entry->vme_start;
3174 			if (entry->vme_end >= end) {
3175 				/* reached the end of our mapping */
3176 				break;
3177 			}
3178 		}
3179 		/* it all matches:  let's use what's already there ! */
3180 		RETURN(KERN_MEMORY_PRESENT);
3181 	}
3182 
3183 	if (result != KERN_SUCCESS) {
3184 		goto BailOut;
3185 	}
3186 
3187 
3188 	/*
3189 	 *	At this point,
3190 	 *		"start" and "end" should define the endpoints of the
3191 	 *			available new range, and
3192 	 *		"entry" should refer to the region before the new
3193 	 *			range, and
3194 	 *
3195 	 *		the map should be locked.
3196 	 */
3197 
3198 	/*
3199 	 *	See whether we can avoid creating a new entry (and object) by
3200 	 *	extending one of our neighbors.  [So far, we only attempt to
3201 	 *	extend from below.]  Note that we can never extend/join
3202 	 *	purgable objects because they need to remain distinct
3203 	 *	entities in order to implement their "volatile object"
3204 	 *	semantics.
3205 	 */
3206 
3207 	if (purgable ||
3208 	    entry_for_jit ||
3209 	    entry_for_tpro ||
3210 	    vm_memory_malloc_no_cow(user_alias)) {
3211 		if (superpage_size) {
3212 			/*
3213 			 * For "super page" allocations, we will allocate
3214 			 * special physically-contiguous VM objects later on,
3215 			 * so we should not have flags instructing us to create
3216 			 * a differently special VM object here.
3217 			 */
3218 			RETURN(KERN_INVALID_ARGUMENT);
3219 		}
3220 
3221 		if (object == VM_OBJECT_NULL) {
3222 			assert(!superpage_size);
3223 			object = vm_object_allocate(size);
3224 			vm_object_lock(object);
3225 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3226 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3227 			if (malloc_no_cow_except_fork &&
3228 			    !purgable &&
3229 			    !entry_for_jit &&
3230 			    !entry_for_tpro &&
3231 			    vm_memory_malloc_no_cow(user_alias)) {
3232 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3233 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3234 			}
3235 			if (entry_for_jit) {
3236 				object->vo_inherit_copy_none = true;
3237 			}
3238 			if (purgable) {
3239 				task_t owner;
3240 				VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3241 				if (map->pmap == kernel_pmap) {
3242 					/*
3243 					 * Purgeable mappings made in a kernel
3244 					 * map are "owned" by the kernel itself
3245 					 * rather than the current user task
3246 					 * because they're likely to be used by
3247 					 * more than this user task (see
3248 					 * execargs_purgeable_allocate(), for
3249 					 * example).
3250 					 */
3251 					owner = kernel_task;
3252 				} else {
3253 					owner = current_task();
3254 				}
3255 				assert(object->vo_owner == NULL);
3256 				assert(object->resident_page_count == 0);
3257 				assert(object->wired_page_count == 0);
3258 				vm_purgeable_nonvolatile_enqueue(object, owner);
3259 			}
3260 			vm_object_unlock(object);
3261 			offset = (vm_object_offset_t)0;
3262 		}
3263 	} else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3264 		/* no coalescing if address space uses sub-pages */
3265 	} else if ((is_submap == FALSE) &&
3266 	    (object == VM_OBJECT_NULL) &&
3267 	    (entry != vm_map_to_entry(map)) &&
3268 	    (entry->vme_end == start) &&
3269 	    (!entry->is_shared) &&
3270 	    (!entry->is_sub_map) &&
3271 	    (!entry->in_transition) &&
3272 	    (!entry->needs_wakeup) &&
3273 	    (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3274 	    (entry->protection == cur_protection) &&
3275 	    (entry->max_protection == max_protection) &&
3276 	    (entry->inheritance == inheritance) &&
3277 	    ((user_alias == VM_MEMORY_REALLOC) ||
3278 	    (VME_ALIAS(entry) == alias)) &&
3279 	    (entry->no_cache == no_cache) &&
3280 	    (entry->vme_permanent == permanent) &&
3281 	    /* no coalescing for immutable executable mappings */
3282 	    !((entry->protection & VM_PROT_EXECUTE) &&
3283 	    entry->vme_permanent) &&
3284 	    (!entry->superpage_size && !superpage_size) &&
3285 	    /*
3286 	     * No coalescing if not map-aligned, to avoid propagating
3287 	     * that condition any further than needed:
3288 	     */
3289 	    (!entry->map_aligned || !clear_map_aligned) &&
3290 	    (!entry->zero_wired_pages) &&
3291 	    (!entry->used_for_jit && !entry_for_jit) &&
3292 #if __arm64e__
3293 	    (!entry->used_for_tpro && !entry_for_tpro) &&
3294 #endif
3295 	    (!entry->csm_associated) &&
3296 	    (entry->iokit_acct == iokit_acct) &&
3297 	    (!entry->vme_resilient_codesign) &&
3298 	    (!entry->vme_resilient_media) &&
3299 	    (!entry->vme_atomic) &&
3300 	    (entry->vme_no_copy_on_read == no_copy_on_read) &&
3301 
3302 	    ((entry->vme_end - entry->vme_start) + size <=
3303 	    (user_alias == VM_MEMORY_REALLOC ?
3304 	    ANON_CHUNK_SIZE :
3305 	    NO_COALESCE_LIMIT)) &&
3306 
3307 	    (entry->wired_count == 0)) {        /* implies user_wired_count == 0 */
3308 		if (vm_object_coalesce(VME_OBJECT(entry),
3309 		    VM_OBJECT_NULL,
3310 		    VME_OFFSET(entry),
3311 		    (vm_object_offset_t) 0,
3312 		    (vm_map_size_t)(entry->vme_end - entry->vme_start),
3313 		    (vm_map_size_t)(end - entry->vme_end))) {
3314 			/*
3315 			 *	Coalesced the two objects - can extend
3316 			 *	the previous map entry to include the
3317 			 *	new range.
3318 			 */
3319 			map->size += (end - entry->vme_end);
3320 			assert(entry->vme_start < end);
3321 			assert(VM_MAP_PAGE_ALIGNED(end,
3322 			    VM_MAP_PAGE_MASK(map)));
3323 			if (__improbable(vm_debug_events)) {
3324 				DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3325 			}
3326 			entry->vme_end = end;
3327 			if (map->holelistenabled) {
3328 				vm_map_store_update_first_free(map, entry, TRUE);
3329 			} else {
3330 				vm_map_store_update_first_free(map, map->first_free, TRUE);
3331 			}
3332 			new_mapping_established = TRUE;
3333 			RETURN(KERN_SUCCESS);
3334 		}
3335 	}
3336 
3337 	step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3338 	new_entry = NULL;
3339 
3340 	if (vmk_flags.vmkf_submap_adjust) {
3341 		vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3342 		offset = start;
3343 	}
3344 
3345 	for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3346 		tmp2_end = tmp2_start + step;
3347 		/*
3348 		 *	Create a new entry
3349 		 *
3350 		 * XXX FBDP
3351 		 * The reserved "page zero" in each process's address space can
3352 		 * be arbitrarily large.  Splitting it into separate objects and
3353 		 * therefore different VM map entries serves no purpose and just
3354 		 * slows down operations on the VM map, so let's not split the
3355 		 * allocation into chunks if the max protection is NONE.  That
3356 		 * memory should never be accessible, so it will never get to the
3357 		 * default pager.
3358 		 */
3359 		tmp_start = tmp2_start;
3360 		if (!is_submap &&
3361 		    object == VM_OBJECT_NULL &&
3362 		    size > chunk_size &&
3363 		    max_protection != VM_PROT_NONE &&
3364 		    superpage_size == 0) {
3365 			tmp_end = tmp_start + chunk_size;
3366 		} else {
3367 			tmp_end = tmp2_end;
3368 		}
3369 		do {
3370 			if (!is_submap &&
3371 			    object != VM_OBJECT_NULL &&
3372 			    object->internal &&
3373 			    offset + (tmp_end - tmp_start) > object->vo_size) {
3374 //				printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3375 				DTRACE_VM5(vm_map_enter_overmap,
3376 				    vm_map_t, map,
3377 				    vm_map_address_t, tmp_start,
3378 				    vm_map_address_t, tmp_end,
3379 				    vm_object_offset_t, offset,
3380 				    vm_object_size_t, object->vo_size);
3381 			}
3382 			new_entry = vm_map_entry_insert(map,
3383 			    entry, tmp_start, tmp_end,
3384 			    object, offset, vmk_flags,
3385 			    needs_copy,
3386 			    cur_protection, max_protection,
3387 			    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3388 			    VM_INHERIT_NONE : inheritance),
3389 			    clear_map_aligned);
3390 
3391 			assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3392 
3393 			if (resilient_codesign) {
3394 				int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3395 				if (!((cur_protection | max_protection) & reject_prot)) {
3396 					new_entry->vme_resilient_codesign = TRUE;
3397 				}
3398 			}
3399 
3400 			if (resilient_media &&
3401 			    (object == VM_OBJECT_NULL ||
3402 			    object->internal)) {
3403 				new_entry->vme_resilient_media = TRUE;
3404 			}
3405 
3406 			assert(!new_entry->iokit_acct);
3407 			if (!is_submap &&
3408 			    object != VM_OBJECT_NULL &&
3409 			    object->internal &&
3410 			    (object->purgable != VM_PURGABLE_DENY ||
3411 			    object->vo_ledger_tag)) {
3412 				assert(new_entry->use_pmap);
3413 				assert(!new_entry->iokit_acct);
3414 				/*
3415 				 * Turn off pmap accounting since
3416 				 * purgeable (or tagged) objects have their
3417 				 * own ledgers.
3418 				 */
3419 				new_entry->use_pmap = FALSE;
3420 			} else if (!is_submap &&
3421 			    iokit_acct &&
3422 			    object != VM_OBJECT_NULL &&
3423 			    object->internal) {
3424 				/* alternate accounting */
3425 				assert(!new_entry->iokit_acct);
3426 				assert(new_entry->use_pmap);
3427 				new_entry->iokit_acct = TRUE;
3428 				new_entry->use_pmap = FALSE;
3429 				DTRACE_VM4(
3430 					vm_map_iokit_mapped_region,
3431 					vm_map_t, map,
3432 					vm_map_offset_t, new_entry->vme_start,
3433 					vm_map_offset_t, new_entry->vme_end,
3434 					int, VME_ALIAS(new_entry));
3435 				vm_map_iokit_mapped_region(
3436 					map,
3437 					(new_entry->vme_end -
3438 					new_entry->vme_start));
3439 			} else if (!is_submap) {
3440 				assert(!new_entry->iokit_acct);
3441 				assert(new_entry->use_pmap);
3442 			}
3443 
3444 			if (is_submap) {
3445 				vm_map_t        submap;
3446 				boolean_t       submap_is_64bit;
3447 				boolean_t       use_pmap;
3448 
3449 				assert(new_entry->is_sub_map);
3450 				assert(!new_entry->use_pmap);
3451 				assert(!new_entry->iokit_acct);
3452 				submap = (vm_map_t) object;
3453 				submap_is_64bit = vm_map_is_64bit(submap);
3454 				use_pmap = vmk_flags.vmkf_nested_pmap;
3455 #ifndef NO_NESTED_PMAP
3456 				if (use_pmap && submap->pmap == NULL) {
3457 					ledger_t ledger = map->pmap->ledger;
3458 					/* we need a sub pmap to nest... */
3459 					submap->pmap = pmap_create_options(ledger, 0,
3460 					    submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3461 					if (submap->pmap == NULL) {
3462 						/* let's proceed without nesting... */
3463 					}
3464 #if defined(__arm64__)
3465 					else {
3466 						pmap_set_nested(submap->pmap);
3467 					}
3468 #endif
3469 				}
3470 				if (use_pmap && submap->pmap != NULL) {
3471 					if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3472 						DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3473 						kr = KERN_FAILURE;
3474 					} else {
3475 						kr = pmap_nest(map->pmap,
3476 						    submap->pmap,
3477 						    tmp_start,
3478 						    tmp_end - tmp_start);
3479 					}
3480 					if (kr != KERN_SUCCESS) {
3481 						printf("vm_map_enter: "
3482 						    "pmap_nest(0x%llx,0x%llx) "
3483 						    "error 0x%x\n",
3484 						    (long long)tmp_start,
3485 						    (long long)tmp_end,
3486 						    kr);
3487 					} else {
3488 						/* we're now nested ! */
3489 						new_entry->use_pmap = TRUE;
3490 						pmap_empty = FALSE;
3491 					}
3492 				}
3493 #endif /* NO_NESTED_PMAP */
3494 			}
3495 			entry = new_entry;
3496 
3497 			if (superpage_size) {
3498 				vm_page_t pages, m;
3499 				vm_object_t sp_object;
3500 				vm_object_offset_t sp_offset;
3501 
3502 				assert(object == VM_OBJECT_NULL);
3503 				VME_OFFSET_SET(entry, 0);
3504 
3505 				/* allocate one superpage */
3506 				kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3507 				if (kr != KERN_SUCCESS) {
3508 					/* deallocate whole range... */
3509 					new_mapping_established = TRUE;
3510 					/* ... but only up to "tmp_end" */
3511 					size -= end - tmp_end;
3512 					RETURN(kr);
3513 				}
3514 
3515 				/* create one vm_object per superpage */
3516 				sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3517 				vm_object_lock(sp_object);
3518 				sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3519 				VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3520 				sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3521 				VME_OBJECT_SET(entry, sp_object, false, 0);
3522 				assert(entry->use_pmap);
3523 
3524 				/* enter the base pages into the object */
3525 				for (sp_offset = 0;
3526 				    sp_offset < SUPERPAGE_SIZE;
3527 				    sp_offset += PAGE_SIZE) {
3528 					m = pages;
3529 					pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3530 					pages = NEXT_PAGE(m);
3531 					*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3532 					vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3533 				}
3534 				vm_object_unlock(sp_object);
3535 			}
3536 		} while (tmp_end != tmp2_end &&
3537 		    (tmp_start = tmp_end) &&
3538 		    (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3539 		    tmp_end + chunk_size : tmp2_end));
3540 	}
3541 
3542 	new_mapping_established = TRUE;
3543 
3544 BailOut:
3545 	assert(map_locked == TRUE);
3546 
3547 	/*
3548 	 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3549 	 * If we have identified and possibly established the new mapping(s),
3550 	 * make sure we did not go beyond the address space limit.
3551 	 */
3552 	if (result == KERN_SUCCESS) {
3553 		if (map->size_limit != RLIM_INFINITY &&
3554 		    map->size > map->size_limit) {
3555 			/*
3556 			 * Establishing the requested mappings would exceed
3557 			 * the process's RLIMIT_AS limit: fail with
3558 			 * KERN_NO_SPACE.
3559 			 */
3560 			result = KERN_NO_SPACE;
3561 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3562 			    proc_selfpid(),
3563 			    (get_bsdtask_info(current_task())
3564 			    ? proc_name_address(get_bsdtask_info(current_task()))
3565 			    : "?"),
3566 			    __FUNCTION__,
3567 			    (uint64_t) map->size,
3568 			    (uint64_t) map->size_limit);
3569 			DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3570 			    vm_map_size_t, map->size,
3571 			    uint64_t, map->size_limit);
3572 			vm_map_enter_RLIMIT_AS_count++;
3573 		} else if (map->data_limit != RLIM_INFINITY &&
3574 		    map->size > map->data_limit) {
3575 			/*
3576 			 * Establishing the requested mappings would exceed
3577 			 * the process's RLIMIT_DATA limit: fail with
3578 			 * KERN_NO_SPACE.
3579 			 */
3580 			result = KERN_NO_SPACE;
3581 			printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3582 			    proc_selfpid(),
3583 			    (get_bsdtask_info(current_task())
3584 			    ? proc_name_address(get_bsdtask_info(current_task()))
3585 			    : "?"),
3586 			    __FUNCTION__,
3587 			    (uint64_t) map->size,
3588 			    (uint64_t) map->data_limit);
3589 			DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3590 			    vm_map_size_t, map->size,
3591 			    uint64_t, map->data_limit);
3592 			vm_map_enter_RLIMIT_DATA_count++;
3593 		}
3594 	}
3595 
3596 	if (result == KERN_SUCCESS) {
3597 		vm_prot_t pager_prot;
3598 		memory_object_t pager;
3599 
3600 #if DEBUG
3601 		if (pmap_empty &&
3602 		    !(vmk_flags.vmkf_no_pmap_check)) {
3603 			assert(pmap_is_empty(map->pmap,
3604 			    *address,
3605 			    *address + size));
3606 		}
3607 #endif /* DEBUG */
3608 
3609 		/*
3610 		 * For "named" VM objects, let the pager know that the
3611 		 * memory object is being mapped.  Some pagers need to keep
3612 		 * track of this, to know when they can reclaim the memory
3613 		 * object, for example.
3614 		 * VM calls memory_object_map() for each mapping (specifying
3615 		 * the protection of each mapping) and calls
3616 		 * memory_object_last_unmap() when all the mappings are gone.
3617 		 */
3618 		pager_prot = max_protection;
3619 		if (needs_copy) {
3620 			/*
3621 			 * Copy-On-Write mapping: won't modify
3622 			 * the memory object.
3623 			 */
3624 			pager_prot &= ~VM_PROT_WRITE;
3625 		}
3626 		if (!is_submap &&
3627 		    object != VM_OBJECT_NULL &&
3628 		    object->named &&
3629 		    object->pager != MEMORY_OBJECT_NULL) {
3630 			vm_object_lock(object);
3631 			pager = object->pager;
3632 			if (object->named &&
3633 			    pager != MEMORY_OBJECT_NULL) {
3634 				assert(object->pager_ready);
3635 				vm_object_mapping_wait(object, THREAD_UNINT);
3636 				vm_object_mapping_begin(object);
3637 				vm_object_unlock(object);
3638 
3639 				kr = memory_object_map(pager, pager_prot);
3640 				assert(kr == KERN_SUCCESS);
3641 
3642 				vm_object_lock(object);
3643 				vm_object_mapping_end(object);
3644 			}
3645 			vm_object_unlock(object);
3646 		}
3647 	}
3648 
3649 	assert(map_locked == TRUE);
3650 
3651 	if (new_mapping_established) {
3652 		/*
3653 		 * If we release the map lock for any reason below,
3654 		 * another thread could deallocate our new mapping,
3655 		 * releasing the caller's reference on "caller_object",
3656 		 * which was transferred to the mapping.
3657 		 * If this was the only reference, the object could be
3658 		 * destroyed.
3659 		 *
3660 		 * We need to take an extra reference on "caller_object"
3661 		 * to keep it alive if we need to return the caller's
3662 		 * reference to the caller in case of failure.
3663 		 */
3664 		if (is_submap) {
3665 			vm_map_reference((vm_map_t)caller_object);
3666 		} else {
3667 			vm_object_reference(caller_object);
3668 		}
3669 	}
3670 
3671 	if (!keep_map_locked) {
3672 		vm_map_unlock(map);
3673 		map_locked = FALSE;
3674 		entry = VM_MAP_ENTRY_NULL;
3675 		new_entry = VM_MAP_ENTRY_NULL;
3676 	}
3677 
3678 	/*
3679 	 * We can't hold the map lock if we enter this block.
3680 	 */
3681 
3682 	if (result == KERN_SUCCESS) {
3683 		/*	Wire down the new entry if the user
3684 		 *	requested all new map entries be wired.
3685 		 */
3686 		if ((map->wiring_required) || (superpage_size)) {
3687 			assert(!keep_map_locked);
3688 			pmap_empty = FALSE; /* pmap won't be empty */
3689 			kr = vm_map_wire_nested(map, start, end,
3690 			    cur_protection, VM_KERN_MEMORY_MLOCK,
3691 			    TRUE, PMAP_NULL, 0, NULL);
3692 			result = kr;
3693 		}
3694 
3695 	}
3696 
3697 	if (result != KERN_SUCCESS) {
3698 		if (new_mapping_established) {
3699 			vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3700 
3701 			/*
3702 			 * We have to get rid of the new mappings since we
3703 			 * won't make them available to the user.
3704 			 * Try and do that atomically, to minimize the risk
3705 			 * that someone else creates new mappings in that range.
3706 			 */
3707 			if (!map_locked) {
3708 				vm_map_lock(map);
3709 				map_locked = TRUE;
3710 			}
3711 			remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3712 			remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3713 			if (permanent) {
3714 				remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3715 			}
3716 			(void) vm_map_delete(map,
3717 			    *address, *address + size,
3718 			    remove_flags,
3719 			    KMEM_GUARD_NONE, &zap_new_list);
3720 		}
3721 
3722 		if (vm_map_zap_first_entry(&zap_old_list)) {
3723 			vm_map_entry_t entry1, entry2;
3724 
3725 			/*
3726 			 * The new mapping failed.  Attempt to restore
3727 			 * the old mappings, saved in the "zap_old_map".
3728 			 */
3729 			if (!map_locked) {
3730 				vm_map_lock(map);
3731 				map_locked = TRUE;
3732 			}
3733 
3734 			/* first check if the coast is still clear */
3735 			start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3736 			end   = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3737 
3738 			if (vm_map_lookup_entry(map, start, &entry1) ||
3739 			    vm_map_lookup_entry(map, end, &entry2) ||
3740 			    entry1 != entry2) {
3741 				/*
3742 				 * Part of that range has already been
3743 				 * re-mapped:  we can't restore the old
3744 				 * mappings...
3745 				 */
3746 				vm_map_enter_restore_failures++;
3747 			} else {
3748 				/*
3749 				 * Transfer the saved map entries from
3750 				 * "zap_old_map" to the original "map",
3751 				 * inserting them all after "entry1".
3752 				 */
3753 				while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3754 					vm_map_size_t entry_size;
3755 
3756 					entry_size = (entry2->vme_end -
3757 					    entry2->vme_start);
3758 					vm_map_store_entry_link(map, entry1, entry2,
3759 					    VM_MAP_KERNEL_FLAGS_NONE);
3760 					map->size += entry_size;
3761 					entry1 = entry2;
3762 				}
3763 				if (map->wiring_required) {
3764 					/*
3765 					 * XXX TODO: we should rewire the
3766 					 * old pages here...
3767 					 */
3768 				}
3769 				vm_map_enter_restore_successes++;
3770 			}
3771 		}
3772 	}
3773 
3774 	/*
3775 	 * The caller is responsible for releasing the lock if it requested to
3776 	 * keep the map locked.
3777 	 */
3778 	if (map_locked && !keep_map_locked) {
3779 		vm_map_unlock(map);
3780 	}
3781 
3782 	vm_map_zap_dispose(&zap_old_list);
3783 	vm_map_zap_dispose(&zap_new_list);
3784 
3785 	if (new_mapping_established) {
3786 		/*
3787 		 * The caller had a reference on "caller_object" and we
3788 		 * transferred that reference to the mapping.
3789 		 * We also took an extra reference on "caller_object" to keep
3790 		 * it alive while the map was unlocked.
3791 		 */
3792 		if (result == KERN_SUCCESS) {
3793 			/*
3794 			 * On success, the caller's reference on the object gets
3795 			 * transferred to the mapping.
3796 			 * Release our extra reference.
3797 			 */
3798 			if (is_submap) {
3799 				vm_map_deallocate((vm_map_t)caller_object);
3800 			} else {
3801 				vm_object_deallocate(caller_object);
3802 			}
3803 		} else {
3804 			/*
3805 			 * On error, the caller expects to still have a
3806 			 * reference on the object it gave us.
3807 			 * Let's use our extra reference for that.
3808 			 */
3809 		}
3810 	}
3811 
3812 	return result;
3813 
3814 #undef  RETURN
3815 }
3816 
/*
 * Counters for the prefault optimization.
 */
/* number of pages handled by the prefault path (name-derived; usage not in this chunk) */
int64_t vm_prefault_nb_pages = 0;
/* number of times the prefault path bailed out early (name-derived; usage not in this chunk) */
int64_t vm_prefault_nb_bailout = 0;
3822 
3823 static kern_return_t
vm_map_enter_adjust_offset(vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_offset_t quantity)3824 vm_map_enter_adjust_offset(
3825 	vm_object_offset_t *obj_offs,
3826 	vm_object_offset_t *obj_end,
3827 	vm_object_offset_t  quantity)
3828 {
3829 	if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3830 	    os_add_overflow(*obj_end, quantity, obj_end) ||
3831 	    vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3832 		return KERN_INVALID_ARGUMENT;
3833 	}
3834 
3835 	return KERN_SUCCESS;
3836 }
3837 
/*
 * Sanitize the caller-supplied ("unsafe", *_u) parameters of
 * vm_map_enter_mem_object() into validated kernel values.
 *
 * On success, the out-parameters are filled in:
 *   map_addr / map_size : validated mapping address and size
 *   mask                : validated alignment mask
 *   obj_offs / obj_end  : validated offset range within the backing object
 *   obj_size            : page-rounded size of the object range
 *   cur_protection / max_protection / inheritance : validated VM attributes
 *
 * Returns KERN_SUCCESS, or a sanitizer error (the caller translates it
 * with vm_sanitize_get_kr() before returning to user space).
 */
static inline kern_return_t
vm_map_enter_mem_object_sanitize(
	vm_map_t                target_map,
	vm_map_offset_ut        address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_object_offset_ut     offset_u,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u,
	vm_map_kernel_flags_t   vmk_flags,
	ipc_port_t              port,
	vm_map_address_t       *map_addr,
	vm_map_size_t          *map_size,
	vm_map_offset_t        *mask,
	vm_object_offset_t     *obj_offs,
	vm_object_offset_t     *obj_end,
	vm_object_size_t       *obj_size,
	vm_prot_t              *cur_protection,
	vm_prot_t              *max_protection,
	vm_inherit_t           *inheritance)
{
	kern_return_t           result;

	/*
	 * Validate the protections.  VM_PROT_IS_MASK is explicitly allowed
	 * through here because the caller supports "masked" protections
	 * (it strips the bit itself, see mask_cur_protection /
	 * mask_max_protection in vm_map_enter_mem_object()).
	 */
	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
	    VM_PROT_IS_MASK, cur_protection,
	    max_protection);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	/* Validate the inheritance attribute. */
	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
	    inheritance);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	/* Validate the alignment mask. */
	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	if (vmk_flags.vmf_fixed) {
		vm_map_address_t        map_end;

		/*
		 * VM_FLAGS_FIXED: the address matters, so validate the whole
		 * [address, address + size) range against the target map,
		 * re-aligning the start; a zero size fails.
		 */
		result = vm_sanitize_addr_size(address_u, initial_size_u,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
		    target_map,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
		    map_addr, &map_end, map_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	} else {
		/*
		 * VM_FLAGS_ANYWHERE: the address is only a starting hint,
		 * so it is sanitized on its own (against the target map);
		 * only the size needs full validation (zero size fails).
		 */
		*map_addr = vm_sanitize_addr(target_map, address_u);
		result = vm_sanitize_size(0, initial_size_u,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	}

	/*
	 * Round the mapping size up to the VM object page size; a result
	 * of 0 means the rounding wrapped, which is rejected.
	 */
	*obj_size = vm_object_round_page(*map_size);
	if (__improbable(*obj_size == 0)) {
		return KERN_INVALID_ARGUMENT;
	}

	if (IP_VALID(port)) {
		/*
		 * Port-backed mapping: validate the offset range
		 * [offset, offset + obj_size) against PAGE_MASK, keeping
		 * the unaligned values (the caller deals with alignment
		 * adjustments itself later).
		 */
		result = vm_sanitize_addr_size(offset_u, *obj_size,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
		    PAGE_MASK,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
		    obj_offs, obj_end, obj_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	} else {
		/* No backing port: the offset range starts at 0. */
		*obj_offs = 0;
		*obj_end  = *obj_size;
	}

	return KERN_SUCCESS;
}
3923 
3924 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset_u,boolean_t copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,upl_page_list_ptr_t page_list,unsigned int page_list_count)3925 vm_map_enter_mem_object(
3926 	vm_map_t                target_map,
3927 	vm_map_offset_ut       *address_u,
3928 	vm_map_size_ut          initial_size_u,
3929 	vm_map_offset_ut        mask_u,
3930 	vm_map_kernel_flags_t   vmk_flags,
3931 	ipc_port_t              port,
3932 	vm_object_offset_ut     offset_u,
3933 	boolean_t               copy,
3934 	vm_prot_ut              cur_protection_u,
3935 	vm_prot_ut              max_protection_u,
3936 	vm_inherit_ut           inheritance_u,
3937 	upl_page_list_ptr_t     page_list,
3938 	unsigned int            page_list_count)
3939 {
3940 	vm_map_offset_t         mask, address;
3941 	vm_prot_t               cur_protection;
3942 	vm_prot_t               max_protection;
3943 	vm_inherit_t            inheritance;
3944 	vm_map_address_t        map_addr, map_mask;
3945 	vm_map_size_t           map_size;
3946 	vm_object_t             object = VM_OBJECT_NULL;
3947 	vm_object_offset_t      obj_offs, obj_end;
3948 	vm_object_size_t        obj_size;
3949 	kern_return_t           result;
3950 	boolean_t               mask_cur_protection, mask_max_protection;
3951 	boolean_t               kernel_prefault, try_prefault = (page_list_count != 0);
3952 	vm_map_offset_t         offset_in_mapping = 0;
3953 
3954 	if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3955 		/* XXX TODO4K prefaulting depends on page size... */
3956 		try_prefault = FALSE;
3957 	}
3958 
3959 	/*
3960 	 * Check arguments for validity
3961 	 */
3962 	if ((target_map == VM_MAP_NULL) ||
3963 	    (try_prefault && (copy || !page_list))) {
3964 		return KERN_INVALID_ARGUMENT;
3965 	}
3966 
3967 	map_mask = vm_map_page_mask(target_map);
3968 
3969 	/*
3970 	 * Sanitize any input parameters that are addr/size/prot/inherit
3971 	 */
3972 	result = vm_map_enter_mem_object_sanitize(
3973 		target_map,
3974 		*address_u,
3975 		initial_size_u,
3976 		mask_u,
3977 		offset_u,
3978 		cur_protection_u,
3979 		max_protection_u,
3980 		inheritance_u,
3981 		vmk_flags,
3982 		port,
3983 		&map_addr,
3984 		&map_size,
3985 		&mask,
3986 		&obj_offs,
3987 		&obj_end,
3988 		&obj_size,
3989 		&cur_protection,
3990 		&max_protection,
3991 		&inheritance);
3992 	if (__improbable(result != KERN_SUCCESS)) {
3993 		return vm_sanitize_get_kr(result);
3994 	}
3995 
3996 	assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
3997 	vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
3998 
3999 	mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4000 	mask_max_protection = max_protection & VM_PROT_IS_MASK;
4001 	cur_protection &= ~VM_PROT_IS_MASK;
4002 	max_protection &= ~VM_PROT_IS_MASK;
4003 
4004 #if __arm64__
4005 	if (cur_protection & VM_PROT_EXECUTE) {
4006 		cur_protection |= VM_PROT_READ;
4007 	}
4008 #endif /* __arm64__ */
4009 
4010 	/*
4011 	 * Find the vm object (if any) corresponding to this port.
4012 	 */
4013 	if (!IP_VALID(port)) {
4014 		object = VM_OBJECT_NULL;
4015 		copy = FALSE;
4016 	} else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4017 		vm_named_entry_t        named_entry;
4018 		vm_object_offset_t      data_offset;
4019 		vm_object_size_t        initial_size;
4020 
4021 		named_entry = mach_memory_entry_from_port(port);
4022 
4023 		if (vmk_flags.vmf_return_data_addr ||
4024 		    vmk_flags.vmf_return_4k_data_addr) {
4025 			data_offset = named_entry->data_offset;
4026 			result = vm_map_enter_adjust_offset(&obj_offs,
4027 			    &obj_end, data_offset);
4028 			if (__improbable(result)) {
4029 				return result;
4030 			}
4031 		} else {
4032 			data_offset = 0;
4033 		}
4034 
4035 		/* a few checks to make sure user is obeying rules */
4036 		if (mask_max_protection) {
4037 			max_protection &= named_entry->protection;
4038 		}
4039 		if (mask_cur_protection) {
4040 			cur_protection &= named_entry->protection;
4041 		}
4042 		if ((named_entry->protection & max_protection) !=
4043 		    max_protection) {
4044 			return KERN_INVALID_RIGHT;
4045 		}
4046 		if ((named_entry->protection & cur_protection) !=
4047 		    cur_protection) {
4048 			return KERN_INVALID_RIGHT;
4049 		}
4050 
4051 		/*
4052 		 * unwrap is safe because we know obj_size is larger and doesn't
4053 		 * overflow
4054 		 */
4055 		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4056 		if (named_entry->size < obj_offs + initial_size) {
4057 			return KERN_INVALID_ARGUMENT;
4058 		}
4059 
4060 		/* for a vm_map_copy, we can only map it whole */
4061 		if (named_entry->is_copy &&
4062 		    (obj_size != named_entry->size) &&
4063 		    (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4064 			/* XXX FBDP use the rounded size... */
4065 			obj_end += named_entry->size - obj_size;
4066 			obj_size = named_entry->size;
4067 		}
4068 
4069 		if (named_entry->offset) {
4070 			/*
4071 			 * the caller's parameter offset is defined to be the
4072 			 * offset from beginning of named entry offset in object
4073 			 *
4074 			 * Because we checked above that
4075 			 *   obj_offs + obj_size < named_entry_size
4076 			 * these overflow checks should be redundant...
4077 			 */
4078 			result = vm_map_enter_adjust_offset(&obj_offs,
4079 			    &obj_end, named_entry->offset);
4080 			if (__improbable(result)) {
4081 				return result;
4082 			}
4083 		}
4084 
4085 		if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4086 			/*
4087 			 * Let's not map more than requested;
4088 			 * vm_map_enter() will handle this "not map-aligned"
4089 			 * case.
4090 			 */
4091 			map_size = obj_size;
4092 		}
4093 
4094 		named_entry_lock(named_entry);
4095 		if (named_entry->is_sub_map) {
4096 			vm_map_t                submap;
4097 
4098 			if (vmk_flags.vmf_return_data_addr ||
4099 			    vmk_flags.vmf_return_4k_data_addr) {
4100 				panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4101 			}
4102 
4103 			submap = named_entry->backing.map;
4104 			vm_map_reference(submap);
4105 			named_entry_unlock(named_entry);
4106 
4107 			vmk_flags.vmkf_submap = TRUE;
4108 			result = vm_map_enter(target_map,
4109 			    &map_addr,
4110 			    map_size,
4111 			    mask,
4112 			    vmk_flags,
4113 			    (vm_object_t)(uintptr_t) submap,
4114 			    obj_offs,
4115 			    copy,
4116 			    cur_protection,
4117 			    max_protection,
4118 			    inheritance);
4119 			if (result != KERN_SUCCESS) {
4120 				vm_map_deallocate(submap);
4121 				return result;
4122 			}
4123 			/*
4124 			 * No need to lock "submap" just to check its
4125 			 * "mapped" flag: that flag is never reset
4126 			 * once it's been set and if we race, we'll
4127 			 * just end up setting it twice, which is OK.
4128 			 */
4129 			if (submap->mapped_in_other_pmaps == FALSE &&
4130 			    vm_map_pmap(submap) != PMAP_NULL &&
4131 			    vm_map_pmap(submap) !=
4132 			    vm_map_pmap(target_map)) {
4133 				/*
4134 				 * This submap is being mapped in a map
4135 				 * that uses a different pmap.
4136 				 * Set its "mapped_in_other_pmaps" flag
4137 				 * to indicate that we now need to
4138 				 * remove mappings from all pmaps rather
4139 				 * than just the submap's pmap.
4140 				 */
4141 				vm_map_lock(submap);
4142 				submap->mapped_in_other_pmaps = TRUE;
4143 				vm_map_unlock(submap);
4144 			}
4145 			address = map_addr;
4146 			goto out;
4147 		} else if (named_entry->is_copy) {
4148 			kern_return_t   kr;
4149 			vm_map_copy_t   copy_map;
4150 			vm_map_entry_t  copy_entry;
4151 			vm_map_offset_t copy_addr;
4152 			vm_map_copy_t   target_copy_map;
4153 			vm_map_offset_t overmap_start, overmap_end;
4154 			vm_map_offset_t trimmed_start;
4155 			vm_map_size_t   target_size;
4156 
4157 			if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4158 			    (VM_FLAGS_FIXED |
4159 			    VM_FLAGS_ANYWHERE |
4160 			    VM_FLAGS_OVERWRITE |
4161 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4162 			    VM_FLAGS_RETURN_DATA_ADDR))) {
4163 				named_entry_unlock(named_entry);
4164 				return KERN_INVALID_ARGUMENT;
4165 			}
4166 
4167 			copy_map = named_entry->backing.copy;
4168 			assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4169 			if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4170 				/* unsupported type; should not happen */
4171 				printf("vm_map_enter_mem_object: "
4172 				    "memory_entry->backing.copy "
4173 				    "unsupported type 0x%x\n",
4174 				    copy_map->type);
4175 				named_entry_unlock(named_entry);
4176 				return KERN_INVALID_ARGUMENT;
4177 			}
4178 
4179 			if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4180 				DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4181 			}
4182 
4183 			if (vmk_flags.vmf_return_data_addr ||
4184 			    vmk_flags.vmf_return_4k_data_addr) {
4185 				offset_in_mapping = obj_offs & map_mask;
4186 				if (vmk_flags.vmf_return_4k_data_addr) {
4187 					offset_in_mapping &= ~((signed)(0xFFF));
4188 				}
4189 			}
4190 
4191 			target_copy_map = VM_MAP_COPY_NULL;
4192 			target_size = copy_map->size;
4193 			overmap_start = 0;
4194 			overmap_end = 0;
4195 			trimmed_start = 0;
4196 			if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4197 				DEBUG4K_ADJUST("adjusting...\n");
4198 				kr = vm_map_copy_adjust_to_target(
4199 					copy_map,
4200 					obj_offs /* includes data_offset */,
4201 					initial_size,
4202 					target_map,
4203 					copy,
4204 					&target_copy_map,
4205 					&overmap_start,
4206 					&overmap_end,
4207 					&trimmed_start);
4208 				if (kr != KERN_SUCCESS) {
4209 					named_entry_unlock(named_entry);
4210 					return kr;
4211 				}
4212 				target_size = target_copy_map->size;
4213 				if (trimmed_start >= data_offset) {
4214 					data_offset = obj_offs & VM_MAP_PAGE_MASK(target_map);
4215 				} else {
4216 					data_offset -= trimmed_start;
4217 				}
4218 			} else {
4219 				/*
4220 				 * Assert that the vm_map_copy is coming from the right
4221 				 * zone and hasn't been forged
4222 				 */
4223 				vm_map_copy_require(copy_map);
4224 				target_copy_map = copy_map;
4225 			}
4226 
4227 			vm_map_kernel_flags_t rsv_flags = vmk_flags;
4228 
4229 			vm_map_kernel_flags_and_vmflags(&rsv_flags,
4230 			    (VM_FLAGS_FIXED |
4231 			    VM_FLAGS_ANYWHERE |
4232 			    VM_FLAGS_OVERWRITE |
4233 			    VM_FLAGS_RETURN_4K_DATA_ADDR |
4234 			    VM_FLAGS_RETURN_DATA_ADDR));
4235 
4236 			/* reserve a contiguous range */
4237 			kr = vm_map_enter(target_map,
4238 			    &map_addr,
4239 			    vm_map_round_page(target_size, map_mask),
4240 			    mask,
4241 			    rsv_flags,
4242 			    VM_OBJECT_NULL,
4243 			    0,
4244 			    FALSE,               /* copy */
4245 			    cur_protection,
4246 			    max_protection,
4247 			    inheritance);
4248 			if (kr != KERN_SUCCESS) {
4249 				DEBUG4K_ERROR("kr 0x%x\n", kr);
4250 				if (target_copy_map != copy_map) {
4251 					vm_map_copy_discard(target_copy_map);
4252 					target_copy_map = VM_MAP_COPY_NULL;
4253 				}
4254 				named_entry_unlock(named_entry);
4255 				return kr;
4256 			}
4257 
4258 			copy_addr = map_addr;
4259 
4260 			for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4261 			    copy_entry != vm_map_copy_to_entry(target_copy_map);
4262 			    copy_entry = copy_entry->vme_next) {
4263 				vm_map_t                copy_submap = VM_MAP_NULL;
4264 				vm_object_t             copy_object = VM_OBJECT_NULL;
4265 				vm_map_size_t           copy_size;
4266 				vm_object_offset_t      copy_offset;
4267 				boolean_t               do_copy = false;
4268 
4269 				if (copy_entry->is_sub_map) {
4270 					copy_submap = VME_SUBMAP(copy_entry);
4271 					copy_object = (vm_object_t)copy_submap;
4272 				} else {
4273 					copy_object = VME_OBJECT(copy_entry);
4274 				}
4275 				copy_offset = VME_OFFSET(copy_entry);
4276 				copy_size = (copy_entry->vme_end -
4277 				    copy_entry->vme_start);
4278 
4279 				/* sanity check */
4280 				if ((copy_addr + copy_size) >
4281 				    (map_addr +
4282 				    overmap_start + overmap_end +
4283 				    named_entry->size /* XXX full size */)) {
4284 					/* over-mapping too much !? */
4285 					kr = KERN_INVALID_ARGUMENT;
4286 					DEBUG4K_ERROR("kr 0x%x\n", kr);
4287 					/* abort */
4288 					break;
4289 				}
4290 
4291 				/* take a reference on the object */
4292 				if (copy_entry->is_sub_map) {
4293 					vm_map_reference(copy_submap);
4294 				} else {
4295 					if (!copy &&
4296 					    copy_object != VM_OBJECT_NULL &&
4297 					    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4298 						bool is_writable;
4299 
4300 						/*
4301 						 * We need to resolve our side of this
4302 						 * "symmetric" copy-on-write now; we
4303 						 * need a new object to map and share,
4304 						 * instead of the current one which
4305 						 * might still be shared with the
4306 						 * original mapping.
4307 						 *
4308 						 * Note: A "vm_map_copy_t" does not
4309 						 * have a lock but we're protected by
4310 						 * the named entry's lock here.
4311 						 */
4312 						// assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4313 						VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4314 						assert(copy_object != VME_OBJECT(copy_entry));
4315 						is_writable = false;
4316 						if (copy_entry->protection & VM_PROT_WRITE) {
4317 							is_writable = true;
4318 #if __arm64e__
4319 						} else if (copy_entry->used_for_tpro) {
4320 							is_writable = true;
4321 #endif /* __arm64e__ */
4322 						}
4323 						if (!copy_entry->needs_copy && is_writable) {
4324 							vm_prot_t prot;
4325 
4326 							prot = copy_entry->protection & ~VM_PROT_WRITE;
4327 							vm_object_pmap_protect(copy_object,
4328 							    copy_offset,
4329 							    copy_size,
4330 							    PMAP_NULL,
4331 							    PAGE_SIZE,
4332 							    0,
4333 							    prot);
4334 						}
4335 						copy_entry->needs_copy = FALSE;
4336 						copy_entry->is_shared = TRUE;
4337 						copy_object = VME_OBJECT(copy_entry);
4338 						copy_offset = VME_OFFSET(copy_entry);
4339 						vm_object_lock(copy_object);
4340 						/* we're about to make a shared mapping of this object */
4341 						copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4342 						VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4343 						vm_object_unlock(copy_object);
4344 					}
4345 
4346 					if (copy_object != VM_OBJECT_NULL &&
4347 					    copy_object->named &&
4348 					    copy_object->pager != MEMORY_OBJECT_NULL &&
4349 					    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4350 						memory_object_t pager;
4351 						vm_prot_t       pager_prot;
4352 
4353 						/*
4354 						 * For "named" VM objects, let the pager know that the
4355 						 * memory object is being mapped.  Some pagers need to keep
4356 						 * track of this, to know when they can reclaim the memory
4357 						 * object, for example.
4358 						 * VM calls memory_object_map() for each mapping (specifying
4359 						 * the protection of each mapping) and calls
4360 						 * memory_object_last_unmap() when all the mappings are gone.
4361 						 */
4362 						pager_prot = max_protection;
4363 						if (copy) {
4364 							/*
4365 							 * Copy-On-Write mapping: won't modify the
4366 							 * memory object.
4367 							 */
4368 							pager_prot &= ~VM_PROT_WRITE;
4369 						}
4370 						vm_object_lock(copy_object);
4371 						pager = copy_object->pager;
4372 						if (copy_object->named &&
4373 						    pager != MEMORY_OBJECT_NULL &&
4374 						    copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4375 							assert(copy_object->pager_ready);
4376 							vm_object_mapping_wait(copy_object, THREAD_UNINT);
4377 							vm_object_mapping_begin(copy_object);
4378 							vm_object_unlock(copy_object);
4379 
4380 							kr = memory_object_map(pager, pager_prot);
4381 							assert(kr == KERN_SUCCESS);
4382 
4383 							vm_object_lock(copy_object);
4384 							vm_object_mapping_end(copy_object);
4385 						}
4386 						vm_object_unlock(copy_object);
4387 					}
4388 
4389 					/*
4390 					 *	Perform the copy if requested
4391 					 */
4392 
4393 					if (copy && copy_object != VM_OBJECT_NULL) {
4394 						vm_object_t             new_object;
4395 						vm_object_offset_t      new_offset;
4396 
4397 						result = vm_object_copy_strategically(copy_object, copy_offset,
4398 						    copy_size,
4399 						    false,                                   /* forking */
4400 						    &new_object, &new_offset,
4401 						    &do_copy);
4402 
4403 
4404 						if (result == KERN_MEMORY_RESTART_COPY) {
4405 							boolean_t success;
4406 							boolean_t src_needs_copy;
4407 
4408 							/*
4409 							 * XXX
4410 							 * We currently ignore src_needs_copy.
4411 							 * This really is the issue of how to make
4412 							 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4413 							 * non-kernel users to use. Solution forthcoming.
4414 							 * In the meantime, since we don't allow non-kernel
4415 							 * memory managers to specify symmetric copy,
4416 							 * we won't run into problems here.
4417 							 */
4418 							new_object = copy_object;
4419 							new_offset = copy_offset;
4420 							success = vm_object_copy_quickly(new_object,
4421 							    new_offset,
4422 							    copy_size,
4423 							    &src_needs_copy,
4424 							    &do_copy);
4425 							assert(success);
4426 							result = KERN_SUCCESS;
4427 						}
4428 						if (result != KERN_SUCCESS) {
4429 							kr = result;
4430 							break;
4431 						}
4432 
4433 						copy_object = new_object;
4434 						copy_offset = new_offset;
4435 						/*
4436 						 * No extra object reference for the mapping:
4437 						 * the mapping should be the only thing keeping
4438 						 * this new object alive.
4439 						 */
4440 					} else {
4441 						/*
4442 						 * We already have the right object
4443 						 * to map.
4444 						 */
4445 						copy_object = VME_OBJECT(copy_entry);
4446 						/* take an extra ref for the mapping below */
4447 						vm_object_reference(copy_object);
4448 					}
4449 				}
4450 
4451 				/*
4452 				 * If the caller does not want a specific
4453 				 * tag for this new mapping:  use
4454 				 * the tag of the original mapping.
4455 				 */
4456 				vm_map_kernel_flags_t vmk_remap_flags = {
4457 					.vmkf_submap = copy_entry->is_sub_map,
4458 				};
4459 
4460 				vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4461 				    vm_map_kernel_flags_vmflags(vmk_flags),
4462 				    vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4463 
4464 				/* over-map the object into destination */
4465 				vmk_remap_flags.vmf_fixed = true;
4466 				vmk_remap_flags.vmf_overwrite = true;
4467 
4468 				if (!copy && !copy_entry->is_sub_map) {
4469 					/*
4470 					 * copy-on-write should have been
4471 					 * resolved at this point, or we would
4472 					 * end up sharing instead of copying.
4473 					 */
4474 					assert(!copy_entry->needs_copy);
4475 				}
4476 #if XNU_TARGET_OS_OSX
4477 				if (copy_entry->used_for_jit) {
4478 					vmk_remap_flags.vmkf_map_jit = TRUE;
4479 				}
4480 #endif /* XNU_TARGET_OS_OSX */
4481 
4482 				kr = vm_map_enter(target_map,
4483 				    &copy_addr,
4484 				    copy_size,
4485 				    (vm_map_offset_t) 0,
4486 				    vmk_remap_flags,
4487 				    copy_object,
4488 				    copy_offset,
4489 				    ((copy_object == NULL)
4490 				    ? FALSE
4491 				    : (copy || copy_entry->needs_copy)),
4492 				    cur_protection,
4493 				    max_protection,
4494 				    inheritance);
4495 				if (kr != KERN_SUCCESS) {
4496 					DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4497 					if (copy_entry->is_sub_map) {
4498 						vm_map_deallocate(copy_submap);
4499 					} else {
4500 						vm_object_deallocate(copy_object);
4501 					}
4502 					/* abort */
4503 					break;
4504 				}
4505 
4506 				/* next mapping */
4507 				copy_addr += copy_size;
4508 			}
4509 
4510 			if (kr == KERN_SUCCESS) {
4511 				if (vmk_flags.vmf_return_data_addr ||
4512 				    vmk_flags.vmf_return_4k_data_addr) {
4513 					address = map_addr + offset_in_mapping;
4514 				} else {
4515 					address = map_addr;
4516 				}
4517 				if (overmap_start) {
4518 					address += overmap_start;
4519 					DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)address);
4520 				}
4521 			}
4522 			named_entry_unlock(named_entry);
4523 			if (target_copy_map != copy_map) {
4524 				vm_map_copy_discard(target_copy_map);
4525 				target_copy_map = VM_MAP_COPY_NULL;
4526 			}
4527 
4528 			if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4529 				/* deallocate the contiguous range */
4530 				vm_map_remove(target_map, map_addr,
4531 				    map_addr + map_size);
4532 			}
4533 			result = kr;
4534 			goto out;
4535 		}
4536 
4537 		if (named_entry->is_object) {
4538 			unsigned int    access;
4539 			unsigned int    wimg_mode;
4540 
4541 			/* we are mapping a VM object */
4542 
4543 			access = named_entry->access;
4544 
4545 			if (vmk_flags.vmf_return_data_addr ||
4546 			    vmk_flags.vmf_return_4k_data_addr) {
4547 				offset_in_mapping = obj_offs & map_mask;
4548 				if (vmk_flags.vmf_return_4k_data_addr) {
4549 					offset_in_mapping &= ~((signed)(0xFFF));
4550 				}
4551 				obj_offs = VM_MAP_TRUNC_PAGE(obj_offs, VM_MAP_PAGE_MASK(target_map));
4552 				map_size = VM_MAP_ROUND_PAGE((obj_offs + offset_in_mapping + initial_size) - obj_offs, VM_MAP_PAGE_MASK(target_map));
4553 			}
4554 
4555 			object = vm_named_entry_to_vm_object(named_entry);
4556 			assert(object != VM_OBJECT_NULL);
4557 			vm_object_lock(object);
4558 			named_entry_unlock(named_entry);
4559 
4560 			vm_object_reference_locked(object);
4561 
4562 			wimg_mode = object->wimg_bits;
4563 			vm_prot_to_wimg(access, &wimg_mode);
4564 			if (object->wimg_bits != wimg_mode) {
4565 				vm_object_change_wimg_mode(object, wimg_mode);
4566 			}
4567 
4568 			vm_object_unlock(object);
4569 		} else {
4570 			panic("invalid VM named entry %p", named_entry);
4571 		}
4572 	} else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4573 		/*
4574 		 * JMM - This is temporary until we unify named entries
4575 		 * and raw memory objects.
4576 		 *
4577 		 * Detected fake ip_kotype for a memory object.  In
4578 		 * this case, the port isn't really a port at all, but
4579 		 * instead is just a raw memory object.
4580 		 */
4581 		if (vmk_flags.vmf_return_data_addr ||
4582 		    vmk_flags.vmf_return_4k_data_addr) {
4583 			panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4584 		}
4585 
4586 		object = memory_object_to_vm_object((memory_object_t)port);
4587 		if (object == VM_OBJECT_NULL) {
4588 			return KERN_INVALID_OBJECT;
4589 		}
4590 		vm_object_reference(object);
4591 
4592 		/* wait for object (if any) to be ready */
4593 		if (object != VM_OBJECT_NULL) {
4594 			if (is_kernel_object(object)) {
4595 				printf("Warning: Attempt to map kernel object"
4596 				    " by a non-private kernel entity\n");
4597 				return KERN_INVALID_OBJECT;
4598 			}
4599 			if (!object->pager_ready) {
4600 				vm_object_lock(object);
4601 
4602 				while (!object->pager_ready) {
4603 					vm_object_sleep(object,
4604 					    VM_OBJECT_EVENT_PAGER_READY,
4605 					    THREAD_UNINT,
4606 					    LCK_SLEEP_EXCLUSIVE);
4607 				}
4608 				vm_object_unlock(object);
4609 			}
4610 		}
4611 	} else {
4612 		return KERN_INVALID_OBJECT;
4613 	}
4614 
4615 	if (object != VM_OBJECT_NULL &&
4616 	    object->named &&
4617 	    object->pager != MEMORY_OBJECT_NULL &&
4618 	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4619 		memory_object_t pager;
4620 		vm_prot_t       pager_prot;
4621 		kern_return_t   kr;
4622 
4623 		/*
4624 		 * For "named" VM objects, let the pager know that the
4625 		 * memory object is being mapped.  Some pagers need to keep
4626 		 * track of this, to know when they can reclaim the memory
4627 		 * object, for example.
4628 		 * VM calls memory_object_map() for each mapping (specifying
4629 		 * the protection of each mapping) and calls
4630 		 * memory_object_last_unmap() when all the mappings are gone.
4631 		 */
4632 		pager_prot = max_protection;
4633 		if (copy) {
4634 			/*
4635 			 * Copy-On-Write mapping: won't modify the
4636 			 * memory object.
4637 			 */
4638 			pager_prot &= ~VM_PROT_WRITE;
4639 		}
4640 		vm_object_lock(object);
4641 		pager = object->pager;
4642 		if (object->named &&
4643 		    pager != MEMORY_OBJECT_NULL &&
4644 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4645 			assert(object->pager_ready);
4646 			vm_object_mapping_wait(object, THREAD_UNINT);
4647 			vm_object_mapping_begin(object);
4648 			vm_object_unlock(object);
4649 
4650 			kr = memory_object_map(pager, pager_prot);
4651 			assert(kr == KERN_SUCCESS);
4652 
4653 			vm_object_lock(object);
4654 			vm_object_mapping_end(object);
4655 		}
4656 		vm_object_unlock(object);
4657 	}
4658 
4659 	/*
4660 	 *	Perform the copy if requested
4661 	 */
4662 
4663 	if (copy) {
4664 		vm_object_t             new_object;
4665 		vm_object_offset_t      new_offset;
4666 
4667 		result = vm_object_copy_strategically(object,
4668 		    obj_offs,
4669 		    map_size,
4670 		    false,                                   /* forking */
4671 		    &new_object, &new_offset,
4672 		    &copy);
4673 
4674 
4675 		if (result == KERN_MEMORY_RESTART_COPY) {
4676 			boolean_t success;
4677 			boolean_t src_needs_copy;
4678 
4679 			/*
4680 			 * XXX
4681 			 * We currently ignore src_needs_copy.
4682 			 * This really is the issue of how to make
4683 			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4684 			 * non-kernel users to use. Solution forthcoming.
4685 			 * In the meantime, since we don't allow non-kernel
4686 			 * memory managers to specify symmetric copy,
4687 			 * we won't run into problems here.
4688 			 */
4689 			new_object = object;
4690 			new_offset = obj_offs;
4691 			success = vm_object_copy_quickly(new_object,
4692 			    new_offset,
4693 			    map_size,
4694 			    &src_needs_copy,
4695 			    &copy);
4696 			assert(success);
4697 			result = KERN_SUCCESS;
4698 		}
4699 		/*
4700 		 *	Throw away the reference to the
4701 		 *	original object, as it won't be mapped.
4702 		 */
4703 
4704 		vm_object_deallocate(object);
4705 
4706 		if (result != KERN_SUCCESS) {
4707 			return result;
4708 		}
4709 
4710 		object   = new_object;
4711 		obj_offs = new_offset;
4712 	}
4713 
4714 	/*
4715 	 * If non-kernel users want to try to prefault pages, the mapping and prefault
4716 	 * needs to be atomic.
4717 	 */
4718 	kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4719 	vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4720 
4721 	result = vm_map_enter(target_map,
4722 	    &map_addr, map_size,
4723 	    (vm_map_offset_t)mask,
4724 	    vmk_flags,
4725 	    object, obj_offs,
4726 	    copy,
4727 	    cur_protection, max_protection,
4728 	    inheritance);
4729 	if (result != KERN_SUCCESS) {
4730 		vm_object_deallocate(object);
4731 	}
4732 
4733 	/*
4734 	 * Try to prefault, and do not forget to release the vm map lock.
4735 	 */
4736 	if (result == KERN_SUCCESS && try_prefault) {
4737 		mach_vm_address_t va = map_addr;
4738 		kern_return_t kr = KERN_SUCCESS;
4739 		unsigned int i = 0;
4740 		int pmap_options;
4741 
4742 		pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4743 		if (object->internal) {
4744 			pmap_options |= PMAP_OPTIONS_INTERNAL;
4745 		}
4746 
4747 		for (i = 0; i < page_list_count; ++i) {
4748 			if (!UPL_VALID_PAGE(page_list, i)) {
4749 				if (kernel_prefault) {
4750 					assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4751 					result = KERN_MEMORY_ERROR;
4752 					break;
4753 				}
4754 			} else {
4755 				/*
4756 				 * If this function call failed, we should stop
4757 				 * trying to optimize, other calls are likely
4758 				 * going to fail too.
4759 				 *
4760 				 * We are not gonna report an error for such
4761 				 * failure though. That's an optimization, not
4762 				 * something critical.
4763 				 */
4764 				kr = pmap_enter_options(target_map->pmap,
4765 				    va, UPL_PHYS_PAGE(page_list, i),
4766 				    cur_protection, VM_PROT_NONE,
4767 				    0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4768 				if (kr != KERN_SUCCESS) {
4769 					OSIncrementAtomic64(&vm_prefault_nb_bailout);
4770 					if (kernel_prefault) {
4771 						result = kr;
4772 					}
4773 					break;
4774 				}
4775 				OSIncrementAtomic64(&vm_prefault_nb_pages);
4776 			}
4777 
4778 			/* Next virtual address */
4779 			va += PAGE_SIZE;
4780 		}
4781 		if (vmk_flags.vmkf_keep_map_locked) {
4782 			vm_map_unlock(target_map);
4783 		}
4784 	}
4785 
4786 	if (vmk_flags.vmf_return_data_addr ||
4787 	    vmk_flags.vmf_return_4k_data_addr) {
4788 		address = map_addr + offset_in_mapping;
4789 	} else {
4790 		address = map_addr;
4791 	}
4792 
4793 out:
4794 	if (result == KERN_SUCCESS) {
4795 #if KASAN
4796 		if (target_map->pmap == kernel_pmap) {
4797 			kasan_notify_address(map_addr, map_size);
4798 		}
4799 #endif
4800 		*address_u = vm_sanitize_wrap_addr(address);
4801 	}
4802 	return result;
4803 }
4804 
4805 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_ut * address,vm_map_size_ut initial_size,vm_map_offset_ut mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset,vm_prot_ut cur_protection,vm_prot_ut max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4806 vm_map_enter_mem_object_prefault(
4807 	vm_map_t                target_map,
4808 	vm_map_offset_ut       *address,
4809 	vm_map_size_ut          initial_size,
4810 	vm_map_offset_ut        mask,
4811 	vm_map_kernel_flags_t   vmk_flags,
4812 	ipc_port_t              port,
4813 	vm_object_offset_ut     offset,
4814 	vm_prot_ut              cur_protection,
4815 	vm_prot_ut              max_protection,
4816 	upl_page_list_ptr_t     page_list,
4817 	unsigned int            page_list_count)
4818 {
4819 	/* range_id is set by vm_map_enter_mem_object */
4820 	return vm_map_enter_mem_object(target_map,
4821 	           address,
4822 	           initial_size,
4823 	           mask,
4824 	           vmk_flags,
4825 	           port,
4826 	           offset,
4827 	           FALSE,
4828 	           cur_protection,
4829 	           max_protection,
4830 	           VM_INHERIT_DEFAULT,
4831 	           page_list,
4832 	           page_list_count);
4833 }
4834 
/*
 * vm_map_enter_mem_object_control_sanitize:
 *
 * Validate and convert the unsafe ("_u"/"_ut") caller-provided arguments
 * of vm_map_enter_mem_object_control() into their trusted counterparts.
 * The output parameters are only meaningful when KERN_SUCCESS is
 * returned; otherwise the error from the first failing sanitizer is
 * propagated to the caller.
 */
static inline kern_return_t
vm_map_enter_mem_object_control_sanitize(
	vm_map_t                target_map,
	vm_map_offset_ut        address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_object_offset_ut     offset_u,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_address_t       *map_addr,
	vm_map_size_t          *map_size,
	vm_map_offset_t        *mask,
	vm_object_offset_t     *obj_offs,
	vm_object_offset_t     *obj_end,
	vm_object_size_t       *obj_size,
	vm_prot_t              *cur_protection,
	vm_prot_t              *max_protection,
	vm_inherit_t           *inheritance)
{
	kern_return_t           kr;

	/* validate the current/maximum protection pair */
	kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
	    cur_protection, max_protection);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/* validate the inheritance attribute */
	kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
	    inheritance);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/* validate the alignment mask */
	kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}
	/*
	 * Ensure arithmetic doesn't overflow in vm_object space (kernel
	 * pages).
	 * We keep unaligned values for now. The call we eventually make to
	 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
	 * target_map pages or kernel pages. But this isn't enough to guarantee
	 * kernel space alignment.
	 */
	kr = vm_sanitize_addr_size(offset_u, initial_size_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
	    obj_offs, obj_end, obj_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/*
	 * There is no vm_sanitize_addr_size variant that also adjusts for
	 * a separate offset. Rather than create one for this one-off issue,
	 * we sanitize map_addr and map_size individually, relying on
	 * vm_sanitize_size to incorporate the offset. Then, we perform the
	 * overflow check manually below.
	 */
	*map_addr = vm_sanitize_addr(target_map, address_u);
	kr = vm_sanitize_size(offset_u, initial_size_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/*
	 * Ensure arithmetic doesn't overflow in target_map space.
	 * The computation of map_size above accounts for the possibility that
	 * offset_u might be unaligned in target_map space.
	 * (Only relevant for fixed mappings; for "anywhere" mappings the
	 * address is chosen later by vm_map_enter.)
	 */
	if (vmk_flags.vmf_fixed) {
		vm_map_address_t map_end;

		if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	return KERN_SUCCESS;
}
4921 
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object associated with the memory object control "control"
 * into "target_map", with the requested protections and inheritance.
 * If "needs_copy" is set, a copy of the object's contents is mapped
 * instead of the object itself.  On success, *address_u is updated with
 * the address of the mapped data; this routine always behaves as if
 * vmf_return_data_addr had been specified.
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_ut       *address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_map_kernel_flags_t   vmk_flags,
	memory_object_control_t control,
	vm_object_offset_ut     offset_u,
	boolean_t               needs_copy,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u)
{
	vm_map_offset_t         mask;
	vm_prot_t               cur_protection;
	vm_prot_t               max_protection;
	vm_inherit_t            inheritance;
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_offset_t      obj_offs, obj_end;
	vm_object_size_t        obj_size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;

	/*
	 * Check arguments for validity
	 */
	if (target_map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * We only support vmf_return_data_addr-like behavior.
	 */
	vmk_flags.vmf_return_data_addr = true;

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_enter_mem_object_control_sanitize(target_map,
	    *address_u,
	    initial_size_u,
	    mask_u,
	    offset_u,
	    cur_protection_u,
	    max_protection_u,
	    inheritance_u,
	    vmk_flags,
	    &map_addr,
	    &map_size,
	    &mask,
	    &obj_offs,
	    &obj_end,
	    &obj_size,
	    &cur_protection,
	    &max_protection,
	    &inheritance);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	/* resolve the control port to its underlying VM object */
	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	if (is_kernel_object(object)) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	vm_object_lock(object);
	/*
	 * Take a reference on the object for the new mapping.
	 * NOTE(review): this bumps ref_count directly under the object
	 * lock; other paths use vm_object_reference_locked() -- confirm
	 * the two are equivalent here.
	 */
	object->ref_count++;

	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped.  Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (needs_copy) {
		/* a copy-on-write mapping will never modify the object */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		vm_object_mapping_wait(object, THREAD_UNINT);
		vm_object_mapping_begin(object);
		/* drop the lock across the (possibly blocking) pager call */
		vm_object_unlock(object);

		kr = memory_object_map(pager, pager_prot);
		assert(kr == KERN_SUCCESS);

		vm_object_lock(object);
		vm_object_mapping_end(object);
	}
	vm_object_unlock(object);

	/*
	 *	Perform the copy if requested
	 */

	if (needs_copy) {
		vm_object_t             new_object;
		vm_object_offset_t      new_offset;

		result = vm_object_copy_strategically(object, obj_offs, obj_size,
		    false,                                   /* forking */
		    &new_object, &new_offset,
		    &needs_copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = obj_offs;
			success = vm_object_copy_quickly(new_object,
			    new_offset, obj_size,
			    &src_needs_copy,
			    &needs_copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 *	Throw away the reference to the
		 *	original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		/* map the copy instead of the original object */
		object   = new_object;
		obj_offs = new_offset;
	}

	result = vm_map_enter(target_map,
	    &map_addr, map_size,
	    (vm_map_offset_t)mask,
	    vmk_flags,
	    object,
	    obj_offs,
	    needs_copy,
	    cur_protection, max_protection,
	    inheritance);

	if (result == KERN_SUCCESS) {
		/*
		 * vmf_return_data_addr semantics: return the mapped address
		 * plus the sub-page offset of "obj_offs" in target_map pages.
		 */
		*address_u = vm_sanitize_wrap_addr(map_addr + (obj_offs & vm_map_page_mask(target_map)));
	} else {
		/* mapping failed: drop the reference taken above */
		vm_object_deallocate(object);
	}

	return result;
}
5102 
5103 
5104 /* Not used without nested pmaps */
5105 #ifndef NO_NESTED_PMAP
5106 /*
5107  * Clip and unnest a portion of a nested submap mapping.
5108  */
5109 
5110 
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	/* only a nested (use_pmap) submap entry can be unnested */
	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent. This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* the unnest range must lie entirely within this entry */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/*
	 * Clip the entry so it exactly covers [start_unnest, end_unnest);
	 * after each clip the map's first-free hint has to be recomputed.
	 */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* detach the nested pmap from this entry's (clipped) range */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	/* this entry no longer uses the submap's pmap directly */
	entry->use_pmap = FALSE;
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		/* re-tag: this range no longer shares a pmap */
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5191 #endif  /* NO_NESTED_PMAP */
5192 
/*
 * Panic helper: invoked when a clip operation would split a VM map
 * entry marked atomic (vme_atomic), which must never be subdivided.
 */
__abortlike
static void
__vm_map_clip_atomic_entry_panic(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t where)
{
	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end,
	    (uint64_t)where);
}
5206 
5207 /*
5208  *	vm_map_clip_start:	[ internal use only ]
5209  *
5210  *	Asserts that the given entry begins at or after
5211  *	the specified address; if necessary,
5212  *	it splits the entry into two.
5213  */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip.  Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		/*
		 * Before splitting an entry backed by a phys_contiguous
		 * object, remove all of its pmap mappings.
		 */
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* atomic entries must never be split */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* the split may have changed the map's first-free hint */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5266 
5267 
/*
 * vm_map_copy_clip_start:
 *
 * Clip "entry" (in a vm_map_copy's entry list) so that it starts at
 * "startaddr"; no-op if the entry already starts at or after it.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5273 
5274 /*
5275  *	This routine is called only when it is known that
5276  *	the entry must be split.
5277  */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Split off the front portion --
	 *	note that we must insert the new
	 *	entry BEFORE this one, so that
	 *	this entry has the specified starting
	 *	address.
	 */

	/* the split point must honor the map's page alignment */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* clone the entry: the clone becomes the front [vme_start, start) */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the original entry's object offset past the front part */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	if (__improbable(start >= entry->vme_end)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
	}
	assert(start < entry->vme_end);
	entry->vme_start = start;

#if VM_BTLOG_TAGS
	/* both halves now reference the tag backtrace: take an extra ref */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* both halves now reference the same submap/object: take a new ref */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5325 
5326 
5327 /*
5328  *	vm_map_clip_end:	[ internal use only ]
5329  *
5330  *	Asserts that the given entry ends at or before
5331  *	the specified address; if necessary,
5332  *	it splits the entry into two.
5333  */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		/*
		 * Before splitting an entry backed by a phys_contiguous
		 * object, remove all of its pmap mappings.
		 */
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* atomic entries must never be split */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* the split may have changed the map's first-free hint */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5392 
5393 
/*
 * vm_map_copy_clip_end:
 *
 * Clip "entry" (in a vm_map_copy's entry list) so that it ends at
 * "endaddr"; no-op if the entry already ends at or before it.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5399 
5400 /*
5401  *	This routine is called only when it is known that
5402  *	the entry must be split.
5403  */
static void
_vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end)
{
	vm_map_entry_t  new_entry;

	/*
	 *	Create a new entry and insert it
	 *	AFTER the specified entry
	 */

	/* the split point must honor the map's page alignment */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* clone the entry: the clone becomes the tail [end, vme_end) */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	if (__improbable(end <= entry->vme_start)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
	}
	assert(entry->vme_start < end);
	new_entry->vme_start = entry->vme_end = end;
	/* advance the clone's object offset past the front part */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

#if VM_BTLOG_TAGS
	/* both halves now reference the tag backtrace: take an extra ref */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry, new_entry);

	/* both halves now reference the same submap/object: take a new ref */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5448 
5449 
/*
 *	VM_MAP_RANGE_CHECK:	[ internal use only ]
 *
 *	Clamps the starting and ending region addresses into the
 *	valid range of the map.  Note that out-of-range input does
 *	NOT fail: "start" and "end" are silently adjusted into
 *	[vm_map_min, vm_map_max] and forced to satisfy start <= end.
 */
#define VM_MAP_RANGE_CHECK(map, start, end)     \
	MACRO_BEGIN                             \
	if (start < vm_map_min(map))            \
	        start = vm_map_min(map);        \
	if (end > vm_map_max(map))              \
	        end = vm_map_max(map);          \
	if (start > end)                        \
	        start = end;                    \
	MACRO_END
5465 
5466 /*
5467  *	vm_map_range_check:	[ internal use only ]
5468  *
5469  *	Check that the region defined by the specified start and
5470  *	end addresses are wholly contained within a single map
5471  *	entry or set of adjacent map entries of the spacified map,
5472  *	i.e. the specified region contains no unmapped space.
5473  *	If any or all of the region is unmapped, FALSE is returned.
5474  *	Otherwise, TRUE is returned and if the output argument 'entry'
5475  *	is not NULL it points to the map entry containing the start
5476  *	of the region.
5477  *
5478  *	The map is locked for reading on entry and is left locked.
5479  */
5480 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5481 vm_map_range_check(
5482 	vm_map_t                map,
5483 	vm_map_offset_t         start,
5484 	vm_map_offset_t         end,
5485 	vm_map_entry_t          *entry)
5486 {
5487 	vm_map_entry_t          cur;
5488 	vm_map_offset_t         prev;
5489 
5490 	/*
5491 	 *      Basic sanity checks first
5492 	 */
5493 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5494 		return FALSE;
5495 	}
5496 
5497 	/*
5498 	 *      Check first if the region starts within a valid
5499 	 *	mapping for the map.
5500 	 */
5501 	if (!vm_map_lookup_entry(map, start, &cur)) {
5502 		return FALSE;
5503 	}
5504 
5505 	/*
5506 	 *	Optimize for the case that the region is contained
5507 	 *	in a single map entry.
5508 	 */
5509 	if (entry != (vm_map_entry_t *) NULL) {
5510 		*entry = cur;
5511 	}
5512 	if (end <= cur->vme_end) {
5513 		return TRUE;
5514 	}
5515 
5516 	/*
5517 	 *      If the region is not wholly contained within a
5518 	 *      single entry, walk the entries looking for holes.
5519 	 */
5520 	prev = cur->vme_end;
5521 	cur = cur->vme_next;
5522 	while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5523 		if (end <= cur->vme_end) {
5524 			return TRUE;
5525 		}
5526 		prev = cur->vme_end;
5527 		cur = cur->vme_next;
5528 	}
5529 	return FALSE;
5530 }
5531 
5532 /*
5533  *	vm_map_protect:
5534  *
5535  *	Sets the protection of the specified address
5536  *	region in the target map.  If "set_max" is
5537  *	specified, the maximum protection is to be set;
5538  *	otherwise, only the current protection is affected.
5539  */
5540 kern_return_t
vm_map_protect(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t new_prot,boolean_t set_max)5541 vm_map_protect(
5542 	vm_map_t        map,
5543 	vm_map_offset_t start,
5544 	vm_map_offset_t end,
5545 	vm_prot_t       new_prot,
5546 	boolean_t       set_max)
5547 {
5548 	vm_map_entry_t                  current;
5549 	vm_map_offset_t                 prev;
5550 	vm_map_entry_t                  entry;
5551 	vm_prot_t                       new_max;
5552 	int                             pmap_options = 0;
5553 	kern_return_t                   kr;
5554 
5555 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
5556 		return KERN_INVALID_ARGUMENT;
5557 	}
5558 
5559 	if (new_prot & VM_PROT_COPY) {
5560 		vm_map_offset_t         new_start;
5561 		vm_prot_t               cur_prot, max_prot;
5562 		vm_map_kernel_flags_t   kflags;
5563 
5564 		/* LP64todo - see below */
5565 		if (start >= map->max_offset) {
5566 			return KERN_INVALID_ADDRESS;
5567 		}
5568 
5569 		if ((new_prot & VM_PROT_ALLEXEC) &&
5570 		    map->pmap != kernel_pmap &&
5571 		    (vm_map_cs_enforcement(map)
5572 #if XNU_TARGET_OS_OSX && __arm64__
5573 		    || !VM_MAP_IS_EXOTIC(map)
5574 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5575 		    ) &&
5576 		    VM_MAP_POLICY_WX_FAIL(map)) {
5577 			DTRACE_VM3(cs_wx,
5578 			    uint64_t, (uint64_t) start,
5579 			    uint64_t, (uint64_t) end,
5580 			    vm_prot_t, new_prot);
5581 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5582 			    proc_selfpid(),
5583 			    (get_bsdtask_info(current_task())
5584 			    ? proc_name_address(get_bsdtask_info(current_task()))
5585 			    : "?"),
5586 			    __FUNCTION__, __LINE__,
5587 #if DEVELOPMENT || DEBUG
5588 			    (uint64_t)start,
5589 			    (uint64_t)end,
5590 #else /* DEVELOPMENT || DEBUG */
5591 			    (uint64_t)0,
5592 			    (uint64_t)0,
5593 #endif /* DEVELOPMENT || DEBUG */
5594 			    new_prot);
5595 			return KERN_PROTECTION_FAILURE;
5596 		}
5597 
5598 		/*
5599 		 * Let vm_map_remap_extract() know that it will need to:
5600 		 * + make a copy of the mapping
5601 		 * + add VM_PROT_WRITE to the max protections
5602 		 * + remove any protections that are no longer allowed from the
5603 		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
5604 		 *   example).
5605 		 * Note that "max_prot" is an IN/OUT parameter only for this
5606 		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
5607 		 * only.
5608 		 */
5609 		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5610 		cur_prot = VM_PROT_NONE;
5611 		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5612 		kflags.vmkf_remap_prot_copy = true;
5613 		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5614 		new_start = start;
5615 		kr = vm_map_remap(map,
5616 		    vm_sanitize_wrap_addr_ref(&new_start),
5617 		    end - start,
5618 		    0, /* mask */
5619 		    kflags,
5620 		    map,
5621 		    start,
5622 		    TRUE, /* copy-on-write remapping! */
5623 		    vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5624 		    vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5625 		    VM_INHERIT_DEFAULT);
5626 		if (kr != KERN_SUCCESS) {
5627 			return kr;
5628 		}
5629 		new_prot &= ~VM_PROT_COPY;
5630 	}
5631 
5632 	vm_map_lock(map);
5633 
5634 	/* LP64todo - remove this check when vm_map_commpage64()
5635 	 * no longer has to stuff in a map_entry for the commpage
5636 	 * above the map's max_offset.
5637 	 */
5638 	if (start >= map->max_offset) {
5639 		vm_map_unlock(map);
5640 		return KERN_INVALID_ADDRESS;
5641 	}
5642 
5643 	while (1) {
5644 		/*
5645 		 *      Lookup the entry.  If it doesn't start in a valid
5646 		 *	entry, return an error.
5647 		 */
5648 		if (!vm_map_lookup_entry(map, start, &entry)) {
5649 			vm_map_unlock(map);
5650 			return KERN_INVALID_ADDRESS;
5651 		}
5652 
5653 		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5654 			start = SUPERPAGE_ROUND_DOWN(start);
5655 			continue;
5656 		}
5657 		break;
5658 	}
5659 	if (entry->superpage_size) {
5660 		end = SUPERPAGE_ROUND_UP(end);
5661 	}
5662 
5663 	/*
5664 	 *	Make a first pass to check for protection and address
5665 	 *	violations.
5666 	 */
5667 
5668 	current = entry;
5669 	prev = current->vme_start;
5670 	while ((current != vm_map_to_entry(map)) &&
5671 	    (current->vme_start < end)) {
5672 		/*
5673 		 * If there is a hole, return an error.
5674 		 */
5675 		if (current->vme_start != prev) {
5676 			vm_map_unlock(map);
5677 			return KERN_INVALID_ADDRESS;
5678 		}
5679 
5680 		new_max = current->max_protection;
5681 
5682 #if defined(__x86_64__)
5683 		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5684 		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5685 			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5686 		}
5687 #elif CODE_SIGNING_MONITOR
5688 		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5689 			new_max |= VM_PROT_EXECUTE;
5690 		}
5691 #endif
5692 		if ((new_prot & new_max) != new_prot) {
5693 			vm_map_unlock(map);
5694 			return KERN_PROTECTION_FAILURE;
5695 		}
5696 
5697 		if (current->used_for_jit &&
5698 		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5699 			vm_map_unlock(map);
5700 			return KERN_PROTECTION_FAILURE;
5701 		}
5702 
5703 #if __arm64e__
5704 		/* Disallow remapping hw assisted TPRO mappings */
5705 		if (current->used_for_tpro) {
5706 			vm_map_unlock(map);
5707 			return KERN_PROTECTION_FAILURE;
5708 		}
5709 #endif /* __arm64e__ */
5710 
5711 
5712 		if ((new_prot & VM_PROT_WRITE) &&
5713 		    (new_prot & VM_PROT_ALLEXEC) &&
5714 #if XNU_TARGET_OS_OSX
5715 		    map->pmap != kernel_pmap &&
5716 		    (vm_map_cs_enforcement(map)
5717 #if __arm64__
5718 		    || !VM_MAP_IS_EXOTIC(map)
5719 #endif /* __arm64__ */
5720 		    ) &&
5721 #endif /* XNU_TARGET_OS_OSX */
5722 #if CODE_SIGNING_MONITOR
5723 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5724 #endif
5725 		    !(current->used_for_jit)) {
5726 			DTRACE_VM3(cs_wx,
5727 			    uint64_t, (uint64_t) current->vme_start,
5728 			    uint64_t, (uint64_t) current->vme_end,
5729 			    vm_prot_t, new_prot);
5730 			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5731 			    proc_selfpid(),
5732 			    (get_bsdtask_info(current_task())
5733 			    ? proc_name_address(get_bsdtask_info(current_task()))
5734 			    : "?"),
5735 			    __FUNCTION__, __LINE__,
5736 #if DEVELOPMENT || DEBUG
5737 			    (uint64_t)current->vme_start,
5738 			    (uint64_t)current->vme_end,
5739 #else /* DEVELOPMENT || DEBUG */
5740 			    (uint64_t)0,
5741 			    (uint64_t)0,
5742 #endif /* DEVELOPMENT || DEBUG */
5743 			    new_prot);
5744 			new_prot &= ~VM_PROT_ALLEXEC;
5745 			if (VM_MAP_POLICY_WX_FAIL(map)) {
5746 				vm_map_unlock(map);
5747 				return KERN_PROTECTION_FAILURE;
5748 			}
5749 		}
5750 
5751 		/*
5752 		 * If the task has requested executable lockdown,
5753 		 * deny both:
5754 		 * - adding executable protections OR
5755 		 * - adding write protections to an existing executable mapping.
5756 		 */
5757 		if (map->map_disallow_new_exec == TRUE) {
5758 			if ((new_prot & VM_PROT_ALLEXEC) ||
5759 			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5760 				vm_map_unlock(map);
5761 				return KERN_PROTECTION_FAILURE;
5762 			}
5763 		}
5764 
5765 		prev = current->vme_end;
5766 		current = current->vme_next;
5767 	}
5768 
5769 #if __arm64__
5770 	if (end > prev &&
5771 	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5772 		vm_map_entry_t prev_entry;
5773 
5774 		prev_entry = current->vme_prev;
5775 		if (prev_entry != vm_map_to_entry(map) &&
5776 		    !prev_entry->map_aligned &&
5777 		    (vm_map_round_page(prev_entry->vme_end,
5778 		    VM_MAP_PAGE_MASK(map))
5779 		    == end)) {
5780 			/*
5781 			 * The last entry in our range is not "map-aligned"
5782 			 * but it would have reached all the way to "end"
5783 			 * if it had been map-aligned, so this is not really
5784 			 * a hole in the range and we can proceed.
5785 			 */
5786 			prev = end;
5787 		}
5788 	}
5789 #endif /* __arm64__ */
5790 
5791 	if (end > prev) {
5792 		vm_map_unlock(map);
5793 		return KERN_INVALID_ADDRESS;
5794 	}
5795 
5796 	/*
5797 	 *	Go back and fix up protections.
5798 	 *	Clip to start here if the range starts within
5799 	 *	the entry.
5800 	 */
5801 
5802 	current = entry;
5803 	if (current != vm_map_to_entry(map)) {
5804 		/* clip and unnest if necessary */
5805 		vm_map_clip_start(map, current, start);
5806 	}
5807 
5808 	while ((current != vm_map_to_entry(map)) &&
5809 	    (current->vme_start < end)) {
5810 		vm_prot_t       old_prot;
5811 
5812 		vm_map_clip_end(map, current, end);
5813 
5814 #if DEVELOPMENT || DEBUG
5815 		if (current->csm_associated && vm_log_xnu_user_debug) {
5816 			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
5817 			    proc_selfpid(),
5818 			    (get_bsdtask_info(current_task())
5819 			    ? proc_name_address(get_bsdtask_info(current_task()))
5820 			    : "?"),
5821 			    __FUNCTION__,
5822 			    (uint64_t)start,
5823 			    (uint64_t)end,
5824 			    new_prot,
5825 			    map, current,
5826 			    current->vme_start,
5827 			    current->vme_end,
5828 			    current->protection,
5829 			    current->max_protection);
5830 		}
5831 #endif /* DEVELOPMENT || DEBUG */
5832 
5833 		if (current->is_sub_map) {
5834 			/* clipping did unnest if needed */
5835 			assert(!current->use_pmap);
5836 		}
5837 
5838 		old_prot = current->protection;
5839 
5840 		if (set_max) {
5841 			current->max_protection = new_prot;
5842 			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
5843 			current->protection = (new_prot & old_prot);
5844 		} else {
5845 			current->protection = new_prot;
5846 		}
5847 
5848 #if CODE_SIGNING_MONITOR
5849 		if (!current->vme_xnu_user_debug &&
5850 		    /* a !csm_associated mapping becoming executable */
5851 		    ((!current->csm_associated &&
5852 		    !(old_prot & VM_PROT_EXECUTE) &&
5853 		    (current->protection & VM_PROT_EXECUTE))
5854 		    ||
5855 		    /* a csm_associated mapping becoming writable */
5856 		    (current->csm_associated &&
5857 		    !(old_prot & VM_PROT_WRITE) &&
5858 		    (current->protection & VM_PROT_WRITE)))) {
5859 			/*
5860 			 * This mapping has not already been marked as
5861 			 * "user_debug" and it is either:
5862 			 * 1. not code-signing-monitored and becoming executable
5863 			 * 2. code-signing-monitored and becoming writable,
5864 			 * so inform the CodeSigningMonitor and mark the
5865 			 * mapping as "user_debug" if appropriate.
5866 			 */
5867 			vm_map_kernel_flags_t vmk_flags;
5868 			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
5869 			/* pretend it's a vm_protect(VM_PROT_COPY)... */
5870 			vmk_flags.vmkf_remap_prot_copy = true;
5871 			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
5872 #if DEVELOPMENT || DEBUG
5873 			if (vm_log_xnu_user_debug) {
5874 				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
5875 				    proc_selfpid(),
5876 				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
5877 				    __FUNCTION__, __LINE__,
5878 				    map, current,
5879 				    current->vme_start, current->vme_end,
5880 				    old_prot, current->protection,
5881 				    kr, current->vme_xnu_user_debug);
5882 			}
5883 #endif /* DEVELOPMENT || DEBUG */
5884 		}
5885 #endif /* CODE_SIGNING_MONITOR */
5886 
5887 		/*
5888 		 *	Update physical map if necessary.
5889 		 *	If the request is to turn off write protection,
5890 		 *	we won't do it for real (in pmap). This is because
5891 		 *	it would cause copy-on-write to fail.  We've already
5892 		 *	set, the new protection in the map, so if a
5893 		 *	write-protect fault occurred, it will be fixed up
5894 		 *	properly, COW or not.
5895 		 */
5896 		if (current->protection != old_prot) {
5897 			/* Look one level in we support nested pmaps */
5898 			/* from mapped submaps which are direct entries */
5899 			/* in our map */
5900 
5901 			vm_prot_t prot;
5902 
5903 			prot = current->protection;
5904 			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
5905 				prot &= ~VM_PROT_WRITE;
5906 			} else {
5907 				assert(!VME_OBJECT(current)->code_signed);
5908 				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
5909 				if (prot & VM_PROT_WRITE) {
5910 					/*
5911 					 * For write requests on the
5912 					 * compressor, we wil ask the
5913 					 * pmap layer to prevent us from
5914 					 * taking a write fault when we
5915 					 * attempt to access the mapping
5916 					 * next.
5917 					 */
5918 					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
5919 				}
5920 			}
5921 
5922 			if (override_nx(map, VME_ALIAS(current)) && prot) {
5923 				prot |= VM_PROT_EXECUTE;
5924 			}
5925 
5926 #if DEVELOPMENT || DEBUG
5927 			if (!(old_prot & VM_PROT_EXECUTE) &&
5928 			    (prot & VM_PROT_EXECUTE) &&
5929 			    panic_on_unsigned_execute &&
5930 			    (proc_selfcsflags() & CS_KILL)) {
5931 				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
5932 			}
5933 #endif /* DEVELOPMENT || DEBUG */
5934 
5935 			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
5936 				if (current->wired_count) {
5937 					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
5938 					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
5939 				}
5940 
5941 				/* If the pmap layer cares about this
5942 				 * protection type, force a fault for
5943 				 * each page so that vm_fault will
5944 				 * repopulate the page with the full
5945 				 * set of protections.
5946 				 */
5947 				/*
5948 				 * TODO: We don't seem to need this,
5949 				 * but this is due to an internal
5950 				 * implementation detail of
5951 				 * pmap_protect.  Do we want to rely
5952 				 * on this?
5953 				 */
5954 				prot = VM_PROT_NONE;
5955 			}
5956 
5957 			if (current->is_sub_map && current->use_pmap) {
5958 				pmap_protect(VME_SUBMAP(current)->pmap,
5959 				    current->vme_start,
5960 				    current->vme_end,
5961 				    prot);
5962 			} else {
5963 				pmap_protect_options(map->pmap,
5964 				    current->vme_start,
5965 				    current->vme_end,
5966 				    prot,
5967 				    pmap_options,
5968 				    NULL);
5969 			}
5970 		}
5971 		current = current->vme_next;
5972 	}
5973 
5974 	current = entry;
5975 	while ((current != vm_map_to_entry(map)) &&
5976 	    (current->vme_start <= end)) {
5977 		vm_map_simplify_entry(map, current);
5978 		current = current->vme_next;
5979 	}
5980 
5981 	vm_map_unlock(map);
5982 	return KERN_SUCCESS;
5983 }
5984 
5985 /*
5986  *	vm_map_inherit:
5987  *
5988  *	Sets the inheritance of the specified address
5989  *	range in the target map.  Inheritance
5990  *	affects how the map will be shared with
5991  *	child maps at the time of vm_map_fork.
5992  */
5993 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)5994 vm_map_inherit(
5995 	vm_map_t        map,
5996 	vm_map_offset_t start,
5997 	vm_map_offset_t end,
5998 	vm_inherit_t    new_inheritance)
5999 {
6000 	vm_map_entry_t  entry;
6001 	vm_map_entry_t  temp_entry;
6002 
6003 	vm_map_lock(map);
6004 
6005 	VM_MAP_RANGE_CHECK(map, start, end);
6006 
6007 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6008 		vm_map_unlock(map);
6009 		return KERN_INVALID_ADDRESS;
6010 	}
6011 
6012 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6013 		entry = temp_entry;
6014 	} else {
6015 		temp_entry = temp_entry->vme_next;
6016 		entry = temp_entry;
6017 	}
6018 
6019 	/* first check entire range for submaps which can't support the */
6020 	/* given inheritance. */
6021 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6022 		if (entry->is_sub_map) {
6023 			if (new_inheritance == VM_INHERIT_COPY) {
6024 				vm_map_unlock(map);
6025 				return KERN_INVALID_ARGUMENT;
6026 			}
6027 		}
6028 
6029 		entry = entry->vme_next;
6030 	}
6031 
6032 	entry = temp_entry;
6033 	if (entry != vm_map_to_entry(map)) {
6034 		/* clip and unnest if necessary */
6035 		vm_map_clip_start(map, entry, start);
6036 	}
6037 
6038 	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6039 		vm_map_clip_end(map, entry, end);
6040 		if (entry->is_sub_map) {
6041 			/* clip did unnest if needed */
6042 			assert(!entry->use_pmap);
6043 		}
6044 
6045 		entry->inheritance = new_inheritance;
6046 
6047 		entry = entry->vme_next;
6048 	}
6049 
6050 	vm_map_unlock(map);
6051 	return KERN_SUCCESS;
6052 }
6053 
6054 /*
6055  * Update the accounting for the amount of wired memory in this map.  If the user has
6056  * exceeded the defined limits, then we fail.  Wiring on behalf of the kernel never fails.
6057  */
6058 
6059 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6060 add_wire_counts(
6061 	vm_map_t        map,
6062 	vm_map_entry_t  entry,
6063 	boolean_t       user_wire)
6064 {
6065 	vm_map_size_t   size;
6066 
6067 	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6068 
6069 	if (user_wire) {
6070 		unsigned int total_wire_count =  vm_page_wire_count + vm_lopage_free_count;
6071 
6072 		/*
6073 		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
6074 		 * this map entry.
6075 		 */
6076 
6077 		if (entry->user_wired_count == 0) {
6078 			size = entry->vme_end - entry->vme_start;
6079 
6080 			/*
6081 			 * Since this is the first time the user is wiring this map entry, check to see if we're
6082 			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
6083 			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
6084 			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
6085 			 * limit, then we fail.
6086 			 */
6087 
6088 			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6089 			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6090 				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6091 #if DEVELOPMENT || DEBUG
6092 					if (panic_on_mlock_failure) {
6093 						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6094 					}
6095 #endif /* DEVELOPMENT || DEBUG */
6096 					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6097 				} else {
6098 					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6099 #if DEVELOPMENT || DEBUG
6100 					if (panic_on_mlock_failure) {
6101 						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6102 					}
6103 #endif /* DEVELOPMENT || DEBUG */
6104 				}
6105 				return KERN_RESOURCE_SHORTAGE;
6106 			}
6107 
6108 			/*
6109 			 * The first time the user wires an entry, we also increment the wired_count and add this to
6110 			 * the total that has been wired in the map.
6111 			 */
6112 
6113 			if (entry->wired_count >= MAX_WIRE_COUNT) {
6114 				return KERN_FAILURE;
6115 			}
6116 
6117 			entry->wired_count++;
6118 			map->user_wire_size += size;
6119 		}
6120 
6121 		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6122 			return KERN_FAILURE;
6123 		}
6124 
6125 		entry->user_wired_count++;
6126 	} else {
6127 		/*
6128 		 * The kernel's wiring the memory.  Just bump the count and continue.
6129 		 */
6130 
6131 		if (entry->wired_count >= MAX_WIRE_COUNT) {
6132 			panic("vm_map_wire: too many wirings");
6133 		}
6134 
6135 		entry->wired_count++;
6136 	}
6137 
6138 	if (first_wire) {
6139 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6140 	}
6141 
6142 	return KERN_SUCCESS;
6143 }
6144 
6145 /*
6146  * Update the memory wiring accounting now that the given map entry is being unwired.
6147  */
6148 
6149 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6150 subtract_wire_counts(
6151 	vm_map_t        map,
6152 	vm_map_entry_t  entry,
6153 	boolean_t       user_wire)
6154 {
6155 	if (user_wire) {
6156 		/*
6157 		 * We're unwiring memory at the request of the user.  See if we're removing the last user wire reference.
6158 		 */
6159 
6160 		if (entry->user_wired_count == 1) {
6161 			/*
6162 			 * We're removing the last user wire reference.  Decrement the wired_count and the total
6163 			 * user wired memory for this map.
6164 			 */
6165 
6166 			assert(entry->wired_count >= 1);
6167 			entry->wired_count--;
6168 			map->user_wire_size -= entry->vme_end - entry->vme_start;
6169 		}
6170 
6171 		assert(entry->user_wired_count >= 1);
6172 		entry->user_wired_count--;
6173 	} else {
6174 		/*
6175 		 * The kernel is unwiring the memory.   Just update the count.
6176 		 */
6177 
6178 		assert(entry->wired_count >= 1);
6179 		entry->wired_count--;
6180 	}
6181 
6182 	vme_btref_consider_and_put(entry);
6183 }
6184 
6185 int cs_executable_wire = 0;
6186 
6187 static kern_return_t
vm_map_wire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr,ppnum_t * physpage_p)6188 vm_map_wire_nested(
6189 	vm_map_t                map,
6190 	vm_map_offset_t         start,
6191 	vm_map_offset_t         end,
6192 	vm_prot_t               caller_prot,
6193 	vm_tag_t                tag,
6194 	boolean_t               user_wire,
6195 	pmap_t                  map_pmap,
6196 	vm_map_offset_t         pmap_addr,
6197 	ppnum_t                *physpage_p)
6198 {
6199 	vm_map_entry_t          entry;
6200 	vm_prot_t               access_type;
6201 	struct vm_map_entry     *first_entry, tmp_entry;
6202 	vm_map_t                real_map;
6203 	vm_map_offset_t         s, e;
6204 	kern_return_t           rc;
6205 	boolean_t               need_wakeup;
6206 	boolean_t               main_map = FALSE;
6207 	wait_interrupt_t        interruptible_state;
6208 	thread_t                cur_thread;
6209 	unsigned int            last_timestamp;
6210 	vm_map_size_t           size;
6211 	boolean_t               wire_and_extract;
6212 	vm_prot_t               extra_prots;
6213 
6214 	extra_prots = VM_PROT_COPY;
6215 	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6216 #if XNU_TARGET_OS_OSX
6217 	if (map->pmap == kernel_pmap ||
6218 	    !vm_map_cs_enforcement(map)) {
6219 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6220 	}
6221 #endif /* XNU_TARGET_OS_OSX */
6222 #if CODE_SIGNING_MONITOR
6223 	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6224 		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6225 	}
6226 #endif /* CODE_SIGNING_MONITOR */
6227 
6228 	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6229 
6230 	wire_and_extract = FALSE;
6231 	if (physpage_p != NULL) {
6232 		/*
6233 		 * The caller wants the physical page number of the
6234 		 * wired page.  We return only one physical page number
6235 		 * so this works for only one page at a time.
6236 		 *
6237 		 * The only caller (vm_map_wire_and_extract)
6238 		 * guarantees it.
6239 		 */
6240 		assert(end - start == VM_MAP_PAGE_SIZE(map));
6241 		wire_and_extract = TRUE;
6242 		*physpage_p = 0;
6243 	}
6244 
6245 	VM_MAP_RANGE_CHECK(map, start, end);
6246 	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6247 	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6248 	if (start == end) {
6249 		/* We wired what the caller asked for, zero pages */
6250 		return KERN_SUCCESS;
6251 	}
6252 
6253 	vm_map_lock(map);
6254 	if (map_pmap == NULL) {
6255 		main_map = TRUE;
6256 	}
6257 	last_timestamp = map->timestamp;
6258 
6259 	need_wakeup = FALSE;
6260 	cur_thread = current_thread();
6261 
6262 	s = start;
6263 	rc = KERN_SUCCESS;
6264 
6265 	if (vm_map_lookup_entry(map, s, &first_entry)) {
6266 		entry = first_entry;
6267 		/*
6268 		 * vm_map_clip_start will be done later.
6269 		 * We don't want to unnest any nested submaps here !
6270 		 */
6271 	} else {
6272 		/* Start address is not in map */
6273 		rc = KERN_INVALID_ADDRESS;
6274 		goto done;
6275 	}
6276 
6277 	while ((entry != vm_map_to_entry(map)) && (s < end)) {
6278 		/*
6279 		 * At this point, we have wired from "start" to "s".
6280 		 * We still need to wire from "s" to "end".
6281 		 *
6282 		 * "entry" hasn't been clipped, so it could start before "s"
6283 		 * and/or end after "end".
6284 		 */
6285 
6286 		/* "e" is how far we want to wire in this entry */
6287 		e = entry->vme_end;
6288 		if (e > end) {
6289 			e = end;
6290 		}
6291 
6292 		/*
6293 		 * If another thread is wiring/unwiring this entry then
6294 		 * block after informing other thread to wake us up.
6295 		 */
6296 		if (entry->in_transition) {
6297 			wait_result_t wait_result;
6298 
6299 			/*
6300 			 * We have not clipped the entry.  Make sure that
6301 			 * the start address is in range so that the lookup
6302 			 * below will succeed.
6303 			 * "s" is the current starting point: we've already
6304 			 * wired from "start" to "s" and we still have
6305 			 * to wire from "s" to "end".
6306 			 */
6307 
6308 			entry->needs_wakeup = TRUE;
6309 
6310 			/*
6311 			 * wake up anybody waiting on entries that we have
6312 			 * already wired.
6313 			 */
6314 			if (need_wakeup) {
6315 				vm_map_entry_wakeup(map);
6316 				need_wakeup = FALSE;
6317 			}
6318 			/*
6319 			 * User wiring is interruptible
6320 			 */
6321 			wait_result = vm_map_entry_wait(map,
6322 			    (user_wire) ? THREAD_ABORTSAFE :
6323 			    THREAD_UNINT);
6324 			if (user_wire && wait_result == THREAD_INTERRUPTED) {
6325 				/*
6326 				 * undo the wirings we have done so far
6327 				 * We do not clear the needs_wakeup flag,
6328 				 * because we cannot tell if we were the
6329 				 * only one waiting.
6330 				 */
6331 				rc = KERN_FAILURE;
6332 				goto done;
6333 			}
6334 
6335 			/*
6336 			 * Cannot avoid a lookup here. reset timestamp.
6337 			 */
6338 			last_timestamp = map->timestamp;
6339 
6340 			/*
6341 			 * The entry could have been clipped, look it up again.
6342 			 * Worse that can happen is, it may not exist anymore.
6343 			 */
6344 			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6345 				/*
6346 				 * User: undo everything upto the previous
6347 				 * entry.  let vm_map_unwire worry about
6348 				 * checking the validity of the range.
6349 				 */
6350 				rc = KERN_FAILURE;
6351 				goto done;
6352 			}
6353 			entry = first_entry;
6354 			continue;
6355 		}
6356 
6357 		if (entry->is_sub_map) {
6358 			vm_map_offset_t sub_start;
6359 			vm_map_offset_t sub_end;
6360 			vm_map_offset_t local_start;
6361 			vm_map_offset_t local_end;
6362 			pmap_t          pmap;
6363 
6364 			if (wire_and_extract) {
6365 				/*
6366 				 * Wiring would result in copy-on-write
6367 				 * which would not be compatible with
6368 				 * the sharing we have with the original
6369 				 * provider of this memory.
6370 				 */
6371 				rc = KERN_INVALID_ARGUMENT;
6372 				goto done;
6373 			}
6374 
6375 			vm_map_clip_start(map, entry, s);
6376 			vm_map_clip_end(map, entry, end);
6377 
6378 			sub_start = VME_OFFSET(entry);
6379 			sub_end = entry->vme_end;
6380 			sub_end += VME_OFFSET(entry) - entry->vme_start;
6381 
6382 			local_end = entry->vme_end;
6383 			if (map_pmap == NULL) {
6384 				vm_object_t             object;
6385 				vm_object_offset_t      offset;
6386 				vm_prot_t               prot;
6387 				boolean_t               wired;
6388 				vm_map_entry_t          local_entry;
6389 				vm_map_version_t         version;
6390 				vm_map_t                lookup_map;
6391 
6392 				if (entry->use_pmap) {
6393 					pmap = VME_SUBMAP(entry)->pmap;
6394 					/* ppc implementation requires that */
6395 					/* submaps pmap address ranges line */
6396 					/* up with parent map */
6397 #ifdef notdef
6398 					pmap_addr = sub_start;
6399 #endif
6400 					pmap_addr = s;
6401 				} else {
6402 					pmap = map->pmap;
6403 					pmap_addr = s;
6404 				}
6405 
6406 				if (entry->wired_count) {
6407 					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6408 						goto done;
6409 					}
6410 
6411 					/*
6412 					 * The map was not unlocked:
6413 					 * no need to goto re-lookup.
6414 					 * Just go directly to next entry.
6415 					 */
6416 					entry = entry->vme_next;
6417 					s = entry->vme_start;
6418 					continue;
6419 				}
6420 
6421 				/* call vm_map_lookup_and_lock_object to */
6422 				/* cause any needs copy to be   */
6423 				/* evaluated */
6424 				local_start = entry->vme_start;
6425 				lookup_map = map;
6426 				vm_map_lock_write_to_read(map);
6427 				rc = vm_map_lookup_and_lock_object(
6428 					&lookup_map, local_start,
6429 					(access_type | extra_prots),
6430 					OBJECT_LOCK_EXCLUSIVE,
6431 					&version, &object,
6432 					&offset, &prot, &wired,
6433 					NULL,
6434 					&real_map, NULL);
6435 				if (rc != KERN_SUCCESS) {
6436 					vm_map_unlock_read(lookup_map);
6437 					assert(map_pmap == NULL);
6438 					vm_map_unwire_nested(map, start,
6439 					    s, user_wire, PMAP_NULL, 0);
6440 					return rc;
6441 				}
6442 				vm_object_unlock(object);
6443 				if (real_map != lookup_map) {
6444 					vm_map_unlock(real_map);
6445 				}
6446 				vm_map_unlock_read(lookup_map);
6447 				vm_map_lock(map);
6448 
6449 				/* we unlocked, so must re-lookup */
6450 				if (!vm_map_lookup_entry(map,
6451 				    local_start,
6452 				    &local_entry)) {
6453 					rc = KERN_FAILURE;
6454 					goto done;
6455 				}
6456 
6457 				/*
6458 				 * entry could have been "simplified",
6459 				 * so re-clip
6460 				 */
6461 				entry = local_entry;
6462 				assert(s == local_start);
6463 				vm_map_clip_start(map, entry, s);
6464 				vm_map_clip_end(map, entry, end);
6465 				/* re-compute "e" */
6466 				e = entry->vme_end;
6467 				if (e > end) {
6468 					e = end;
6469 				}
6470 
6471 				/* did we have a change of type? */
6472 				if (!entry->is_sub_map) {
6473 					last_timestamp = map->timestamp;
6474 					continue;
6475 				}
6476 			} else {
6477 				local_start = entry->vme_start;
6478 				pmap = map_pmap;
6479 			}
6480 
6481 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6482 				goto done;
6483 			}
6484 
6485 			entry->in_transition = TRUE;
6486 
6487 			vm_map_unlock(map);
6488 			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6489 			    sub_start, sub_end,
6490 			    caller_prot, tag,
6491 			    user_wire, pmap, pmap_addr,
6492 			    NULL);
6493 			vm_map_lock(map);
6494 
6495 			/*
6496 			 * Find the entry again.  It could have been clipped
6497 			 * after we unlocked the map.
6498 			 */
6499 			if (!vm_map_lookup_entry(map, local_start,
6500 			    &first_entry)) {
6501 				panic("vm_map_wire: re-lookup failed");
6502 			}
6503 			entry = first_entry;
6504 
6505 			assert(local_start == s);
6506 			/* re-compute "e" */
6507 			e = entry->vme_end;
6508 			if (e > end) {
6509 				e = end;
6510 			}
6511 
6512 			last_timestamp = map->timestamp;
6513 			while ((entry != vm_map_to_entry(map)) &&
6514 			    (entry->vme_start < e)) {
6515 				assert(entry->in_transition);
6516 				entry->in_transition = FALSE;
6517 				if (entry->needs_wakeup) {
6518 					entry->needs_wakeup = FALSE;
6519 					need_wakeup = TRUE;
6520 				}
6521 				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6522 					subtract_wire_counts(map, entry, user_wire);
6523 				}
6524 				entry = entry->vme_next;
6525 			}
6526 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6527 				goto done;
6528 			}
6529 
6530 			/* no need to relookup again */
6531 			s = entry->vme_start;
6532 			continue;
6533 		}
6534 
6535 		/*
6536 		 * If this entry is already wired then increment
6537 		 * the appropriate wire reference count.
6538 		 */
6539 		if (entry->wired_count) {
6540 			if ((entry->protection & access_type) != access_type) {
6541 				/* found a protection problem */
6542 
6543 				/*
6544 				 * XXX FBDP
6545 				 * We should always return an error
6546 				 * in this case but since we didn't
6547 				 * enforce it before, let's do
6548 				 * it only for the new "wire_and_extract"
6549 				 * code path for now...
6550 				 */
6551 				if (wire_and_extract) {
6552 					rc = KERN_PROTECTION_FAILURE;
6553 					goto done;
6554 				}
6555 			}
6556 
6557 			/*
6558 			 * entry is already wired down, get our reference
6559 			 * after clipping to our range.
6560 			 */
6561 			vm_map_clip_start(map, entry, s);
6562 			vm_map_clip_end(map, entry, end);
6563 
6564 			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6565 				goto done;
6566 			}
6567 
6568 			if (wire_and_extract) {
6569 				vm_object_t             object;
6570 				vm_object_offset_t      offset;
6571 				vm_page_t               m;
6572 
6573 				/*
6574 				 * We don't have to "wire" the page again
6575 				 * bit we still have to "extract" its
6576 				 * physical page number, after some sanity
6577 				 * checks.
6578 				 */
6579 				assert((entry->vme_end - entry->vme_start)
6580 				    == PAGE_SIZE);
6581 				assert(!entry->needs_copy);
6582 				assert(!entry->is_sub_map);
6583 				assert(VME_OBJECT(entry));
6584 				if (((entry->vme_end - entry->vme_start)
6585 				    != PAGE_SIZE) ||
6586 				    entry->needs_copy ||
6587 				    entry->is_sub_map ||
6588 				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
6589 					rc = KERN_INVALID_ARGUMENT;
6590 					goto done;
6591 				}
6592 
6593 				object = VME_OBJECT(entry);
6594 				offset = VME_OFFSET(entry);
6595 				/* need exclusive lock to update m->dirty */
6596 				if (entry->protection & VM_PROT_WRITE) {
6597 					vm_object_lock(object);
6598 				} else {
6599 					vm_object_lock_shared(object);
6600 				}
6601 				m = vm_page_lookup(object, offset);
6602 				assert(m != VM_PAGE_NULL);
6603 				assert(VM_PAGE_WIRED(m));
6604 				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6605 					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6606 					if (entry->protection & VM_PROT_WRITE) {
6607 						vm_object_lock_assert_exclusive(
6608 							object);
6609 						m->vmp_dirty = TRUE;
6610 					}
6611 				} else {
6612 					/* not already wired !? */
6613 					*physpage_p = 0;
6614 				}
6615 				vm_object_unlock(object);
6616 			}
6617 
6618 			/* map was not unlocked: no need to relookup */
6619 			entry = entry->vme_next;
6620 			s = entry->vme_start;
6621 			continue;
6622 		}
6623 
6624 		/*
6625 		 * Unwired entry or wire request transmitted via submap
6626 		 */
6627 
6628 		/*
6629 		 * Wiring would copy the pages to the shadow object.
6630 		 * The shadow object would not be code-signed so
6631 		 * attempting to execute code from these copied pages
6632 		 * would trigger a code-signing violation.
6633 		 */
6634 
6635 		if ((entry->protection & VM_PROT_EXECUTE)
6636 #if XNU_TARGET_OS_OSX
6637 		    &&
6638 		    map->pmap != kernel_pmap &&
6639 		    (vm_map_cs_enforcement(map)
6640 #if __arm64__
6641 		    || !VM_MAP_IS_EXOTIC(map)
6642 #endif /* __arm64__ */
6643 		    )
6644 #endif /* XNU_TARGET_OS_OSX */
6645 #if CODE_SIGNING_MONITOR
6646 		    &&
6647 		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6648 #endif
6649 		    ) {
6650 #if MACH_ASSERT
6651 			printf("pid %d[%s] wiring executable range from "
6652 			    "0x%llx to 0x%llx: rejected to preserve "
6653 			    "code-signing\n",
6654 			    proc_selfpid(),
6655 			    (get_bsdtask_info(current_task())
6656 			    ? proc_name_address(get_bsdtask_info(current_task()))
6657 			    : "?"),
6658 			    (uint64_t) entry->vme_start,
6659 			    (uint64_t) entry->vme_end);
6660 #endif /* MACH_ASSERT */
6661 			DTRACE_VM2(cs_executable_wire,
6662 			    uint64_t, (uint64_t)entry->vme_start,
6663 			    uint64_t, (uint64_t)entry->vme_end);
6664 			cs_executable_wire++;
6665 			rc = KERN_PROTECTION_FAILURE;
6666 			goto done;
6667 		}
6668 
6669 		/*
6670 		 * Perform actions of vm_map_lookup that need the write
6671 		 * lock on the map: create a shadow object for a
6672 		 * copy-on-write region, or an object for a zero-fill
6673 		 * region.
6674 		 */
6675 		size = entry->vme_end - entry->vme_start;
6676 		/*
6677 		 * If wiring a copy-on-write page, we need to copy it now
6678 		 * even if we're only (currently) requesting read access.
6679 		 * This is aggressive, but once it's wired we can't move it.
6680 		 */
6681 		if (entry->needs_copy) {
6682 			if (wire_and_extract) {
6683 				/*
6684 				 * We're supposed to share with the original
6685 				 * provider so should not be "needs_copy"
6686 				 */
6687 				rc = KERN_INVALID_ARGUMENT;
6688 				goto done;
6689 			}
6690 
6691 			VME_OBJECT_SHADOW(entry, size,
6692 			    vm_map_always_shadow(map));
6693 			entry->needs_copy = FALSE;
6694 		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6695 			if (wire_and_extract) {
6696 				/*
6697 				 * We're supposed to share with the original
6698 				 * provider so should already have an object.
6699 				 */
6700 				rc = KERN_INVALID_ARGUMENT;
6701 				goto done;
6702 			}
6703 			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6704 			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6705 			assert(entry->use_pmap);
6706 		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6707 			if (wire_and_extract) {
6708 				/*
6709 				 * We're supposed to share with the original
6710 				 * provider so should not be COPY_SYMMETRIC.
6711 				 */
6712 				rc = KERN_INVALID_ARGUMENT;
6713 				goto done;
6714 			}
6715 			/*
6716 			 * Force an unrequested "copy-on-write" but only for
6717 			 * the range we're wiring.
6718 			 */
6719 //			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6720 			vm_map_clip_start(map, entry, s);
6721 			vm_map_clip_end(map, entry, end);
6722 			/* recompute "size" */
6723 			size = entry->vme_end - entry->vme_start;
6724 			/* make a shadow object */
6725 			vm_object_t orig_object;
6726 			vm_object_offset_t orig_offset;
6727 			orig_object = VME_OBJECT(entry);
6728 			orig_offset = VME_OFFSET(entry);
6729 			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6730 			if (VME_OBJECT(entry) != orig_object) {
6731 				/*
6732 				 * This mapping has not been shared (or it would be
6733 				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6734 				 * not been copied-on-write (or it would be marked
6735 				 * as "needs_copy" and would have been handled above
6736 				 * and also already write-protected).
6737 				 * We still need to write-protect here to prevent
6738 				 * other threads from modifying these pages while
6739 				 * we're in the process of copying and wiring
6740 				 * the copied pages.
6741 				 * Since the mapping is neither shared nor COWed,
6742 				 * we only need to write-protect the PTEs for this
6743 				 * mapping.
6744 				 */
6745 				vm_object_pmap_protect(orig_object,
6746 				    orig_offset,
6747 				    size,
6748 				    map->pmap,
6749 				    VM_MAP_PAGE_SIZE(map),
6750 				    entry->vme_start,
6751 				    entry->protection & ~VM_PROT_WRITE);
6752 			}
6753 		}
6754 		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6755 			/*
6756 			 * Make the object COPY_DELAY to get a stable object
6757 			 * to wire.
6758 			 * That should avoid creating long shadow chains while
6759 			 * wiring/unwiring the same range repeatedly.
6760 			 * That also prevents part of the object from being
6761 			 * wired while another part is "needs_copy", which
6762 			 * could result in conflicting rules wrt copy-on-write.
6763 			 */
6764 			vm_object_t object;
6765 
6766 			object = VME_OBJECT(entry);
6767 			vm_object_lock(object);
6768 			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6769 				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
6770 				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
6771 				    object, (uint64_t)object->vo_size,
6772 				    entry,
6773 				    (uint64_t)entry->vme_start,
6774 				    (uint64_t)entry->vme_end,
6775 				    (uint64_t)VME_OFFSET(entry),
6776 				    (uint64_t)size);
6777 				assertf(object->ref_count == 1,
6778 				    "object %p ref_count %d\n",
6779 				    object, object->ref_count);
6780 				assertf(!entry->needs_copy,
6781 				    "entry %p\n", entry);
6782 				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6783 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
6784 			}
6785 			vm_object_unlock(object);
6786 		}
6787 
6788 		vm_map_clip_start(map, entry, s);
6789 		vm_map_clip_end(map, entry, end);
6790 
6791 		/* re-compute "e" */
6792 		e = entry->vme_end;
6793 		if (e > end) {
6794 			e = end;
6795 		}
6796 
6797 		/*
6798 		 * Check for holes and protection mismatch.
6799 		 * Holes: Next entry should be contiguous unless this
6800 		 *	  is the end of the region.
6801 		 * Protection: Access requested must be allowed, unless
6802 		 *	wiring is by protection class
6803 		 */
6804 		if ((entry->vme_end < end) &&
6805 		    ((entry->vme_next == vm_map_to_entry(map)) ||
6806 		    (entry->vme_next->vme_start > entry->vme_end))) {
6807 			/* found a hole */
6808 			rc = KERN_INVALID_ADDRESS;
6809 			goto done;
6810 		}
6811 		if ((entry->protection & access_type) != access_type) {
6812 			/* found a protection problem */
6813 			rc = KERN_PROTECTION_FAILURE;
6814 			goto done;
6815 		}
6816 
6817 		assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6818 
6819 		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6820 			goto done;
6821 		}
6822 
6823 		entry->in_transition = TRUE;
6824 
6825 		/*
6826 		 * This entry might get split once we unlock the map.
6827 		 * In vm_fault_wire(), we need the current range as
6828 		 * defined by this entry.  In order for this to work
6829 		 * along with a simultaneous clip operation, we make a
6830 		 * temporary copy of this entry and use that for the
6831 		 * wiring.  Note that the underlying objects do not
6832 		 * change during a clip.
6833 		 */
6834 		tmp_entry = *entry;
6835 
6836 		/*
6837 		 * The in_transition state guarentees that the entry
6838 		 * (or entries for this range, if split occured) will be
6839 		 * there when the map lock is acquired for the second time.
6840 		 */
6841 		vm_map_unlock(map);
6842 
6843 		if (!user_wire && cur_thread != THREAD_NULL) {
6844 			interruptible_state = thread_interrupt_level(THREAD_UNINT);
6845 		} else {
6846 			interruptible_state = THREAD_UNINT;
6847 		}
6848 
6849 		if (map_pmap) {
6850 			rc = vm_fault_wire(map,
6851 			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
6852 			    physpage_p);
6853 		} else {
6854 			rc = vm_fault_wire(map,
6855 			    &tmp_entry, caller_prot, tag, map->pmap,
6856 			    tmp_entry.vme_start,
6857 			    physpage_p);
6858 		}
6859 
6860 		if (!user_wire && cur_thread != THREAD_NULL) {
6861 			thread_interrupt_level(interruptible_state);
6862 		}
6863 
6864 		vm_map_lock(map);
6865 
6866 		if (last_timestamp + 1 != map->timestamp) {
6867 			/*
6868 			 * Find the entry again.  It could have been clipped
6869 			 * after we unlocked the map.
6870 			 */
6871 			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
6872 			    &first_entry)) {
6873 				panic("vm_map_wire: re-lookup failed");
6874 			}
6875 
6876 			entry = first_entry;
6877 		}
6878 
6879 		last_timestamp = map->timestamp;
6880 
6881 		while ((entry != vm_map_to_entry(map)) &&
6882 		    (entry->vme_start < tmp_entry.vme_end)) {
6883 			assert(entry->in_transition);
6884 			entry->in_transition = FALSE;
6885 			if (entry->needs_wakeup) {
6886 				entry->needs_wakeup = FALSE;
6887 				need_wakeup = TRUE;
6888 			}
6889 			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
6890 				subtract_wire_counts(map, entry, user_wire);
6891 			}
6892 			entry = entry->vme_next;
6893 		}
6894 
6895 		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
6896 			goto done;
6897 		}
6898 
6899 		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
6900 		    (tmp_entry.vme_end != end) &&    /* AND, we are not at the end of the requested range */
6901 		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
6902 			/* found a "new" hole */
6903 			s = tmp_entry.vme_end;
6904 			rc = KERN_INVALID_ADDRESS;
6905 			goto done;
6906 		}
6907 
6908 		s = entry->vme_start;
6909 	} /* end while loop through map entries */
6910 
6911 done:
6912 	if (rc == KERN_SUCCESS) {
6913 		/* repair any damage we may have made to the VM map */
6914 		vm_map_simplify_range(map, start, end);
6915 	}
6916 
6917 	vm_map_unlock(map);
6918 
6919 	/*
6920 	 * wake up anybody waiting on entries we wired.
6921 	 */
6922 	if (need_wakeup) {
6923 		vm_map_entry_wakeup(map);
6924 	}
6925 
6926 	if (rc != KERN_SUCCESS) {
6927 		/* undo what has been wired so far */
6928 		vm_map_unwire_nested(map, start, s, user_wire,
6929 		    map_pmap, pmap_addr);
6930 		if (physpage_p) {
6931 			*physpage_p = 0;
6932 		}
6933 	}
6934 
6935 	return rc;
6936 }
6937 
6938 static inline kern_return_t
vm_map_wire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size,vm_prot_t * prot)6939 vm_map_wire_sanitize(
6940 	vm_map_t                map,
6941 	vm_map_offset_ut        start_u,
6942 	vm_map_offset_ut        end_u,
6943 	vm_prot_ut              prot_u,
6944 	vm_sanitize_caller_t    vm_sanitize_caller,
6945 	vm_map_offset_t        *start,
6946 	vm_map_offset_t        *end,
6947 	vm_map_size_t          *size,
6948 	vm_prot_t              *prot)
6949 {
6950 	kern_return_t   kr;
6951 
6952 	kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
6953 	    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
6954 	    size);
6955 	if (__improbable(kr != KERN_SUCCESS)) {
6956 		return kr;
6957 	}
6958 
6959 	kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
6960 	if (__improbable(kr != KERN_SUCCESS)) {
6961 		return kr;
6962 	}
6963 
6964 	return KERN_SUCCESS;
6965 }
6966 
6967 /*
6968  * Validation function for vm_map_wire_nested().
6969  */
6970 kern_return_t
vm_map_wire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p,vm_sanitize_caller_t vm_sanitize_caller)6971 vm_map_wire_impl(
6972 	vm_map_t                map,
6973 	vm_map_offset_ut        start_u,
6974 	vm_map_offset_ut        end_u,
6975 	vm_prot_ut              prot_u,
6976 	vm_tag_t                tag,
6977 	boolean_t               user_wire,
6978 	ppnum_t                *physpage_p,
6979 	vm_sanitize_caller_t    vm_sanitize_caller)
6980 {
6981 	vm_map_offset_t start, end;
6982 	vm_map_size_t   size;
6983 	vm_prot_t       prot;
6984 	kern_return_t   kr;
6985 
6986 	/*
6987 	 * Sanitize any input parameters that are addr/size/prot/inherit
6988 	 */
6989 	kr = vm_map_wire_sanitize(map,
6990 	    start_u,
6991 	    end_u,
6992 	    prot_u,
6993 	    vm_sanitize_caller,
6994 	    &start,
6995 	    &end,
6996 	    &size,
6997 	    &prot);
6998 	if (__improbable(kr != KERN_SUCCESS)) {
6999 		if (physpage_p) {
7000 			*physpage_p = 0;
7001 		}
7002 		return vm_sanitize_get_kr(kr);
7003 	}
7004 
7005 	return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7006 	           PMAP_NULL, 0, physpage_p);
7007 }
7008 
7009 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,boolean_t user_wire)7010 vm_map_wire_external(
7011 	vm_map_t                map,
7012 	vm_map_offset_ut        start_u,
7013 	vm_map_offset_ut        end_u,
7014 	vm_prot_ut              prot_u,
7015 	boolean_t               user_wire)
7016 {
7017 	vm_tag_t tag = vm_tag_bt();
7018 
7019 	return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7020 }
7021 
7022 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire)7023 vm_map_wire_kernel(
7024 	vm_map_t                map,
7025 	vm_map_offset_ut        start_u,
7026 	vm_map_offset_ut        end_u,
7027 	vm_prot_ut              prot_u,
7028 	vm_tag_t                tag,
7029 	boolean_t               user_wire)
7030 {
7031 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7032 	           user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7033 }
7034 
7035 #if XNU_PLATFORM_MacOSX
7036 
7037 kern_return_t
vm_map_wire_and_extract(vm_map_t map,vm_map_offset_ut start_u,vm_prot_ut prot_u,boolean_t user_wire,ppnum_t * physpage_p)7038 vm_map_wire_and_extract(
7039 	vm_map_t                map,
7040 	vm_map_offset_ut        start_u,
7041 	vm_prot_ut              prot_u,
7042 	boolean_t               user_wire,
7043 	ppnum_t                *physpage_p)
7044 {
7045 	vm_tag_t         tag    = vm_tag_bt();
7046 	vm_map_size_ut   size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7047 	vm_map_offset_ut end_u  = vm_sanitize_compute_unsafe_end(start_u, size_u);
7048 
7049 	return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7050 	           user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7051 }
7052 
7053 #endif /* XNU_PLATFORM_MacOSX */
7054 
/*
 * vm_map_unwire_nested:
 *
 * Drop one wiring reference (the user-wire reference when "user_wire"
 * is TRUE) from every map entry overlapping [start, end) in "map".
 * When an entry's wired_count reaches zero its pages are unwired via
 * vm_fault_unwire(); submap entries recurse into the submap.
 * "map_pmap"/"pmap_addr" name the physical map to unwire in when this
 * request was forwarded down from a parent map; map_pmap == NULL means
 * operate on the map's own pmap.
 *
 * For kernel requests (!user_wire), inconsistencies -- holes in the
 * range, already-unwired entries, in-transition entries, or failed
 * re-lookups -- are fatal (panic).  For user requests they either
 * return KERN_INVALID_ADDRESS or are skipped.
 *
 * The map lock is dropped around faulting; entries are protected
 * across those windows by the "in_transition" flag, and clips by other
 * threads are detected via the map timestamp.
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr)
{
	vm_map_entry_t          entry;
	struct vm_map_entry     *first_entry, tmp_entry;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE; /* NOTE(review): assigned below but never read in this function */
	unsigned int            last_timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		return KERN_SUCCESS;
	}

	vm_map_lock(map);
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	/* snapshot the timestamp to detect clips across unlock windows */
	last_timestamp = map->timestamp;

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/*	Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen:  there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decrement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* translate the clipped entry's span into submap coordinates */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				/* pick the pmap the submap's pages live in */
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
/*
 *                                       entry = entry->vme_next;
 *                                       continue;
 */
				}

				subtract_wire_counts(map, entry, user_wire);

				if (entry->wired_count != 0) {
					/* still wired by other requests: keep the pages wired */
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existence of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/*
				 * Request forwarded from a parent map: recurse
				 * with the caller-provided pmap.  Note there is
				 * no "continue" here, so control falls through
				 * to update this entry's wire counts below.
				 */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 *	  this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		if (entry->wired_count != 0) {
			/* still wired by other requests: keep the pages wired */
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existence of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
			    pmap_addr, tmp_entry.vme_end);
		} else {
			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start, tmp_entry.vme_end);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7393 
7394 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire)7395 vm_map_unwire(
7396 	vm_map_t                map,
7397 	vm_map_offset_ut        start_u,
7398 	vm_map_offset_ut        end_u,
7399 	boolean_t               user_wire)
7400 {
7401 	return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7402 	           VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7403 }
7404 
7405 static inline kern_return_t
vm_map_unwire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size)7406 vm_map_unwire_sanitize(
7407 	vm_map_t                map,
7408 	vm_map_offset_ut        start_u,
7409 	vm_map_offset_ut        end_u,
7410 	vm_sanitize_caller_t    vm_sanitize_caller,
7411 	vm_map_offset_t        *start,
7412 	vm_map_offset_t        *end,
7413 	vm_map_size_t          *size)
7414 {
7415 	return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7416 	           VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7417 	           size);
7418 }
7419 
7420 kern_return_t
vm_map_unwire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire,vm_sanitize_caller_t vm_sanitize_caller)7421 vm_map_unwire_impl(
7422 	vm_map_t                map,
7423 	vm_map_offset_ut        start_u,
7424 	vm_map_offset_ut        end_u,
7425 	boolean_t               user_wire,
7426 	vm_sanitize_caller_t    vm_sanitize_caller)
7427 {
7428 	vm_map_offset_t start, end;
7429 	vm_map_size_t   size;
7430 	kern_return_t   kr;
7431 
7432 	/*
7433 	 * Sanitize any input parameters that are addr/size/prot/inherit
7434 	 */
7435 	kr = vm_map_unwire_sanitize(
7436 		map,
7437 		start_u,
7438 		end_u,
7439 		vm_sanitize_caller,
7440 		&start,
7441 		&end,
7442 		&size);
7443 	if (__improbable(kr != KERN_SUCCESS)) {
7444 		return vm_sanitize_get_kr(kr);
7445 	}
7446 
7447 	return vm_map_unwire_nested(map, start, end,
7448 	           user_wire, (pmap_t)NULL, 0);
7449 }
7450 
7451 
7452 /*
7453  *	vm_map_entry_zap:	[ internal use only ]
7454  *
7455  *	Remove the entry from the target map
7456  *	and put it on a zap list.
7457  */
7458 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7459 vm_map_entry_zap(
7460 	vm_map_t                map,
7461 	vm_map_entry_t          entry,
7462 	vm_map_zap_t            zap)
7463 {
7464 	vm_map_offset_t s, e;
7465 
7466 	s = entry->vme_start;
7467 	e = entry->vme_end;
7468 	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7469 	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7470 	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7471 		assert(page_aligned(s));
7472 		assert(page_aligned(e));
7473 	}
7474 	if (entry->map_aligned == TRUE) {
7475 		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7476 		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7477 	}
7478 	assert(entry->wired_count == 0);
7479 	assert(entry->user_wired_count == 0);
7480 	assert(!entry->vme_permanent);
7481 
7482 	vm_map_store_entry_unlink(map, entry, false);
7483 	map->size -= e - s;
7484 
7485 	vm_map_zap_append(zap, entry);
7486 }
7487 
/*
 *	vm_map_submap_pmap_clean:	[ internal use only ]
 *
 *	Remove the physical (pmap) translations backing the range
 *	[start, end) of "map", where that range is provided by
 *	"sub_map" starting at "offset".
 *
 *	Walks the submap entries overlapping
 *	[offset, offset + (end - start)), recursing through nested
 *	submaps.  Mappings are removed either object-by-object (when
 *	the parent map is mapped in other pmaps) or directly from
 *	the parent map's pmap.
 *
 *	Takes and releases a read lock on "sub_map".
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* [start, end) in "map" corresponds to [offset, submap_end) in "sub_map" */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/*
		 * First entry straddles "offset": clip the size we remove
		 * to the portion of the entry inside [offset, submap_end).
		 */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse to clean its mappings too */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * The parent map may be mapped into several
				 * pmaps: remove the translations through the
				 * backing object so all of them get cleaned.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* single pmap: remove the range directly */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * If the lookup missed, "entry" was left at the entry preceding
	 * "offset", so advancing yields the first entry in the range.
	 * If it hit, that entry was already handled above.
	 */
	entry = entry->vme_next;

	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		/* clip the removal size to the portion below submap_end */
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7587 
7588 /*
7589  *     virt_memory_guard_ast:
7590  *
7591  *     Handle the AST callout for a virtual memory guard.
7592  *	   raise an EXC_GUARD exception and terminate the task
7593  *     if configured to do so.
7594  */
7595 void
virt_memory_guard_ast(thread_t thread,mach_exception_data_type_t code,mach_exception_data_type_t subcode)7596 virt_memory_guard_ast(
7597 	thread_t thread,
7598 	mach_exception_data_type_t code,
7599 	mach_exception_data_type_t subcode)
7600 {
7601 	task_t task = get_threadtask(thread);
7602 	assert(task != kernel_task);
7603 	assert(task == current_task());
7604 	kern_return_t sync_exception_result;
7605 	uint32_t behavior;
7606 
7607 	behavior = task->task_exc_guard;
7608 
7609 	/* Is delivery enabled */
7610 	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7611 		return;
7612 	}
7613 
7614 	/* If only once, make sure we're that once */
7615 	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7616 		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7617 
7618 		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7619 			break;
7620 		}
7621 		behavior = task->task_exc_guard;
7622 		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7623 			return;
7624 		}
7625 	}
7626 
7627 	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7628 	/* Raise exception synchronously and see if handler claimed it */
7629 	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7630 
7631 	if (fatal) {
7632 		/*
7633 		 * If Synchronous EXC_GUARD delivery was successful then
7634 		 * kill the process and return, else kill the process
7635 		 * and deliver the exception via EXC_CORPSE_NOTIFY.
7636 		 */
7637 
7638 
7639 		int flags = PX_DEBUG_NO_HONOR;
7640 		exception_info_t info = {
7641 			.os_reason = OS_REASON_GUARD,
7642 			.exception_type = EXC_GUARD,
7643 			.mx_code = code,
7644 			.mx_subcode = subcode
7645 		};
7646 
7647 		if (sync_exception_result == KERN_SUCCESS) {
7648 			flags |= PX_PSIGNAL;
7649 		}
7650 		exit_with_mach_exception(current_proc(), info, flags);
7651 	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7652 		/*
7653 		 * If the synchronous EXC_GUARD delivery was not successful,
7654 		 * raise a simulated crash.
7655 		 */
7656 		if (sync_exception_result != KERN_SUCCESS) {
7657 			task_violated_guard(code, subcode, NULL, FALSE);
7658 		}
7659 	}
7660 }
7661 
7662 /*
7663  *     vm_map_guard_exception:
7664  *
7665  *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7666  *
7667  *     Right now, we do this when we find nothing mapped, or a
7668  *     gap in the mapping when a user address space deallocate
7669  *     was requested. We report the address of the first gap found.
7670  */
7671 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7672 vm_map_guard_exception(
7673 	vm_map_offset_t gap_start,
7674 	unsigned reason)
7675 {
7676 	mach_exception_code_t code = 0;
7677 	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7678 	unsigned int target = 0; /* should we pass in pid associated with map? */
7679 	mach_exception_data_type_t subcode = (uint64_t)gap_start;
7680 	boolean_t fatal = FALSE;
7681 
7682 	task_t task = current_task_early();
7683 
7684 	/* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7685 	if (task == NULL || task == kernel_task) {
7686 		return;
7687 	}
7688 
7689 	EXC_GUARD_ENCODE_TYPE(code, guard_type);
7690 	EXC_GUARD_ENCODE_FLAVOR(code, reason);
7691 	EXC_GUARD_ENCODE_TARGET(code, target);
7692 
7693 	if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7694 		fatal = TRUE;
7695 	}
7696 	thread_guard_violation(current_thread(), code, subcode, fatal);
7697 }
7698 
7699 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7700 vm_map_delete_submap_recurse(
7701 	vm_map_t submap,
7702 	vm_map_offset_t submap_start,
7703 	vm_map_offset_t submap_end)
7704 {
7705 	vm_map_entry_t submap_entry;
7706 
7707 	/*
7708 	 * Verify that the submap does not contain any "permanent" entries
7709 	 * within the specified range.
7710 	 * We do not care about gaps.
7711 	 */
7712 
7713 	vm_map_lock(submap);
7714 
7715 	if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7716 		submap_entry = submap_entry->vme_next;
7717 	}
7718 
7719 	for (;
7720 	    submap_entry != vm_map_to_entry(submap) &&
7721 	    submap_entry->vme_start < submap_end;
7722 	    submap_entry = submap_entry->vme_next) {
7723 		if (submap_entry->vme_permanent) {
7724 			/* "permanent" entry -> fail */
7725 			vm_map_unlock(submap);
7726 			return KERN_PROTECTION_FAILURE;
7727 		}
7728 	}
7729 	/* no "permanent" entries in the range -> success */
7730 	vm_map_unlock(submap);
7731 	return KERN_SUCCESS;
7732 }
7733 
/*
 * Panic helper for vm_map_delete(): "start" is not aligned
 * to the map's page size.
 */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
7744 
/*
 * Panic helper for vm_map_delete(): the removal failed with "kr"
 * in a context where failure is not tolerated.
 */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	kern_return_t           kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
7756 
/*
 * Panic helper for vm_map_delete(): a gap (no map entry) was found
 * at "where" inside the [start, end) range being deleted.
 */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t                map,
	vm_map_offset_t         where,
	vm_map_offset_t         start,
	vm_map_offset_t         end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7768 
/*
 * Panic helper for vm_map_delete(): an attempt was made to remove
 * a "permanent" VM map entry where that is forbidden.
 */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_map_entry_t          entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7783 
/*
 * Internal state flags carried across the vm_map_delete() loop.
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE               = 0x0000,

	VMDS_FOUND_GAP          = 0x0001, /* a hole was found in the range */
	VMDS_GAPS_OK            = 0x0002, /* map terminated/unreferenced: gaps allowed */

	VMDS_KERNEL_PMAP        = 0x0004, /* map is backed by the kernel pmap */
	VMDS_NEEDS_LOOKUP       = 0x0008, /* map lock was dropped: re-lookup entry */
	VMDS_NEEDS_WAKEUP       = 0x0010, /* wake waiters on entries we processed */
	VMDS_KERNEL_KMEMPTR     = 0x0020  /* range belongs to a kmem pointer range */
});
7795 
7796 /*
7797  *	vm_map_delete:	[ internal use only ]
7798  *
7799  *	Deallocates the given address range from the target map.
7800  *	Removes all user wirings. Unwires one kernel wiring if
7801  *	VM_MAP_REMOVE_KUNWIRE is set.  Waits for kernel wirings to go
7802  *	away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set.  Sleeps
7803  *	interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7804  *
7805  *
7806  *	When the map is a kernel map, then any error in removing mappings
7807  *	will lead to a panic so that clients do not have to repeat the panic
7808  *	code at each call site.  If VM_MAP_REMOVE_INTERRUPTIBLE
7809  *	is also passed, then KERN_ABORTED will not lead to a panic.
7810  *
7811  *	This routine is called with map locked and leaves map locked.
7812  */
7813 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)7814 vm_map_delete(
7815 	vm_map_t                map,
7816 	vm_map_offset_t         start,
7817 	vm_map_offset_t         end,
7818 	vmr_flags_t             flags,
7819 	kmem_guard_t            guard,
7820 	vm_map_zap_t            zap_list)
7821 {
7822 	vm_map_entry_t          entry, next;
7823 	int                     interruptible;
7824 	vm_map_offset_t         gap_start = 0;
7825 	vm_map_offset_t         clear_in_transition_end = 0;
7826 	__unused vm_map_offset_t save_start = start;
7827 	__unused vm_map_offset_t save_end = end;
7828 	vm_map_delete_state_t   state = VMDS_NONE;
7829 	kmem_return_t           ret = { };
7830 	vm_map_range_id_t       range_id = 0;
7831 	struct kmem_page_meta  *meta = NULL;
7832 	uint32_t                size_idx, slot_idx;
7833 	struct mach_vm_range    slot;
7834 
7835 	if (vm_map_pmap(map) == kernel_pmap) {
7836 		state |= VMDS_KERNEL_PMAP;
7837 		range_id = kmem_addr_get_range(start, end - start);
7838 		if (kmem_is_ptr_range(range_id)) {
7839 			state |= VMDS_KERNEL_KMEMPTR;
7840 			slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
7841 			    &size_idx, &slot);
7842 		}
7843 	}
7844 
7845 	if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7846 		state |= VMDS_GAPS_OK;
7847 	}
7848 
7849 	if (map->corpse_source &&
7850 	    !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
7851 	    !map->terminated) {
7852 		/*
7853 		 * The map is being used for corpses related diagnostics.
7854 		 * So skip any entry removal to avoid perturbing the map state.
7855 		 * The cleanup will happen in task_terminate_internal after the
7856 		 * call to task_port_no_senders.
7857 		 */
7858 		goto out;
7859 	}
7860 
7861 	interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7862 	    THREAD_ABORTSAFE : THREAD_UNINT;
7863 
7864 	if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
7865 	    (start & VM_MAP_PAGE_MASK(map))) {
7866 		__vm_map_delete_misaligned_panic(map, start, end);
7867 	}
7868 
7869 	if ((state & VMDS_GAPS_OK) == 0) {
7870 		/*
7871 		 * If the map isn't terminated then all deletions must have
7872 		 * no gaps, and be within the [min, max) of the map.
7873 		 *
7874 		 * We got here without VM_MAP_RANGE_CHECK() being called,
7875 		 * and hence must validate bounds manually.
7876 		 *
7877 		 * It is worth noting that because vm_deallocate() will
7878 		 * round_page() the deallocation size, it's possible for "end"
7879 		 * to be 0 here due to overflow. We hence must treat it as being
7880 		 * beyond vm_map_max(map).
7881 		 *
7882 		 * Similarly, end < start means some wrap around happend,
7883 		 * which should cause an error or panic.
7884 		 */
7885 		if (end == 0 || end > vm_map_max(map)) {
7886 			state |= VMDS_FOUND_GAP;
7887 			gap_start = vm_map_max(map);
7888 			if (state & VMDS_KERNEL_PMAP) {
7889 				__vm_map_delete_gap_panic(map,
7890 				    gap_start, start, end);
7891 			}
7892 			goto out;
7893 		}
7894 
7895 		if (end < start) {
7896 			if (state & VMDS_KERNEL_PMAP) {
7897 				__vm_map_delete_gap_panic(map,
7898 				    vm_map_max(map), start, end);
7899 			}
7900 			ret.kmr_return = KERN_INVALID_ARGUMENT;
7901 			goto out;
7902 		}
7903 
7904 		if (start < vm_map_min(map)) {
7905 			state |= VMDS_FOUND_GAP;
7906 			gap_start = start;
7907 			if (state & VMDS_KERNEL_PMAP) {
7908 				__vm_map_delete_gap_panic(map,
7909 				    gap_start, start, end);
7910 			}
7911 			goto out;
7912 		}
7913 	} else {
7914 		/*
7915 		 * If the map is terminated, we must accept start/end
7916 		 * being beyond the boundaries of the map as this is
7917 		 * how some of the mappings like commpage mappings
7918 		 * can be destroyed (they're outside of those bounds).
7919 		 *
7920 		 * end < start is still something we can't cope with,
7921 		 * so just bail.
7922 		 */
7923 		if (end < start) {
7924 			goto out;
7925 		}
7926 	}
7927 
7928 
7929 	/*
7930 	 *	Find the start of the region.
7931 	 *
7932 	 *	If in a superpage, extend the range
7933 	 *	to include the start of the mapping.
7934 	 */
7935 	while (vm_map_lookup_entry_or_next(map, start, &entry)) {
7936 		if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
7937 			start = SUPERPAGE_ROUND_DOWN(start);
7938 		} else {
7939 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7940 			break;
7941 		}
7942 	}
7943 
7944 	if (entry->superpage_size) {
7945 		end = SUPERPAGE_ROUND_UP(end);
7946 	}
7947 
7948 	/*
7949 	 *	Step through all entries in this region
7950 	 */
7951 	for (vm_map_offset_t s = start; s < end;) {
7952 		/*
7953 		 * At this point, we have deleted all the memory entries
7954 		 * in [start, s) and are proceeding with the [s, end) range.
7955 		 *
7956 		 * This loop might drop the map lock, and it is possible that
7957 		 * some memory was already reallocated within [start, s)
7958 		 * and we don't want to mess with those entries.
7959 		 *
7960 		 * Some of those entries could even have been re-assembled
7961 		 * with an entry after "s" (in vm_map_simplify_entry()), so
7962 		 * we may have to vm_map_clip_start() again.
7963 		 *
		 * When clear_in_transition_end is set, we had marked
7965 		 * [start, clear_in_transition_end) as "in_transition"
7966 		 * during a previous iteration and we need to clear it.
7967 		 */
7968 
7969 		/*
7970 		 * Step 1: If needed (because we dropped locks),
7971 		 *         lookup the entry again.
7972 		 *
7973 		 *         If we're coming back from unwiring (Step 5),
7974 		 *         we also need to mark the entries as no longer
7975 		 *         in transition after that.
7976 		 */
7977 
7978 		if (state & VMDS_NEEDS_LOOKUP) {
7979 			state &= ~VMDS_NEEDS_LOOKUP;
7980 
7981 			if (vm_map_lookup_entry_or_next(map, s, &entry)) {
7982 				SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7983 			}
7984 
7985 			if (state & VMDS_KERNEL_KMEMPTR) {
7986 				kmem_validate_slot(s, meta, size_idx, slot_idx);
7987 			}
7988 		}
7989 
7990 		if (clear_in_transition_end) {
7991 			for (vm_map_entry_t it = entry;
7992 			    it != vm_map_to_entry(map) &&
7993 			    it->vme_start < clear_in_transition_end;
7994 			    it = it->vme_next) {
7995 				assert(it->in_transition);
7996 				it->in_transition = FALSE;
7997 				if (it->needs_wakeup) {
7998 					it->needs_wakeup = FALSE;
7999 					state |= VMDS_NEEDS_WAKEUP;
8000 				}
8001 			}
8002 
8003 			clear_in_transition_end = 0;
8004 		}
8005 
8006 
8007 		/*
8008 		 * Step 2: Perform various policy checks
8009 		 *         before we do _anything_ to this entry.
8010 		 */
8011 
8012 		if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8013 			if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8014 				/*
8015 				 * Either we found a gap already,
8016 				 * or we are tearing down a map,
8017 				 * keep going.
8018 				 */
8019 			} else if (state & VMDS_KERNEL_PMAP) {
8020 				__vm_map_delete_gap_panic(map, s, start, end);
8021 			} else if (s < end) {
8022 				state |= VMDS_FOUND_GAP;
8023 				gap_start = s;
8024 			}
8025 
8026 			if (entry == vm_map_to_entry(map) ||
8027 			    end <= entry->vme_start) {
8028 				break;
8029 			}
8030 
8031 			s = entry->vme_start;
8032 		}
8033 
8034 		if (state & VMDS_KERNEL_PMAP) {
8035 			/*
8036 			 * In the kernel map and its submaps,
8037 			 * permanent entries never die, even
8038 			 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8039 			 */
8040 			if (entry->vme_permanent) {
8041 				__vm_map_delete_permanent_panic(map, start, end, entry);
8042 			}
8043 
8044 			if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8045 				end = entry->vme_end;
8046 				flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8047 			}
8048 
8049 			/*
8050 			 * In the kernel map and its submaps,
8051 			 * the removal of an atomic/guarded entry is strict.
8052 			 *
8053 			 * An atomic entry is processed only if it was
8054 			 * specifically targeted.
8055 			 *
8056 			 * We might have deleted non-atomic entries before
			 * we reach this point, however...
8058 			 */
8059 			kmem_entry_validate_guard(map, entry,
8060 			    start, end - start, guard);
8061 		}
8062 
8063 		/*
8064 		 * Step 2.1: handle "permanent" and "submap" entries
8065 		 * *before* clipping to avoid triggering some unnecessary
8066 		 * un-nesting of the shared region.
8067 		 */
8068 		if (entry->vme_permanent && entry->is_sub_map) {
8069 //			printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8070 			/*
8071 			 * Un-mapping a "permanent" mapping of a user-space
8072 			 * submap is not allowed unless...
8073 			 */
8074 			if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8075 				/*
8076 				 * a. explicitly requested by the kernel caller.
8077 				 */
8078 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8079 			} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8080 			    developer_mode_state()) {
8081 				/*
8082 				 * b. we're in "developer" mode (for
8083 				 *    breakpoints, dtrace probes, ...).
8084 				 */
8085 //				printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8086 			} else if (map->terminated) {
8087 				/*
8088 				 * c. this is the final address space cleanup.
8089 				 */
8090 //				printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8091 			} else {
8092 				vm_map_offset_t submap_start, submap_end;
8093 				kern_return_t submap_kr;
8094 
8095 				/*
8096 				 * Check if there are any "permanent" mappings
8097 				 * in this range in the submap.
8098 				 */
8099 				if (entry->in_transition) {
8100 					/* can that even happen ? */
8101 					goto in_transition;
8102 				}
8103 				/* compute the clipped range in the submap */
8104 				submap_start = s - entry->vme_start;
8105 				submap_start += VME_OFFSET(entry);
8106 				submap_end = end - entry->vme_start;
8107 				submap_end += VME_OFFSET(entry);
8108 				submap_kr = vm_map_delete_submap_recurse(
8109 					VME_SUBMAP(entry),
8110 					submap_start,
8111 					submap_end);
8112 				if (submap_kr != KERN_SUCCESS) {
8113 					/*
8114 					 * There are some "permanent" mappings
8115 					 * in the submap: we are not allowed
8116 					 * to remove this range.
8117 					 */
8118 					printf("%d[%s] removing permanent submap entry "
8119 					    "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8120 					    proc_selfpid(),
8121 					    (get_bsdtask_info(current_task())
8122 					    ? proc_name_address(get_bsdtask_info(current_task()))
8123 					    : "?"), entry,
8124 					    (uint64_t)entry->vme_start,
8125 					    (uint64_t)entry->vme_end,
8126 					    entry->protection,
8127 					    entry->max_protection);
8128 					DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8129 					    vm_map_entry_t, entry,
8130 					    vm_map_offset_t, entry->vme_start,
8131 					    vm_map_offset_t, entry->vme_end,
8132 					    vm_prot_t, entry->protection,
8133 					    vm_prot_t, entry->max_protection,
8134 					    int, VME_ALIAS(entry));
8135 					ret.kmr_return = KERN_PROTECTION_FAILURE;
8136 					goto out;
8137 				}
8138 				/* no permanent mappings: proceed */
8139 			}
8140 		}
8141 
8142 		/*
8143 		 * Step 3: Perform any clipping needed.
8144 		 *
8145 		 *         After this, "entry" starts at "s", ends before "end"
8146 		 */
8147 
8148 		if (entry->vme_start < s) {
8149 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8150 			    entry->map_aligned &&
8151 			    !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8152 				/*
8153 				 * The entry will no longer be map-aligned
8154 				 * after clipping and the caller said it's OK.
8155 				 */
8156 				entry->map_aligned = FALSE;
8157 			}
8158 			vm_map_clip_start(map, entry, s);
8159 			SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8160 		}
8161 
8162 		if (end < entry->vme_end) {
8163 			if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8164 			    entry->map_aligned &&
8165 			    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8166 				/*
8167 				 * The entry will no longer be map-aligned
8168 				 * after clipping and the caller said it's OK.
8169 				 */
8170 				entry->map_aligned = FALSE;
8171 			}
8172 			vm_map_clip_end(map, entry, end);
8173 		}
8174 
8175 		if (entry->vme_permanent && entry->is_sub_map) {
8176 			/*
8177 			 * We already went through step 2.1 which did not deny
8178 			 * the removal of this "permanent" and "is_sub_map"
8179 			 * entry.
8180 			 * Now that we've clipped what we actually want to
8181 			 * delete, undo the "permanent" part to allow the
8182 			 * removal to proceed.
8183 			 */
8184 			DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8185 			    vm_map_entry_t, entry,
8186 			    vm_map_offset_t, entry->vme_start,
8187 			    vm_map_offset_t, entry->vme_end,
8188 			    vm_prot_t, entry->protection,
8189 			    vm_prot_t, entry->max_protection,
8190 			    int, VME_ALIAS(entry));
8191 			entry->vme_permanent = false;
8192 		}
8193 
8194 		assert(s == entry->vme_start);
8195 		assert(entry->vme_end <= end);
8196 
8197 
8198 		/*
8199 		 * Step 4: If the entry is in flux, wait for this to resolve.
8200 		 */
8201 
8202 		if (entry->in_transition) {
8203 			wait_result_t wait_result;
8204 
8205 in_transition:
8206 			/*
8207 			 * Another thread is wiring/unwiring this entry.
8208 			 * Let the other thread know we are waiting.
8209 			 */
8210 
8211 			entry->needs_wakeup = TRUE;
8212 
8213 			/*
8214 			 * wake up anybody waiting on entries that we have
8215 			 * already unwired/deleted.
8216 			 */
8217 			if (state & VMDS_NEEDS_WAKEUP) {
8218 				vm_map_entry_wakeup(map);
8219 				state &= ~VMDS_NEEDS_WAKEUP;
8220 			}
8221 
8222 			wait_result = vm_map_entry_wait(map, interruptible);
8223 
8224 			if (interruptible &&
8225 			    wait_result == THREAD_INTERRUPTED) {
8226 				/*
8227 				 * We do not clear the needs_wakeup flag,
8228 				 * since we cannot tell if we were the only one.
8229 				 */
8230 				ret.kmr_return = KERN_ABORTED;
8231 				return ret;
8232 			}
8233 
8234 			/*
8235 			 * The entry could have been clipped or it
8236 			 * may not exist anymore.  Look it up again.
8237 			 */
8238 			state |= VMDS_NEEDS_LOOKUP;
8239 			continue;
8240 		}
8241 
8242 
8243 		/*
8244 		 * Step 5: Handle wiring
8245 		 */
8246 
8247 		if (entry->wired_count) {
8248 			struct vm_map_entry tmp_entry;
8249 			boolean_t           user_wire;
8250 			unsigned int        last_timestamp;
8251 
8252 			user_wire = entry->user_wired_count > 0;
8253 
8254 			/*
8255 			 *      Remove a kernel wiring if requested
8256 			 */
8257 			if (flags & VM_MAP_REMOVE_KUNWIRE) {
8258 				entry->wired_count--;
8259 				vme_btref_consider_and_put(entry);
8260 			}
8261 
8262 			/*
8263 			 *	Remove all user wirings for proper accounting
8264 			 */
8265 			while (entry->user_wired_count) {
8266 				subtract_wire_counts(map, entry, user_wire);
8267 			}
8268 
8269 			/*
8270 			 * All our DMA I/O operations in IOKit are currently
8271 			 * done by wiring through the map entries of the task
8272 			 * requesting the I/O.
8273 			 *
8274 			 * Because of this, we must always wait for kernel wirings
8275 			 * to go away on the entries before deleting them.
8276 			 *
8277 			 * Any caller who wants to actually remove a kernel wiring
8278 			 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8279 			 * properly remove one wiring instead of blasting through
8280 			 * them all.
8281 			 */
8282 			if (entry->wired_count != 0) {
8283 				assert(map != kernel_map);
8284 				/*
8285 				 * Cannot continue.  Typical case is when
8286 				 * a user thread has physical io pending on
8287 				 * on this page.  Either wait for the
8288 				 * kernel wiring to go away or return an
8289 				 * error.
8290 				 */
8291 				wait_result_t wait_result;
8292 
8293 				entry->needs_wakeup = TRUE;
8294 				wait_result = vm_map_entry_wait(map,
8295 				    interruptible);
8296 
8297 				if (interruptible &&
8298 				    wait_result == THREAD_INTERRUPTED) {
8299 					/*
8300 					 * We do not clear the
8301 					 * needs_wakeup flag, since we
8302 					 * cannot tell if we were the
8303 					 * only one.
8304 					 */
8305 					ret.kmr_return = KERN_ABORTED;
8306 					return ret;
8307 				}
8308 
8309 
8310 				/*
8311 				 * The entry could have been clipped or
8312 				 * it may not exist anymore.  Look it
8313 				 * up again.
8314 				 */
8315 				state |= VMDS_NEEDS_LOOKUP;
8316 				continue;
8317 			}
8318 
8319 			/*
8320 			 * We can unlock the map now.
8321 			 *
8322 			 * The entry might be split once we unlock the map,
8323 			 * but we need the range as defined by this entry
8324 			 * to be stable. So we must make a local copy.
8325 			 *
8326 			 * The underlying objects do not change during clips,
8327 			 * and the in_transition state guarentees existence
8328 			 * of the entry.
8329 			 */
8330 			last_timestamp = map->timestamp;
8331 			entry->in_transition = TRUE;
8332 			tmp_entry = *entry;
8333 			vm_map_unlock(map);
8334 
8335 			if (tmp_entry.is_sub_map) {
8336 				vm_map_t sub_map;
8337 				vm_map_offset_t sub_start, sub_end;
8338 				pmap_t pmap;
8339 				vm_map_offset_t pmap_addr;
8340 
8341 
8342 				sub_map = VME_SUBMAP(&tmp_entry);
8343 				sub_start = VME_OFFSET(&tmp_entry);
8344 				sub_end = sub_start + (tmp_entry.vme_end -
8345 				    tmp_entry.vme_start);
8346 				if (tmp_entry.use_pmap) {
8347 					pmap = sub_map->pmap;
8348 					pmap_addr = tmp_entry.vme_start;
8349 				} else {
8350 					pmap = map->pmap;
8351 					pmap_addr = tmp_entry.vme_start;
8352 				}
8353 				(void) vm_map_unwire_nested(sub_map,
8354 				    sub_start, sub_end,
8355 				    user_wire,
8356 				    pmap, pmap_addr);
8357 			} else {
8358 				vm_map_offset_t entry_end = tmp_entry.vme_end;
8359 				vm_map_offset_t max_end;
8360 
8361 				if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8362 					max_end = end - VM_MAP_PAGE_SIZE(map);
8363 					if (entry_end > max_end) {
8364 						entry_end = max_end;
8365 					}
8366 				}
8367 
8368 				if (tmp_entry.vme_kernel_object) {
8369 					pmap_protect_options(
8370 						map->pmap,
8371 						tmp_entry.vme_start,
8372 						entry_end,
8373 						VM_PROT_NONE,
8374 						PMAP_OPTIONS_REMOVE,
8375 						NULL);
8376 				}
8377 				vm_fault_unwire(map, &tmp_entry,
8378 				    tmp_entry.vme_kernel_object, map->pmap,
8379 				    tmp_entry.vme_start, entry_end);
8380 			}
8381 
8382 			vm_map_lock(map);
8383 
8384 			/*
8385 			 * Unwiring happened, we can now go back to deleting
8386 			 * them (after we clear the in_transition bit for the range).
8387 			 */
8388 			if (last_timestamp + 1 != map->timestamp) {
8389 				state |= VMDS_NEEDS_LOOKUP;
8390 			}
8391 			clear_in_transition_end = tmp_entry.vme_end;
8392 			continue;
8393 		}
8394 
8395 		assert(entry->wired_count == 0);
8396 		assert(entry->user_wired_count == 0);
8397 
8398 
8399 		/*
8400 		 * Step 6: Entry is unwired and ready for us to delete !
8401 		 */
8402 
8403 		if (!entry->vme_permanent) {
8404 			/*
8405 			 * Typical case: the entry really shouldn't be permanent
8406 			 */
8407 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8408 		    (entry->protection & VM_PROT_EXECUTE) &&
8409 		    developer_mode_state()) {
8410 			/*
8411 			 * Allow debuggers to undo executable mappings
8412 			 * when developer mode is on.
8413 			 */
8414 #if 0
8415 			printf("FBDP %d[%s] removing permanent executable entry "
8416 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8417 			    proc_selfpid(),
8418 			    (current_task()->bsd_info
8419 			    ? proc_name_address(current_task()->bsd_info)
8420 			    : "?"), entry,
8421 			    (uint64_t)entry->vme_start,
8422 			    (uint64_t)entry->vme_end,
8423 			    entry->protection,
8424 			    entry->max_protection);
8425 #endif
8426 			entry->vme_permanent = FALSE;
8427 		} else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8428 #if 0
8429 			printf("FBDP %d[%s] removing permanent entry "
8430 			    "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8431 			    proc_selfpid(),
8432 			    (current_task()->bsd_info
8433 			    ? proc_name_address(current_task()->bsd_info)
8434 			    : "?"), entry,
8435 			    (uint64_t)entry->vme_start,
8436 			    (uint64_t)entry->vme_end,
8437 			    entry->protection,
8438 			    entry->max_protection);
8439 #endif
8440 			entry->vme_permanent = FALSE;
8441 #if CODE_SIGNING_MONITOR
8442 		} else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8443 			entry->vme_permanent = FALSE;
8444 
8445 			printf("%d[%s] %s(0x%llx,0x%llx): "
8446 			    "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8447 			    "prot 0x%x/0x%x\n",
8448 			    proc_selfpid(),
8449 			    (get_bsdtask_info(current_task())
8450 			    ? proc_name_address(get_bsdtask_info(current_task()))
8451 			    : "?"),
8452 			    __FUNCTION__,
8453 			    (uint64_t)start,
8454 			    (uint64_t)end,
8455 			    (uint64_t)entry->vme_start,
8456 			    (uint64_t)entry->vme_end,
8457 			    entry->protection,
8458 			    entry->max_protection);
8459 #endif
8460 		} else {
8461 			DTRACE_VM6(vm_map_delete_permanent,
8462 			    vm_map_entry_t, entry,
8463 			    vm_map_offset_t, entry->vme_start,
8464 			    vm_map_offset_t, entry->vme_end,
8465 			    vm_prot_t, entry->protection,
8466 			    vm_prot_t, entry->max_protection,
8467 			    int, VME_ALIAS(entry));
8468 		}
8469 
8470 		if (entry->is_sub_map) {
8471 			assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8472 			    "map %p (%d) entry %p submap %p (%d)\n",
8473 			    map, VM_MAP_PAGE_SHIFT(map), entry,
8474 			    VME_SUBMAP(entry),
8475 			    VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8476 			if (entry->use_pmap) {
8477 #ifndef NO_NESTED_PMAP
8478 				int pmap_flags;
8479 
8480 				if (map->terminated) {
8481 					/*
8482 					 * This is the final cleanup of the
8483 					 * address space being terminated.
8484 					 * No new mappings are expected and
8485 					 * we don't really need to unnest the
8486 					 * shared region (and lose the "global"
8487 					 * pmap mappings, if applicable).
8488 					 *
8489 					 * Tell the pmap layer that we're
8490 					 * "clean" wrt nesting.
8491 					 */
8492 					pmap_flags = PMAP_UNNEST_CLEAN;
8493 				} else {
8494 					/*
8495 					 * We're unmapping part of the nested
8496 					 * shared region, so we can't keep the
8497 					 * nested pmap.
8498 					 */
8499 					pmap_flags = 0;
8500 				}
8501 				pmap_unnest_options(
8502 					map->pmap,
8503 					(addr64_t)entry->vme_start,
8504 					entry->vme_end - entry->vme_start,
8505 					pmap_flags);
8506 #endif  /* NO_NESTED_PMAP */
8507 				if (map->mapped_in_other_pmaps &&
8508 				    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8509 					/* clean up parent map/maps */
8510 					vm_map_submap_pmap_clean(
8511 						map, entry->vme_start,
8512 						entry->vme_end,
8513 						VME_SUBMAP(entry),
8514 						VME_OFFSET(entry));
8515 				}
8516 			} else {
8517 				vm_map_submap_pmap_clean(
8518 					map, entry->vme_start, entry->vme_end,
8519 					VME_SUBMAP(entry),
8520 					VME_OFFSET(entry));
8521 			}
8522 		} else if (entry->vme_kernel_object ||
8523 		    VME_OBJECT(entry) == compressor_object) {
8524 			/*
8525 			 * nothing to do
8526 			 */
8527 		} else if (map->mapped_in_other_pmaps &&
8528 		    os_ref_get_count_raw(&map->map_refcnt) != 0) {
8529 			vm_object_pmap_protect_options(
8530 				VME_OBJECT(entry), VME_OFFSET(entry),
8531 				entry->vme_end - entry->vme_start,
8532 				PMAP_NULL,
8533 				PAGE_SIZE,
8534 				entry->vme_start,
8535 				VM_PROT_NONE,
8536 				PMAP_OPTIONS_REMOVE);
8537 		} else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8538 		    (state & VMDS_KERNEL_PMAP)) {
8539 			/* Remove translations associated
8540 			 * with this range unless the entry
8541 			 * does not have an object, or
8542 			 * it's the kernel map or a descendant
8543 			 * since the platform could potentially
8544 			 * create "backdoor" mappings invisible
8545 			 * to the VM. It is expected that
8546 			 * objectless, non-kernel ranges
8547 			 * do not have such VM invisible
8548 			 * translations.
8549 			 */
8550 			pmap_remove_options(map->pmap,
8551 			    (addr64_t)entry->vme_start,
8552 			    (addr64_t)entry->vme_end,
8553 			    PMAP_OPTIONS_REMOVE);
8554 		}
8555 
8556 #if DEBUG
8557 		/*
8558 		 * All pmap mappings for this map entry must have been
8559 		 * cleared by now.
8560 		 */
8561 		assert(pmap_is_empty(map->pmap,
8562 		    entry->vme_start,
8563 		    entry->vme_end));
8564 #endif /* DEBUG */
8565 
8566 		if (entry->iokit_acct) {
8567 			/* alternate accounting */
8568 			DTRACE_VM4(vm_map_iokit_unmapped_region,
8569 			    vm_map_t, map,
8570 			    vm_map_offset_t, entry->vme_start,
8571 			    vm_map_offset_t, entry->vme_end,
8572 			    int, VME_ALIAS(entry));
8573 			vm_map_iokit_unmapped_region(map,
8574 			    (entry->vme_end -
8575 			    entry->vme_start));
8576 			entry->iokit_acct = FALSE;
8577 			entry->use_pmap = FALSE;
8578 		}
8579 
8580 		/* move "s" forward */
8581 		s    = entry->vme_end;
8582 		next = entry->vme_next;
8583 		if (!entry->map_aligned) {
8584 			vm_map_offset_t rounded_s;
8585 
8586 			/*
8587 			 * Skip artificial gap due to mis-aligned entry
8588 			 * on devices with a page size smaller than the
8589 			 * map's page size (i.e. 16k task on a 4k device).
8590 			 */
8591 			rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8592 			if (next == vm_map_to_entry(map)) {
8593 				s = rounded_s;
8594 			} else if (s < rounded_s) {
8595 				s = MIN(rounded_s, next->vme_start);
8596 			}
8597 		}
8598 		ret.kmr_size += s - entry->vme_start;
8599 
8600 		if (entry->vme_permanent) {
8601 			/*
8602 			 * A permanent entry can not be removed, so leave it
8603 			 * in place but remove all access permissions.
8604 			 */
8605 			if (!entry->csm_associated) {
8606 				printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8607 				    __FUNCTION__, __LINE__,
8608 				    proc_selfpid(),
8609 				    (get_bsdtask_info(current_task())
8610 				    ? proc_name_address(get_bsdtask_info(current_task()))
8611 				    : "?"),
8612 				    map,
8613 				    entry,
8614 				    (uint64_t)entry->vme_start,
8615 				    (uint64_t)entry->vme_end,
8616 				    entry->is_sub_map,
8617 				    entry->protection,
8618 				    entry->max_protection);
8619 			}
8620 			DTRACE_VM6(vm_map_delete_permanent_prot_none,
8621 			    vm_map_entry_t, entry,
8622 			    vm_map_offset_t, entry->vme_start,
8623 			    vm_map_offset_t, entry->vme_end,
8624 			    vm_prot_t, entry->protection,
8625 			    vm_prot_t, entry->max_protection,
8626 			    int, VME_ALIAS(entry));
8627 			entry->protection = VM_PROT_NONE;
8628 			entry->max_protection = VM_PROT_NONE;
8629 		} else {
8630 			vm_map_entry_zap(map, entry, zap_list);
8631 		}
8632 
8633 		entry = next;
8634 		next  = VM_MAP_ENTRY_NULL;
8635 
8636 		if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8637 			unsigned int last_timestamp = map->timestamp++;
8638 
8639 			if (lck_rw_lock_yield_exclusive(&map->lock,
8640 			    LCK_RW_YIELD_ANY_WAITER)) {
8641 				if (last_timestamp != map->timestamp + 1) {
8642 					state |= VMDS_NEEDS_LOOKUP;
8643 				}
8644 			} else {
8645 				/* we didn't yield, undo our change */
8646 				map->timestamp--;
8647 			}
8648 		}
8649 	}
8650 
8651 	if (map->wait_for_space) {
8652 		thread_wakeup((event_t) map);
8653 	}
8654 
8655 	if (state & VMDS_NEEDS_WAKEUP) {
8656 		vm_map_entry_wakeup(map);
8657 	}
8658 
8659 out:
8660 	if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8661 		__vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8662 	}
8663 
8664 	if (state & VMDS_KERNEL_KMEMPTR) {
8665 		kmem_free_space(start, end, range_id, &slot);
8666 	}
8667 
8668 	if (state & VMDS_FOUND_GAP) {
8669 		DTRACE_VM3(kern_vm_deallocate_gap,
8670 		    vm_map_offset_t, gap_start,
8671 		    vm_map_offset_t, save_start,
8672 		    vm_map_offset_t, save_end);
8673 		if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8674 			ret.kmr_return = KERN_INVALID_VALUE;
8675 		} else {
8676 			vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8677 		}
8678 	}
8679 
8680 	return ret;
8681 }
8682 
8683 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8684 vm_map_remove_and_unlock(
8685 	vm_map_t        map,
8686 	vm_map_offset_t start,
8687 	vm_map_offset_t end,
8688 	vmr_flags_t     flags,
8689 	kmem_guard_t    guard)
8690 {
8691 	kmem_return_t ret;
8692 	VM_MAP_ZAP_DECLARE(zap);
8693 
8694 	ret = vm_map_delete(map, start, end, flags, guard, &zap);
8695 	vm_map_unlock(map);
8696 
8697 	vm_map_zap_dispose(&zap);
8698 
8699 	return ret;
8700 }
8701 
8702 /*
8703  *	vm_map_remove_guard:
8704  *
8705  *	Remove the given address range from the target map.
8706  *	This is the exported form of vm_map_delete.
8707  */
8708 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8709 vm_map_remove_guard(
8710 	vm_map_t        map,
8711 	vm_map_offset_t start,
8712 	vm_map_offset_t end,
8713 	vmr_flags_t     flags,
8714 	kmem_guard_t    guard)
8715 {
8716 	vm_map_lock(map);
8717 	return vm_map_remove_and_unlock(map, start, end, flags, guard);
8718 }
8719 
8720 /*
8721  *	vm_map_terminate:
8722  *
8723  *	Clean out a task's map.
8724  */
8725 kern_return_t
vm_map_terminate(vm_map_t map)8726 vm_map_terminate(
8727 	vm_map_t        map)
8728 {
8729 	vm_map_lock(map);
8730 	map->terminated = TRUE;
8731 	vm_map_disable_hole_optimization(map);
8732 	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8733 	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8734 	return KERN_SUCCESS;
8735 }
8736 
8737 /*
8738  *	Routine:	vm_map_copy_allocate
8739  *
8740  *	Description:
8741  *		Allocates and initializes a map copy object.
8742  */
8743 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)8744 vm_map_copy_allocate(uint16_t type)
8745 {
8746 	vm_map_copy_t new_copy;
8747 
8748 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8749 	new_copy->type = type;
8750 	if (type == VM_MAP_COPY_ENTRY_LIST) {
8751 		new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8752 		vm_map_store_init(&new_copy->cpy_hdr);
8753 	}
8754 	return new_copy;
8755 }
8756 
8757 /*
8758  *	Routine:	vm_map_copy_discard
8759  *
8760  *	Description:
8761  *		Dispose of a map copy object (returned by
8762  *		vm_map_copyin).
8763  */
8764 void
vm_map_copy_discard(vm_map_copy_t copy)8765 vm_map_copy_discard(
8766 	vm_map_copy_t   copy)
8767 {
8768 	if (copy == VM_MAP_COPY_NULL) {
8769 		return;
8770 	}
8771 
8772 	/*
8773 	 * Assert that the vm_map_copy is coming from the right
8774 	 * zone and hasn't been forged
8775 	 */
8776 	vm_map_copy_require(copy);
8777 
8778 	switch (copy->type) {
8779 	case VM_MAP_COPY_ENTRY_LIST:
8780 		while (vm_map_copy_first_entry(copy) !=
8781 		    vm_map_copy_to_entry(copy)) {
8782 			vm_map_entry_t  entry = vm_map_copy_first_entry(copy);
8783 
8784 			vm_map_copy_entry_unlink(copy, entry);
8785 			if (entry->is_sub_map) {
8786 				vm_map_deallocate(VME_SUBMAP(entry));
8787 			} else {
8788 				vm_object_deallocate(VME_OBJECT(entry));
8789 			}
8790 			vm_map_copy_entry_dispose(entry);
8791 		}
8792 		break;
8793 	case VM_MAP_COPY_KERNEL_BUFFER:
8794 
8795 		/*
8796 		 * The vm_map_copy_t and possibly the data buffer were
8797 		 * allocated by a single call to kalloc_data(), i.e. the
8798 		 * vm_map_copy_t was not allocated out of the zone.
8799 		 */
8800 		if (copy->size > msg_ool_size_small || copy->offset) {
8801 			panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8802 			    (long long)copy->size, (long long)copy->offset);
8803 		}
8804 		kfree_data(copy->cpy_kdata, copy->size);
8805 	}
8806 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
8807 }
8808 
8809 #if XNU_PLATFORM_MacOSX
8810 
8811 __exported
8812 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
8813 
8814 /*
8815  *	Routine:	vm_map_copy_copy
8816  *
8817  *	Description:
8818  *			Move the information in a map copy object to
8819  *			a new map copy object, leaving the old one
8820  *			empty.
8821  *
8822  *			This is used by kernel routines that need
8823  *			to look at out-of-line data (in copyin form)
8824  *			before deciding whether to return SUCCESS.
8825  *			If the routine returns FAILURE, the original
8826  *			copy object will be deallocated; therefore,
8827  *			these routines must make a copy of the copy
8828  *			object and leave the original empty so that
8829  *			deallocation will not fail.
8830  */
8831 vm_map_copy_t
vm_map_copy_copy(vm_map_copy_t copy)8832 vm_map_copy_copy(
8833 	vm_map_copy_t   copy)
8834 {
8835 	vm_map_copy_t   new_copy;
8836 
8837 	if (copy == VM_MAP_COPY_NULL) {
8838 		return VM_MAP_COPY_NULL;
8839 	}
8840 
8841 	/*
8842 	 * Assert that the vm_map_copy is coming from the right
8843 	 * zone and hasn't been forged
8844 	 */
8845 	vm_map_copy_require(copy);
8846 
8847 	/*
8848 	 * Allocate a new copy object, and copy the information
8849 	 * from the old one into it.
8850 	 */
8851 
8852 	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8853 	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
8854 #if __has_feature(ptrauth_calls)
8855 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
8856 		new_copy->cpy_kdata = copy->cpy_kdata;
8857 	}
8858 #endif
8859 
8860 	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
8861 		/*
8862 		 * The links in the entry chain must be
8863 		 * changed to point to the new copy object.
8864 		 */
8865 		vm_map_copy_first_entry(copy)->vme_prev
8866 		        = vm_map_copy_to_entry(new_copy);
8867 		vm_map_copy_last_entry(copy)->vme_next
8868 		        = vm_map_copy_to_entry(new_copy);
8869 	}
8870 
8871 	/*
8872 	 * Change the old copy object into one that contains
8873 	 * nothing to be deallocated.
8874 	 */
8875 	bzero(copy, sizeof(struct vm_map_copy));
8876 	copy->type = VM_MAP_COPY_KERNEL_BUFFER;
8877 
8878 	/*
8879 	 * Return the new object.
8880 	 */
8881 	return new_copy;
8882 }
8883 
8884 #endif /* XNU_PLATFORM_MacOSX */
8885 
8886 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)8887 vm_map_entry_is_overwritable(
8888 	vm_map_t        dst_map __unused,
8889 	vm_map_entry_t  entry)
8890 {
8891 	if (!(entry->protection & VM_PROT_WRITE)) {
8892 		/* can't overwrite if not writable */
8893 		return FALSE;
8894 	}
8895 #if !__x86_64__
8896 	if (entry->used_for_jit &&
8897 	    vm_map_cs_enforcement(dst_map) &&
8898 	    !dst_map->cs_debugged) {
8899 		/*
8900 		 * Can't overwrite a JIT region while cs_enforced
8901 		 * and not cs_debugged.
8902 		 */
8903 		return FALSE;
8904 	}
8905 
8906 #if __arm64e__
8907 	/* Do not allow overwrite HW assisted TPRO entries */
8908 	if (entry->used_for_tpro) {
8909 		return FALSE;
8910 	}
8911 #endif /* __arm64e__ */
8912 
8913 	if (entry->vme_permanent) {
8914 		if (entry->is_sub_map) {
8915 			/*
8916 			 * We can't tell if the submap contains "permanent"
8917 			 * entries within the range targeted by the caller.
8918 			 * The caller will have to check for that with
8919 			 * vm_map_overwrite_submap_recurse() for example.
8920 			 */
8921 		} else {
8922 			/*
8923 			 * Do not allow overwriting of a "permanent"
8924 			 * entry.
8925 			 */
8926 			DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
8927 			    vm_map_entry_t, entry,
8928 			    vm_map_offset_t, entry->vme_start,
8929 			    vm_map_offset_t, entry->vme_end,
8930 			    vm_prot_t, entry->protection,
8931 			    vm_prot_t, entry->max_protection,
8932 			    int, VME_ALIAS(entry));
8933 			return FALSE;
8934 		}
8935 	}
8936 #endif /* !__x86_64__ */
8937 	return TRUE;
8938 }
8939 
/*
 *	vm_map_overwrite_submap_recurse:
 *
 *	Verify that [dst_addr, dst_addr + dst_size) in "dst_map" can be
 *	overwritten: each entry covering the range must be writable and
 *	overwritable, and the range must be contiguous.  Recurses into
 *	submaps encountered along the way.
 *
 *	Called and returns with "dst_map" unlocked.
 *	Returns KERN_SUCCESS if the whole range may be overwritten.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 *	Verify that the destination is all writeable
	 *	initially.  We have to trunc the destination
	 *	address and round the copy size or we'll end up
	 *	splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	/* Walk every entry covering [dst_addr, dst_end). */
	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			/* Translate the map range into submap offsets. */
			sub_start = VME_OFFSET(entry);

			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			/* Must drop the map lock before recursing. */
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): dst_map is unlocked here, so
			 * "entry" could in principle be stale for this
			 * check — presumably benign since a positive
			 * result only ends the scan; confirm against
			 * callers' locking expectations.
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			/* Re-lock and re-lookup where the submap ended. */
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

/*
 *		our range is contained completely within this map entry
 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
/*
 *		check that range specified is contiguous region
 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 *	Check for permanent objects in the destination.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			/*
			 * A non-internal or truly-shared object can't be
			 * replaced piecemeal once a submap was involved.
			 */
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* Not reached: the loop exits only via return statements above. */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9092 
9093 /*
9094  *	Routine:	vm_map_copy_overwrite
9095  *
9096  *	Description:
9097  *		Copy the memory described by the map copy
9098  *		object (copy; returned by vm_map_copyin) onto
9099  *		the specified destination region (dst_map, dst_addr).
9100  *		The destination must be writeable.
9101  *
9102  *		Unlike vm_map_copyout, this routine actually
9103  *		writes over previously-mapped memory.  If the
9104  *		previous mapping was to a permanent (user-supplied)
9105  *		memory object, it is preserved.
9106  *
9107  *		The attributes (protection and inheritance) of the
9108  *		destination region are preserved.
9109  *
9110  *		If successful, consumes the copy object.
9111  *		Otherwise, the caller is responsible for it.
9112  *
9113  *	Implementation notes:
9114  *		To overwrite aligned temporary virtual memory, it is
9115  *		sufficient to remove the previous mapping and insert
9116  *		the new copy.  This replacement is done either on
9117  *		the whole region (if no permanent virtual memory
9118  *		objects are embedded in the destination region) or
9119  *		in individual map entries.
9120  *
9121  *		To overwrite permanent virtual memory, it is necessary
9122  *		to copy each page, as the external memory management
9123  *		interface currently does not provide any optimizations.
9124  *
9125  *		Unaligned memory also has to be copied.  It is possible
9126  *		to use 'vm_trickery' to copy the aligned data.  This is
9127  *		not done but not hard to implement.
9128  *
9129  *		Once a page of permanent memory has been overwritten,
9130  *		it is impossible to interrupt this function; otherwise,
9131  *		the call would be neither atomic nor location-independent.
9132  *		The kernel-state portion of a user thread must be
9133  *		interruptible.
9134  *
9135  *		It may be expensive to forward all requests that might
9136  *		overwrite permanent memory (vm_write, vm_copy) to
9137  *		uninterruptible kernel threads.  This routine may be
9138  *		called by interruptible threads; however, success is
9139  *		not guaranteed -- if the request cannot be performed
9140  *		atomically and interruptibly, an error indication is
9141  *		returned.
9142  *
9143  *		Callers of this function must call vm_map_copy_require on
9144  *		previously created vm_map_copy_t or pass a newly created
9145  *		one to ensure that it hasn't been forged.
9146  */
9147 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9148 vm_map_copy_overwrite_nested(
9149 	vm_map_t                dst_map,
9150 	vm_map_address_t        dst_addr,
9151 	vm_map_copy_t           copy,
9152 	boolean_t               interruptible,
9153 	pmap_t                  pmap,
9154 	boolean_t               discard_on_success)
9155 {
9156 	vm_map_offset_t         dst_end;
9157 	vm_map_entry_t          tmp_entry;
9158 	vm_map_entry_t          entry;
9159 	kern_return_t           kr;
9160 	boolean_t               aligned = TRUE;
9161 	boolean_t               contains_permanent_objects = FALSE;
9162 	boolean_t               encountered_sub_map = FALSE;
9163 	vm_map_offset_t         base_addr;
9164 	vm_map_size_t           copy_size;
9165 	vm_map_size_t           total_size;
9166 	uint16_t                copy_page_shift;
9167 
9168 	/*
9169 	 *	Check for special kernel buffer allocated
9170 	 *	by new_ipc_kmsg_copyin.
9171 	 */
9172 
9173 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9174 		kr = vm_map_copyout_kernel_buffer(
9175 			dst_map, &dst_addr,
9176 			copy, copy->size, TRUE, discard_on_success);
9177 		return kr;
9178 	}
9179 
9180 	/*
9181 	 *      Only works for entry lists at the moment.  Will
9182 	 *	support page lists later.
9183 	 */
9184 
9185 	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9186 
9187 	if (copy->size == 0) {
9188 		if (discard_on_success) {
9189 			vm_map_copy_discard(copy);
9190 		}
9191 		return KERN_SUCCESS;
9192 	}
9193 
9194 	copy_page_shift = copy->cpy_hdr.page_shift;
9195 
9196 	/*
9197 	 *	Verify that the destination is all writeable
9198 	 *	initially.  We have to trunc the destination
9199 	 *	address and round the copy size or we'll end up
9200 	 *	splitting entries in strange ways.
9201 	 */
9202 
9203 	if (!VM_MAP_PAGE_ALIGNED(copy->size,
9204 	    VM_MAP_PAGE_MASK(dst_map)) ||
9205 	    !VM_MAP_PAGE_ALIGNED(copy->offset,
9206 	    VM_MAP_PAGE_MASK(dst_map)) ||
9207 	    !VM_MAP_PAGE_ALIGNED(dst_addr,
9208 	    VM_MAP_PAGE_MASK(dst_map)) ||
9209 	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9210 		aligned = FALSE;
9211 		dst_end = vm_map_round_page(dst_addr + copy->size,
9212 		    VM_MAP_PAGE_MASK(dst_map));
9213 	} else {
9214 		dst_end = dst_addr + copy->size;
9215 	}
9216 
9217 	vm_map_lock(dst_map);
9218 
9219 	/* LP64todo - remove this check when vm_map_commpage64()
9220 	 * no longer has to stuff in a map_entry for the commpage
9221 	 * above the map's max_offset.
9222 	 */
9223 	if (dst_addr >= dst_map->max_offset) {
9224 		vm_map_unlock(dst_map);
9225 		return KERN_INVALID_ADDRESS;
9226 	}
9227 
9228 start_pass_1:
9229 	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9230 		vm_map_unlock(dst_map);
9231 		return KERN_INVALID_ADDRESS;
9232 	}
9233 	vm_map_clip_start(dst_map,
9234 	    tmp_entry,
9235 	    vm_map_trunc_page(dst_addr,
9236 	    VM_MAP_PAGE_MASK(dst_map)));
9237 	for (entry = tmp_entry;;) {
9238 		vm_map_entry_t  next = entry->vme_next;
9239 
9240 		while (entry->is_sub_map) {
9241 			vm_map_offset_t sub_start;
9242 			vm_map_offset_t sub_end;
9243 			vm_map_offset_t local_end;
9244 
9245 			if (entry->in_transition) {
9246 				/*
9247 				 * Say that we are waiting, and wait for entry.
9248 				 */
9249 				entry->needs_wakeup = TRUE;
9250 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9251 
9252 				goto start_pass_1;
9253 			}
9254 
9255 			local_end = entry->vme_end;
9256 			if (!(entry->needs_copy)) {
9257 				/* if needs_copy we are a COW submap */
9258 				/* in such a case we just replace so */
9259 				/* there is no need for the follow-  */
9260 				/* ing check.                        */
9261 				encountered_sub_map = TRUE;
9262 				sub_start = VME_OFFSET(entry);
9263 
9264 				if (entry->vme_end < dst_end) {
9265 					sub_end = entry->vme_end;
9266 				} else {
9267 					sub_end = dst_end;
9268 				}
9269 				sub_end -= entry->vme_start;
9270 				sub_end += VME_OFFSET(entry);
9271 				vm_map_unlock(dst_map);
9272 
9273 				kr = vm_map_overwrite_submap_recurse(
9274 					VME_SUBMAP(entry),
9275 					sub_start,
9276 					sub_end - sub_start);
9277 				if (kr != KERN_SUCCESS) {
9278 					return kr;
9279 				}
9280 				vm_map_lock(dst_map);
9281 			}
9282 
9283 			if (dst_end <= entry->vme_end) {
9284 				goto start_overwrite;
9285 			}
9286 			if (!vm_map_lookup_entry(dst_map, local_end,
9287 			    &entry)) {
9288 				vm_map_unlock(dst_map);
9289 				return KERN_INVALID_ADDRESS;
9290 			}
9291 			next = entry->vme_next;
9292 		}
9293 
9294 		if (!(entry->protection & VM_PROT_WRITE)) {
9295 			vm_map_unlock(dst_map);
9296 			return KERN_PROTECTION_FAILURE;
9297 		}
9298 
9299 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9300 			vm_map_unlock(dst_map);
9301 			return KERN_PROTECTION_FAILURE;
9302 		}
9303 
9304 		/*
9305 		 *	If the entry is in transition, we must wait
9306 		 *	for it to exit that state.  Anything could happen
9307 		 *	when we unlock the map, so start over.
9308 		 */
9309 		if (entry->in_transition) {
9310 			/*
9311 			 * Say that we are waiting, and wait for entry.
9312 			 */
9313 			entry->needs_wakeup = TRUE;
9314 			vm_map_entry_wait(dst_map, THREAD_UNINT);
9315 
9316 			goto start_pass_1;
9317 		}
9318 
9319 /*
9320  *		our range is contained completely within this map entry
9321  */
9322 		if (dst_end <= entry->vme_end) {
9323 			break;
9324 		}
9325 /*
9326  *		check that range specified is contiguous region
9327  */
9328 		if ((next == vm_map_to_entry(dst_map)) ||
9329 		    (next->vme_start != entry->vme_end)) {
9330 			vm_map_unlock(dst_map);
9331 			return KERN_INVALID_ADDRESS;
9332 		}
9333 
9334 
9335 		/*
9336 		 *	Check for permanent objects in the destination.
9337 		 */
9338 		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9339 		    ((!VME_OBJECT(entry)->internal) ||
9340 		    (VME_OBJECT(entry)->true_share))) {
9341 			contains_permanent_objects = TRUE;
9342 		}
9343 
9344 		entry = next;
9345 	}/* for */
9346 
9347 start_overwrite:
9348 	/*
9349 	 *	If there are permanent objects in the destination, then
9350 	 *	the copy cannot be interrupted.
9351 	 */
9352 
9353 	if (interruptible && contains_permanent_objects) {
9354 		vm_map_unlock(dst_map);
9355 		return KERN_FAILURE;   /* XXX */
9356 	}
9357 
9358 	/*
9359 	 *
9360 	 *	Make a second pass, overwriting the data
9361 	 *	At the beginning of each loop iteration,
9362 	 *	the next entry to be overwritten is "tmp_entry"
9363 	 *	(initially, the value returned from the lookup above),
9364 	 *	and the starting address expected in that entry
9365 	 *	is "start".
9366 	 */
9367 
9368 	total_size = copy->size;
9369 	if (encountered_sub_map) {
9370 		copy_size = 0;
9371 		/* re-calculate tmp_entry since we've had the map */
9372 		/* unlocked */
9373 		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9374 			vm_map_unlock(dst_map);
9375 			return KERN_INVALID_ADDRESS;
9376 		}
9377 	} else {
9378 		copy_size = copy->size;
9379 	}
9380 
9381 	base_addr = dst_addr;
9382 	while (TRUE) {
9383 		/* deconstruct the copy object and do in parts */
9384 		/* only in sub_map, interruptable case */
9385 		vm_map_entry_t  copy_entry;
9386 		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
9387 		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
9388 		int             nentries;
9389 		int             remaining_entries = 0;
9390 		vm_map_offset_t new_offset = 0;
9391 
9392 		for (entry = tmp_entry; copy_size == 0;) {
9393 			vm_map_entry_t  next;
9394 
9395 			next = entry->vme_next;
9396 
9397 			/* tmp_entry and base address are moved along */
9398 			/* each time we encounter a sub-map.  Otherwise */
9399 			/* entry can outpase tmp_entry, and the copy_size */
9400 			/* may reflect the distance between them */
9401 			/* if the current entry is found to be in transition */
9402 			/* we will start over at the beginning or the last */
9403 			/* encounter of a submap as dictated by base_addr */
9404 			/* we will zero copy_size accordingly. */
9405 			if (entry->in_transition) {
9406 				/*
9407 				 * Say that we are waiting, and wait for entry.
9408 				 */
9409 				entry->needs_wakeup = TRUE;
9410 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9411 
9412 				if (!vm_map_lookup_entry(dst_map, base_addr,
9413 				    &tmp_entry)) {
9414 					vm_map_unlock(dst_map);
9415 					return KERN_INVALID_ADDRESS;
9416 				}
9417 				copy_size = 0;
9418 				entry = tmp_entry;
9419 				continue;
9420 			}
9421 			if (entry->is_sub_map) {
9422 				vm_map_offset_t sub_start;
9423 				vm_map_offset_t sub_end;
9424 				vm_map_offset_t local_end;
9425 
9426 				if (entry->needs_copy) {
9427 					/* if this is a COW submap */
9428 					/* just back the range with a */
9429 					/* anonymous entry */
9430 					assert(!entry->vme_permanent);
9431 					if (entry->vme_end < dst_end) {
9432 						sub_end = entry->vme_end;
9433 					} else {
9434 						sub_end = dst_end;
9435 					}
9436 					if (entry->vme_start < base_addr) {
9437 						sub_start = base_addr;
9438 					} else {
9439 						sub_start = entry->vme_start;
9440 					}
9441 					vm_map_clip_end(
9442 						dst_map, entry, sub_end);
9443 					vm_map_clip_start(
9444 						dst_map, entry, sub_start);
9445 					assert(!entry->use_pmap);
9446 					assert(!entry->iokit_acct);
9447 					entry->use_pmap = TRUE;
9448 					vm_map_deallocate(VME_SUBMAP(entry));
9449 					assert(!entry->vme_permanent);
9450 					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9451 					VME_OFFSET_SET(entry, 0);
9452 					entry->is_shared = FALSE;
9453 					entry->needs_copy = FALSE;
9454 					entry->protection = VM_PROT_DEFAULT;
9455 					entry->max_protection = VM_PROT_ALL;
9456 					entry->wired_count = 0;
9457 					entry->user_wired_count = 0;
9458 					if (entry->inheritance
9459 					    == VM_INHERIT_SHARE) {
9460 						entry->inheritance = VM_INHERIT_COPY;
9461 					}
9462 					continue;
9463 				}
9464 				/* first take care of any non-sub_map */
9465 				/* entries to send */
9466 				if (base_addr < entry->vme_start) {
9467 					/* stuff to send */
9468 					copy_size =
9469 					    entry->vme_start - base_addr;
9470 					break;
9471 				}
9472 				sub_start = VME_OFFSET(entry);
9473 
9474 				if (entry->vme_end < dst_end) {
9475 					sub_end = entry->vme_end;
9476 				} else {
9477 					sub_end = dst_end;
9478 				}
9479 				sub_end -= entry->vme_start;
9480 				sub_end += VME_OFFSET(entry);
9481 				local_end = entry->vme_end;
9482 				vm_map_unlock(dst_map);
9483 				copy_size = sub_end - sub_start;
9484 
9485 				/* adjust the copy object */
9486 				if (total_size > copy_size) {
9487 					vm_map_size_t   local_size = 0;
9488 					vm_map_size_t   entry_size;
9489 
9490 					nentries = 1;
9491 					new_offset = copy->offset;
9492 					copy_entry = vm_map_copy_first_entry(copy);
9493 					while (copy_entry !=
9494 					    vm_map_copy_to_entry(copy)) {
9495 						entry_size = copy_entry->vme_end -
9496 						    copy_entry->vme_start;
9497 						if ((local_size < copy_size) &&
9498 						    ((local_size + entry_size)
9499 						    >= copy_size)) {
9500 							vm_map_copy_clip_end(copy,
9501 							    copy_entry,
9502 							    copy_entry->vme_start +
9503 							    (copy_size - local_size));
9504 							entry_size = copy_entry->vme_end -
9505 							    copy_entry->vme_start;
9506 							local_size += entry_size;
9507 							new_offset += entry_size;
9508 						}
9509 						if (local_size >= copy_size) {
9510 							next_copy = copy_entry->vme_next;
9511 							copy_entry->vme_next =
9512 							    vm_map_copy_to_entry(copy);
9513 							previous_prev =
9514 							    copy->cpy_hdr.links.prev;
9515 							copy->cpy_hdr.links.prev = copy_entry;
9516 							copy->size = copy_size;
9517 							remaining_entries =
9518 							    copy->cpy_hdr.nentries;
9519 							remaining_entries -= nentries;
9520 							copy->cpy_hdr.nentries = nentries;
9521 							break;
9522 						} else {
9523 							local_size += entry_size;
9524 							new_offset += entry_size;
9525 							nentries++;
9526 						}
9527 						copy_entry = copy_entry->vme_next;
9528 					}
9529 				}
9530 
9531 				if ((entry->use_pmap) && (pmap == NULL)) {
9532 					kr = vm_map_copy_overwrite_nested(
9533 						VME_SUBMAP(entry),
9534 						sub_start,
9535 						copy,
9536 						interruptible,
9537 						VME_SUBMAP(entry)->pmap,
9538 						TRUE);
9539 				} else if (pmap != NULL) {
9540 					kr = vm_map_copy_overwrite_nested(
9541 						VME_SUBMAP(entry),
9542 						sub_start,
9543 						copy,
9544 						interruptible, pmap,
9545 						TRUE);
9546 				} else {
9547 					kr = vm_map_copy_overwrite_nested(
9548 						VME_SUBMAP(entry),
9549 						sub_start,
9550 						copy,
9551 						interruptible,
9552 						dst_map->pmap,
9553 						TRUE);
9554 				}
9555 				if (kr != KERN_SUCCESS) {
9556 					if (next_copy != NULL) {
9557 						copy->cpy_hdr.nentries +=
9558 						    remaining_entries;
9559 						copy->cpy_hdr.links.prev->vme_next =
9560 						    next_copy;
9561 						copy->cpy_hdr.links.prev
9562 						        = previous_prev;
9563 						copy->size = total_size;
9564 					}
9565 					return kr;
9566 				}
9567 				if (dst_end <= local_end) {
9568 					return KERN_SUCCESS;
9569 				}
9570 				/* otherwise copy no longer exists, it was */
9571 				/* destroyed after successful copy_overwrite */
9572 				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9573 				copy->offset = new_offset;
9574 				copy->cpy_hdr.page_shift = copy_page_shift;
9575 
9576 				total_size -= copy_size;
9577 				copy_size = 0;
9578 				/* put back remainder of copy in container */
9579 				if (next_copy != NULL) {
9580 					copy->cpy_hdr.nentries = remaining_entries;
9581 					copy->cpy_hdr.links.next = next_copy;
9582 					copy->cpy_hdr.links.prev = previous_prev;
9583 					copy->size = total_size;
9584 					next_copy->vme_prev =
9585 					    vm_map_copy_to_entry(copy);
9586 					next_copy = NULL;
9587 				}
9588 				base_addr = local_end;
9589 				vm_map_lock(dst_map);
9590 				if (!vm_map_lookup_entry(dst_map,
9591 				    local_end, &tmp_entry)) {
9592 					vm_map_unlock(dst_map);
9593 					return KERN_INVALID_ADDRESS;
9594 				}
9595 				entry = tmp_entry;
9596 				continue;
9597 			}
9598 			if (dst_end <= entry->vme_end) {
9599 				copy_size = dst_end - base_addr;
9600 				break;
9601 			}
9602 
9603 			if ((next == vm_map_to_entry(dst_map)) ||
9604 			    (next->vme_start != entry->vme_end)) {
9605 				vm_map_unlock(dst_map);
9606 				return KERN_INVALID_ADDRESS;
9607 			}
9608 
9609 			entry = next;
9610 		}/* for */
9611 
9612 		next_copy = NULL;
9613 		nentries = 1;
9614 
9615 		/* adjust the copy object */
9616 		if (total_size > copy_size) {
9617 			vm_map_size_t   local_size = 0;
9618 			vm_map_size_t   entry_size;
9619 
9620 			new_offset = copy->offset;
9621 			copy_entry = vm_map_copy_first_entry(copy);
9622 			while (copy_entry != vm_map_copy_to_entry(copy)) {
9623 				entry_size = copy_entry->vme_end -
9624 				    copy_entry->vme_start;
9625 				if ((local_size < copy_size) &&
9626 				    ((local_size + entry_size)
9627 				    >= copy_size)) {
9628 					vm_map_copy_clip_end(copy, copy_entry,
9629 					    copy_entry->vme_start +
9630 					    (copy_size - local_size));
9631 					entry_size = copy_entry->vme_end -
9632 					    copy_entry->vme_start;
9633 					local_size += entry_size;
9634 					new_offset += entry_size;
9635 				}
9636 				if (local_size >= copy_size) {
9637 					next_copy = copy_entry->vme_next;
9638 					copy_entry->vme_next =
9639 					    vm_map_copy_to_entry(copy);
9640 					previous_prev =
9641 					    copy->cpy_hdr.links.prev;
9642 					copy->cpy_hdr.links.prev = copy_entry;
9643 					copy->size = copy_size;
9644 					remaining_entries =
9645 					    copy->cpy_hdr.nentries;
9646 					remaining_entries -= nentries;
9647 					copy->cpy_hdr.nentries = nentries;
9648 					break;
9649 				} else {
9650 					local_size += entry_size;
9651 					new_offset += entry_size;
9652 					nentries++;
9653 				}
9654 				copy_entry = copy_entry->vme_next;
9655 			}
9656 		}
9657 
9658 		if (aligned) {
9659 			pmap_t  local_pmap;
9660 
9661 			if (pmap) {
9662 				local_pmap = pmap;
9663 			} else {
9664 				local_pmap = dst_map->pmap;
9665 			}
9666 
9667 			if ((kr =  vm_map_copy_overwrite_aligned(
9668 				    dst_map, tmp_entry, copy,
9669 				    base_addr, local_pmap)) != KERN_SUCCESS) {
9670 				if (next_copy != NULL) {
9671 					copy->cpy_hdr.nentries +=
9672 					    remaining_entries;
9673 					copy->cpy_hdr.links.prev->vme_next =
9674 					    next_copy;
9675 					copy->cpy_hdr.links.prev =
9676 					    previous_prev;
9677 					copy->size += copy_size;
9678 				}
9679 				return kr;
9680 			}
9681 			vm_map_unlock(dst_map);
9682 		} else {
9683 			/*
9684 			 * Performance gain:
9685 			 *
9686 			 * if the copy and dst address are misaligned but the same
9687 			 * offset within the page we can copy_not_aligned the
9688 			 * misaligned parts and copy aligned the rest.  If they are
9689 			 * aligned but len is unaligned we simply need to copy
9690 			 * the end bit unaligned.  We'll need to split the misaligned
9691 			 * bits of the region in this case !
9692 			 */
9693 			/* ALWAYS UNLOCKS THE dst_map MAP */
9694 			kr = vm_map_copy_overwrite_unaligned(
9695 				dst_map,
9696 				tmp_entry,
9697 				copy,
9698 				base_addr,
9699 				discard_on_success);
9700 			if (kr != KERN_SUCCESS) {
9701 				if (next_copy != NULL) {
9702 					copy->cpy_hdr.nentries +=
9703 					    remaining_entries;
9704 					copy->cpy_hdr.links.prev->vme_next =
9705 					    next_copy;
9706 					copy->cpy_hdr.links.prev =
9707 					    previous_prev;
9708 					copy->size += copy_size;
9709 				}
9710 				return kr;
9711 			}
9712 		}
9713 		total_size -= copy_size;
9714 		if (total_size == 0) {
9715 			break;
9716 		}
9717 		base_addr += copy_size;
9718 		copy_size = 0;
9719 		copy->offset = new_offset;
9720 		if (next_copy != NULL) {
9721 			copy->cpy_hdr.nentries = remaining_entries;
9722 			copy->cpy_hdr.links.next = next_copy;
9723 			copy->cpy_hdr.links.prev = previous_prev;
9724 			next_copy->vme_prev = vm_map_copy_to_entry(copy);
9725 			copy->size = total_size;
9726 		}
9727 		vm_map_lock(dst_map);
9728 		while (TRUE) {
9729 			if (!vm_map_lookup_entry(dst_map,
9730 			    base_addr, &tmp_entry)) {
9731 				vm_map_unlock(dst_map);
9732 				return KERN_INVALID_ADDRESS;
9733 			}
9734 			if (tmp_entry->in_transition) {
9735 				entry->needs_wakeup = TRUE;
9736 				vm_map_entry_wait(dst_map, THREAD_UNINT);
9737 			} else {
9738 				break;
9739 			}
9740 		}
9741 		vm_map_clip_start(dst_map,
9742 		    tmp_entry,
9743 		    vm_map_trunc_page(base_addr,
9744 		    VM_MAP_PAGE_MASK(dst_map)));
9745 
9746 		entry = tmp_entry;
9747 	} /* while */
9748 
9749 	/*
9750 	 *	Throw away the vm_map_copy object
9751 	 */
9752 	if (discard_on_success) {
9753 		vm_map_copy_discard(copy);
9754 	}
9755 
9756 	return KERN_SUCCESS;
9757 }/* vm_map_copy_overwrite */
9758 
9759 static inline kern_return_t
vm_map_copy_addr_size_sanitize(vm_map_t map,vm_map_offset_ut addr_u,vm_map_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * addr,vm_map_offset_t * end,vm_map_size_t * size)9760 vm_map_copy_addr_size_sanitize(
9761 	vm_map_t                map,
9762 	vm_map_offset_ut        addr_u,
9763 	vm_map_size_ut          size_u,
9764 	vm_sanitize_caller_t    vm_sanitize_caller,
9765 	vm_map_offset_t        *addr,
9766 	vm_map_offset_t        *end,
9767 	vm_map_size_t          *size)
9768 {
9769 	return vm_sanitize_addr_size(addr_u, size_u,
9770 	           vm_sanitize_caller, map,
9771 	           VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
9772 	           addr, end, size);
9773 }
9774 
/*
 *	Routine:	vm_map_copy_overwrite
 *
 *	Overwrite the memory at "dst_addr_u" in "dst_map" with the contents
 *	of the "copy" map (of "copy_size_u" bytes) without deallocating the
 *	destination mappings.
 *
 *	When profitable, the copy is split into a small unaligned "head",
 *	a page-aligned middle and a small unaligned "tail", so that the bulk
 *	of the data can go through the aligned path of
 *	vm_map_copy_overwrite_nested(); otherwise the whole range is handed
 *	to the nested routine in one "blunt" call.
 *
 *	On KERN_SUCCESS the "copy" object (and any head/tail pieces) has been
 *	discarded; on failure the original "copy" is re-assembled so the
 *	caller still owns it intact.
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t                dst_map,
	vm_map_offset_ut        dst_addr_u,
	vm_map_copy_t           copy,
	vm_map_size_ut          copy_size_u,
	boolean_t               interruptible)
{
	vm_map_offset_t dst_addr, dst_end;
	vm_map_size_t   copy_size;
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 *	Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_copy_addr_size_sanitize(
		dst_map,
		dst_addr_u,
		copy_size_u,
		VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
		&dst_addr,
		&dst_end,
		&copy_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		/* single un-optimized pass over the entire range */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    dst_addr,
		    copy,
		    interruptible,
		    (pmap_t) NULL,
		    TRUE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
		}
		return kr;
	}

	/*
	 * The optimized (split) path needs native-or-larger page sizes
	 * on both the copy and the destination map.
	 */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): this "true" branch looks unreachable -- the check
	 * just above already takes the blunt_copy path whenever
	 * VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT.  Kept as-is; confirm
	 * before simplifying.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		/* use the coarsest page granularity of map, copy and kernel */
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		/* head runs from dst_addr up to the next page boundary */
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		/* the head never spans past the first entry of "copy" */
		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		/* move the clipped first entry over to "head_copy" */
		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		/* move the clipped last entry over to "tail_copy" */
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
		goto done;
	}

	if (tail_size) {
		/* unaligned copy of the trailing bytes */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
		}
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map.
		 */
		if (head_copy) {
			/* put the head entry back at the front of "copy" */
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			/* put the tail entry back at the end of "copy" */
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10091 
10092 
10093 /*
10094  *	Routine: vm_map_copy_overwrite_unaligned	[internal use only]
10095  *
 *	Description:
10097  *	Physically copy unaligned data
10098  *
10099  *	Implementation:
 *	Unaligned parts of pages have to be physically copied.  We use
 *	a modified form of vm_fault_copy (which understands non-aligned
 *	page offsets and sizes) to do the copy.  We attempt to copy as
 *	much memory in one go as possible, however vm_fault_copy copies
10104  *	within 1 memory object so we have to find the smaller of "amount left"
10105  *	"source object data size" and "target object data size".  With
10106  *	unaligned data we don't need to split regions, therefore the source
10107  *	(copy) object should be one map entry, the target range may be split
10108  *	over multiple map entries however.  In any event we are pessimistic
10109  *	about these assumptions.
10110  *
10111  *	Callers of this function must call vm_map_copy_require on
10112  *	previously created vm_map_copy_t or pass a newly created
10113  *	one to ensure that it hasn't been forged.
10114  *
10115  *	Assumptions:
10116  *	dst_map is locked on entry and is return locked on success,
10117  *	unlocked on error.
10118  */
10119 
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,       /* locked (write) on entry */
	vm_map_entry_t  entry,         /* destination entry containing "start" */
	vm_map_copy_t   copy,          /* source data; entries consumed if discard_on_success */
	vm_map_offset_t start,         /* destination address being overwritten */
	boolean_t       discard_on_success)
{
	vm_map_entry_t          copy_entry;
	vm_map_entry_t          copy_entry_next;
	vm_map_version_t        version;
	vm_object_t             dst_object;
	vm_object_offset_t      dst_offset;
	vm_object_offset_t      src_offset;
	vm_object_offset_t      entry_offset;
	vm_map_offset_t         entry_end;
	vm_map_size_t           src_size,
	    dst_size,
	    copy_size,
	    amount_left;
	kern_return_t           kr = KERN_SUCCESS;


	copy_entry = vm_map_copy_first_entry(copy);

	/* we only ever need read access below; fault copies do the writing */
	vm_map_lock_write_to_read(dst_map);

	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
/*
 *	unaligned so we never clipped this entry, we need the offset into
 *	the vm_object not just the data.
 */
	while (amount_left > 0) {
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		/*
		 *	Check protection again
		 */
		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}
		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 *	If the entry is in transition, we must wait
		 *	for it to exit that state.  Anything could happen
		 *	when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto RetryLookup;
		}

		/* offset of "start" within the destination entry */
		dst_offset = start - entry->vme_start;

		dst_size = entry->vme_end - start;

		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		if (dst_size < src_size) {
/*
 *			we can only copy dst_size bytes before
 *			we have to get the next destination entry
 */
			copy_size = dst_size;
		} else {
/*
 *			we can only copy src_size bytes before
 *			we have to get the next source copy entry
 */
			copy_size = src_size;
		}

		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
/*
 *		Entry needs copy, create a shadow object for
 *		Copy on write region.
 */
		if (entry->needs_copy) {
			if (vm_map_lock_read_to_write(dst_map)) {
				/* lost the write lock race: re-lookup */
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start),
			    vm_map_always_shadow(dst_map));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
/*
 *		unlike with the virtual (aligned) copy we're going
 *		to fault on it therefore we need a target object.
 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				/* lost the write lock race: re-lookup */
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object, false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
/*
 *		Take an object reference and unlock map. The "entry" may
 *		disappear or change when the map is unlocked.
 */
		vm_object_reference(dst_object);
		version.main_timestamp = dst_map->timestamp;
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
/*
 *		Copy as much as possible in one pass
 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		/* vm_fault_copy() reports the amount actually copied */
		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
/*
 *		Release the object reference
 */
		vm_object_deallocate(dst_object);
/*
 *		If a hard error occurred, return it now
 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
/*
 *			all done with this copy entry, dispose.
 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
/*
 *				not finished copying but run out of source
 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			src_offset = 0;
		}

		/*
		 * NOTE(review): this success return happens while the map is
		 * unlocked, although the block comment above this routine says
		 * it returns locked on success -- confirm which is intended.
		 */
		if (amount_left == 0) {
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			if (start == entry_end) {
/*
 *				destination region is split.  Use the version
 *				information to avoid a lookup in the normal
 *				case.
 */
				entry = entry->vme_next;
/*
 *				should be contiguous. Fail if we encounter
 *				a hole in the destination.
 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
/*
 *			Map version check failed.
 *			we must lookup the entry because somebody
 *			might have changed the map behind our backs.
 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
10347 
10348 /*
10349  *	Routine: vm_map_copy_overwrite_aligned	[internal use only]
10350  *
10351  *	Description:
10352  *	Does all the vm_trickery possible for whole pages.
10353  *
10354  *	Implementation:
10355  *
10356  *	If there are no permanent objects in the destination,
10357  *	and the source and destination map entry zones match,
10358  *	and the destination map entry is not shared,
10359  *	then the map entries can be deleted and replaced
10360  *	with those from the copy.  The following code is the
10361  *	basic idea of what to do, but there are lots of annoying
10362  *	little details about getting protection and inheritance
10363  *	right.  Should add protection, inheritance, and sharing checks
10364  *	to the above pass and make sure that no wiring is involved.
10365  *
10366  *	Callers of this function must call vm_map_copy_require on
10367  *	previously created vm_map_copy_t or pass a newly created
10368  *	one to ensure that it hasn't been forged.
10369  */
10370 
/*
 * Diagnostic counters for vm_map_copy_overwrite_aligned().  Presumably
 * bumped when the aligned fast path declines a source object (not
 * internal, not symmetric copy strategy, or too large) -- the increment
 * sites are not in this chunk; TODO(review) confirm at the use sites.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10374 
10375 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10376 vm_map_copy_overwrite_aligned(
10377 	vm_map_t        dst_map,
10378 	vm_map_entry_t  tmp_entry,
10379 	vm_map_copy_t   copy,
10380 	vm_map_offset_t start,
10381 	__unused pmap_t pmap)
10382 {
10383 	vm_object_t     object;
10384 	vm_map_entry_t  copy_entry;
10385 	vm_map_size_t   copy_size;
10386 	vm_map_size_t   size;
10387 	vm_map_entry_t  entry;
10388 
10389 	while ((copy_entry = vm_map_copy_first_entry(copy))
10390 	    != vm_map_copy_to_entry(copy)) {
10391 		copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10392 
10393 		entry = tmp_entry;
10394 		if (entry->is_sub_map) {
10395 			/* unnested when clipped earlier */
10396 			assert(!entry->use_pmap);
10397 		}
10398 		if (entry == vm_map_to_entry(dst_map)) {
10399 			vm_map_unlock(dst_map);
10400 			return KERN_INVALID_ADDRESS;
10401 		}
10402 		size = (entry->vme_end - entry->vme_start);
10403 		/*
10404 		 *	Make sure that no holes popped up in the
10405 		 *	address map, and that the protection is
10406 		 *	still valid, in case the map was unlocked
10407 		 *	earlier.
10408 		 */
10409 
10410 		if ((entry->vme_start != start) || ((entry->is_sub_map)
10411 		    && !entry->needs_copy)) {
10412 			vm_map_unlock(dst_map);
10413 			return KERN_INVALID_ADDRESS;
10414 		}
10415 		assert(entry != vm_map_to_entry(dst_map));
10416 
10417 		/*
10418 		 *	Check protection again
10419 		 */
10420 
10421 		if (!(entry->protection & VM_PROT_WRITE)) {
10422 			vm_map_unlock(dst_map);
10423 			return KERN_PROTECTION_FAILURE;
10424 		}
10425 
10426 		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10427 			vm_map_unlock(dst_map);
10428 			return KERN_PROTECTION_FAILURE;
10429 		}
10430 
10431 		/*
10432 		 *	If the entry is in transition, we must wait
10433 		 *	for it to exit that state.  Anything could happen
10434 		 *	when we unlock the map, so start over.
10435 		 */
10436 		if (entry->in_transition) {
10437 			/*
10438 			 * Say that we are waiting, and wait for entry.
10439 			 */
10440 			entry->needs_wakeup = TRUE;
10441 			vm_map_entry_wait(dst_map, THREAD_UNINT);
10442 
10443 			goto RetryLookup;
10444 		}
10445 
10446 		/*
10447 		 *	Adjust to source size first
10448 		 */
10449 
10450 		if (copy_size < size) {
10451 			if (entry->map_aligned &&
10452 			    !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10453 			    VM_MAP_PAGE_MASK(dst_map))) {
10454 				/* no longer map-aligned */
10455 				entry->map_aligned = FALSE;
10456 			}
10457 			vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10458 			size = copy_size;
10459 		}
10460 
10461 		/*
10462 		 *	Adjust to destination size
10463 		 */
10464 
10465 		if (size < copy_size) {
10466 			vm_map_copy_clip_end(copy, copy_entry,
10467 			    copy_entry->vme_start + size);
10468 			copy_size = size;
10469 		}
10470 
10471 		assert((entry->vme_end - entry->vme_start) == size);
10472 		assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10473 		assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10474 
10475 		/*
10476 		 *	If the destination contains temporary unshared memory,
10477 		 *	we can perform the copy by throwing it away and
10478 		 *	installing the source data.
10479 		 *
10480 		 *	Exceptions for mappings with special semantics:
10481 		 *	+ "permanent" entries,
10482 		 *	+ JIT regions,
10483 		 *	+ TPRO regions,
10484 		 *      + pmap-specific protection policies,
10485 		 *	+ VM objects with COPY_NONE copy strategy.
10486 		 */
10487 
10488 		object = VME_OBJECT(entry);
10489 		if ((!entry->is_shared &&
10490 		    !entry->vme_permanent &&
10491 		    !entry->used_for_jit &&
10492 #if __arm64e__
10493 		    !entry->used_for_tpro &&
10494 #endif /* __arm64e__ */
10495 		    !(entry->protection & VM_PROT_EXECUTE) &&
10496 		    !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10497 		    ((object == VM_OBJECT_NULL) ||
10498 		    (object->internal &&
10499 		    !object->true_share &&
10500 		    object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10501 		    entry->needs_copy) {
10502 			vm_object_t     old_object = VME_OBJECT(entry);
10503 			vm_object_offset_t      old_offset = VME_OFFSET(entry);
10504 			vm_object_offset_t      offset;
10505 
10506 			/*
10507 			 * Ensure that the source and destination aren't
10508 			 * identical
10509 			 */
10510 			if (old_object == VME_OBJECT(copy_entry) &&
10511 			    old_offset == VME_OFFSET(copy_entry)) {
10512 				vm_map_copy_entry_unlink(copy, copy_entry);
10513 				vm_map_copy_entry_dispose(copy_entry);
10514 
10515 				if (old_object != VM_OBJECT_NULL) {
10516 					vm_object_deallocate(old_object);
10517 				}
10518 
10519 				start = tmp_entry->vme_end;
10520 				tmp_entry = tmp_entry->vme_next;
10521 				continue;
10522 			}
10523 
10524 #if XNU_TARGET_OS_OSX
10525 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10526 #define __TRADEOFF1_COPY_SIZE (128 * 1024)      /* 128 KB */
10527 			if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10528 			    VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10529 			    copy_size <= __TRADEOFF1_COPY_SIZE) {
10530 				/*
10531 				 * Virtual vs. Physical copy tradeoff #1.
10532 				 *
10533 				 * Copying only a few pages out of a large
10534 				 * object:  do a physical copy instead of
10535 				 * a virtual copy, to avoid possibly keeping
10536 				 * the entire large object alive because of
10537 				 * those few copy-on-write pages.
10538 				 */
10539 				vm_map_copy_overwrite_aligned_src_large++;
10540 				goto slow_copy;
10541 			}
10542 #endif /* XNU_TARGET_OS_OSX */
10543 
10544 			if ((dst_map->pmap != kernel_pmap) &&
10545 			    (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10546 			    (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10547 				vm_object_t new_object, new_shadow;
10548 
10549 				/*
10550 				 * We're about to map something over a mapping
10551 				 * established by malloc()...
10552 				 */
10553 				new_object = VME_OBJECT(copy_entry);
10554 				if (new_object != VM_OBJECT_NULL) {
10555 					vm_object_lock_shared(new_object);
10556 				}
10557 				while (new_object != VM_OBJECT_NULL &&
10558 #if XNU_TARGET_OS_OSX
10559 				    !new_object->true_share &&
10560 				    new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10561 #endif /* XNU_TARGET_OS_OSX */
10562 				    new_object->internal) {
10563 					new_shadow = new_object->shadow;
10564 					if (new_shadow == VM_OBJECT_NULL) {
10565 						break;
10566 					}
10567 					vm_object_lock_shared(new_shadow);
10568 					vm_object_unlock(new_object);
10569 					new_object = new_shadow;
10570 				}
10571 				if (new_object != VM_OBJECT_NULL) {
10572 					if (!new_object->internal) {
10573 						/*
10574 						 * The new mapping is backed
10575 						 * by an external object.  We
10576 						 * don't want malloc'ed memory
10577 						 * to be replaced with such a
10578 						 * non-anonymous mapping, so
10579 						 * let's go off the optimized
10580 						 * path...
10581 						 */
10582 						vm_map_copy_overwrite_aligned_src_not_internal++;
10583 						vm_object_unlock(new_object);
10584 						goto slow_copy;
10585 					}
10586 #if XNU_TARGET_OS_OSX
10587 					if (new_object->true_share ||
10588 					    new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10589 						/*
10590 						 * Same if there's a "true_share"
10591 						 * object in the shadow chain, or
10592 						 * an object with a non-default
10593 						 * (SYMMETRIC) copy strategy.
10594 						 */
10595 						vm_map_copy_overwrite_aligned_src_not_symmetric++;
10596 						vm_object_unlock(new_object);
10597 						goto slow_copy;
10598 					}
10599 #endif /* XNU_TARGET_OS_OSX */
10600 					vm_object_unlock(new_object);
10601 				}
10602 				/*
10603 				 * The new mapping is still backed by
10604 				 * anonymous (internal) memory, so it's
10605 				 * OK to substitute it for the original
10606 				 * malloc() mapping.
10607 				 */
10608 			}
10609 
10610 			if (old_object != VM_OBJECT_NULL) {
10611 				assert(!entry->vme_permanent);
10612 				if (entry->is_sub_map) {
10613 					if (entry->use_pmap) {
10614 #ifndef NO_NESTED_PMAP
10615 						pmap_unnest(dst_map->pmap,
10616 						    (addr64_t)entry->vme_start,
10617 						    entry->vme_end - entry->vme_start);
10618 #endif  /* NO_NESTED_PMAP */
10619 						if (dst_map->mapped_in_other_pmaps) {
10620 							/* clean up parent */
10621 							/* map/maps */
10622 							vm_map_submap_pmap_clean(
10623 								dst_map, entry->vme_start,
10624 								entry->vme_end,
10625 								VME_SUBMAP(entry),
10626 								VME_OFFSET(entry));
10627 						}
10628 					} else {
10629 						vm_map_submap_pmap_clean(
10630 							dst_map, entry->vme_start,
10631 							entry->vme_end,
10632 							VME_SUBMAP(entry),
10633 							VME_OFFSET(entry));
10634 					}
10635 					vm_map_deallocate(VME_SUBMAP(entry));
10636 				} else {
10637 					if (dst_map->mapped_in_other_pmaps) {
10638 						vm_object_pmap_protect_options(
10639 							VME_OBJECT(entry),
10640 							VME_OFFSET(entry),
10641 							entry->vme_end
10642 							- entry->vme_start,
10643 							PMAP_NULL,
10644 							PAGE_SIZE,
10645 							entry->vme_start,
10646 							VM_PROT_NONE,
10647 							PMAP_OPTIONS_REMOVE);
10648 					} else {
10649 						pmap_remove_options(
10650 							dst_map->pmap,
10651 							(addr64_t)(entry->vme_start),
10652 							(addr64_t)(entry->vme_end),
10653 							PMAP_OPTIONS_REMOVE);
10654 					}
10655 					vm_object_deallocate(old_object);
10656 				}
10657 			}
10658 
10659 			if (entry->iokit_acct) {
10660 				/* keep using iokit accounting */
10661 				entry->use_pmap = FALSE;
10662 			} else {
10663 				/* use pmap accounting */
10664 				entry->use_pmap = TRUE;
10665 			}
10666 			assert(!entry->vme_permanent);
10667 			VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10668 			object = VME_OBJECT(entry);
10669 			entry->needs_copy = copy_entry->needs_copy;
10670 			entry->wired_count = 0;
10671 			entry->user_wired_count = 0;
10672 			offset = VME_OFFSET(copy_entry);
10673 			VME_OFFSET_SET(entry, offset);
10674 
10675 			vm_map_copy_entry_unlink(copy, copy_entry);
10676 			vm_map_copy_entry_dispose(copy_entry);
10677 
10678 			/*
10679 			 * we could try to push pages into the pmap at this point, BUT
10680 			 * this optimization only saved on average 2 us per page if ALL
10681 			 * the pages in the source were currently mapped
10682 			 * and ALL the pages in the dest were touched, if there were fewer
10683 			 * than 2/3 of the pages touched, this optimization actually cost more cycles
10684 			 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10685 			 */
10686 
10687 			/*
10688 			 *	Set up for the next iteration.  The map
10689 			 *	has not been unlocked, so the next
10690 			 *	address should be at the end of this
10691 			 *	entry, and the next map entry should be
10692 			 *	the one following it.
10693 			 */
10694 
10695 			start = tmp_entry->vme_end;
10696 			tmp_entry = tmp_entry->vme_next;
10697 		} else {
10698 			vm_map_version_t        version;
10699 			vm_object_t             dst_object;
10700 			vm_object_offset_t      dst_offset;
10701 			kern_return_t           r;
10702 
10703 slow_copy:
10704 			if (entry->needs_copy) {
10705 				VME_OBJECT_SHADOW(entry,
10706 				    (entry->vme_end -
10707 				    entry->vme_start),
10708 				    vm_map_always_shadow(dst_map));
10709 				entry->needs_copy = FALSE;
10710 			}
10711 
10712 			dst_object = VME_OBJECT(entry);
10713 			dst_offset = VME_OFFSET(entry);
10714 
10715 			/*
10716 			 *	Take an object reference, and record
10717 			 *	the map version information so that the
10718 			 *	map can be safely unlocked.
10719 			 */
10720 
10721 			if (dst_object == VM_OBJECT_NULL) {
10722 				/*
10723 				 * We would usually have just taken the
10724 				 * optimized path above if the destination
10725 				 * object has not been allocated yet.  But we
10726 				 * now disable that optimization if the copy
10727 				 * entry's object is not backed by anonymous
10728 				 * memory to avoid replacing malloc'ed
10729 				 * (i.e. re-usable) anonymous memory with a
10730 				 * not-so-anonymous mapping.
10731 				 * So we have to handle this case here and
10732 				 * allocate a new VM object for this map entry.
10733 				 */
10734 				dst_object = vm_object_allocate(
10735 					entry->vme_end - entry->vme_start);
10736 				dst_offset = 0;
10737 				VME_OBJECT_SET(entry, dst_object, false, 0);
10738 				VME_OFFSET_SET(entry, dst_offset);
10739 				assert(entry->use_pmap);
10740 			}
10741 
10742 			vm_object_reference(dst_object);
10743 
10744 			/* account for unlock bumping up timestamp */
10745 			version.main_timestamp = dst_map->timestamp + 1;
10746 
10747 			vm_map_unlock(dst_map);
10748 
10749 			/*
10750 			 *	Copy as much as possible in one pass
10751 			 */
10752 
10753 			copy_size = size;
10754 			r = vm_fault_copy(
10755 				VME_OBJECT(copy_entry),
10756 				VME_OFFSET(copy_entry),
10757 				&copy_size,
10758 				dst_object,
10759 				dst_offset,
10760 				dst_map,
10761 				&version,
10762 				THREAD_UNINT );
10763 
10764 			/*
10765 			 *	Release the object reference
10766 			 */
10767 
10768 			vm_object_deallocate(dst_object);
10769 
10770 			/*
10771 			 *	If a hard error occurred, return it now
10772 			 */
10773 
10774 			if (r != KERN_SUCCESS) {
10775 				return r;
10776 			}
10777 
10778 			if (copy_size != 0) {
10779 				/*
10780 				 *	Dispose of the copied region
10781 				 */
10782 
10783 				vm_map_copy_clip_end(copy, copy_entry,
10784 				    copy_entry->vme_start + copy_size);
10785 				vm_map_copy_entry_unlink(copy, copy_entry);
10786 				vm_object_deallocate(VME_OBJECT(copy_entry));
10787 				vm_map_copy_entry_dispose(copy_entry);
10788 			}
10789 
10790 			/*
10791 			 *	Pick up in the destination map where we left off.
10792 			 *
10793 			 *	Use the version information to avoid a lookup
10794 			 *	in the normal case.
10795 			 */
10796 
10797 			start += copy_size;
10798 			vm_map_lock(dst_map);
10799 			if (version.main_timestamp == dst_map->timestamp &&
10800 			    copy_size != 0) {
10801 				/* We can safely use saved tmp_entry value */
10802 
10803 				if (tmp_entry->map_aligned &&
10804 				    !VM_MAP_PAGE_ALIGNED(
10805 					    start,
10806 					    VM_MAP_PAGE_MASK(dst_map))) {
10807 					/* no longer map-aligned */
10808 					tmp_entry->map_aligned = FALSE;
10809 				}
10810 				vm_map_clip_end(dst_map, tmp_entry, start);
10811 				tmp_entry = tmp_entry->vme_next;
10812 			} else {
10813 				/* Must do lookup of tmp_entry */
10814 
10815 RetryLookup:
10816 				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10817 					vm_map_unlock(dst_map);
10818 					return KERN_INVALID_ADDRESS;
10819 				}
10820 				if (tmp_entry->map_aligned &&
10821 				    !VM_MAP_PAGE_ALIGNED(
10822 					    start,
10823 					    VM_MAP_PAGE_MASK(dst_map))) {
10824 					/* no longer map-aligned */
10825 					tmp_entry->map_aligned = FALSE;
10826 				}
10827 				vm_map_clip_start(dst_map, tmp_entry, start);
10828 			}
10829 		}
10830 	}/* while */
10831 
10832 	return KERN_SUCCESS;
10833 }/* vm_map_copy_overwrite_aligned */
10834 
10835 /*
10836  *	Routine: vm_map_copyin_kernel_buffer [internal use only]
10837  *
10838  *	Description:
10839  *		Copy in data to a kernel buffer from space in the
10840  *		source map. The original space may be optionally
10841  *		deallocated.
10842  *
10843  *		If successful, returns a new copy object.
10844  */
10845 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10846 vm_map_copyin_kernel_buffer(
10847 	vm_map_t        src_map,
10848 	vm_map_offset_t src_addr,
10849 	vm_map_size_t   len,
10850 	boolean_t       src_destroy,
10851 	vm_map_copy_t   *copy_result)
10852 {
10853 	kern_return_t kr;
10854 	vm_map_copy_t copy;
10855 	void *kdata;
10856 
10857 	if (len > msg_ool_size_small) {
10858 		return KERN_INVALID_ARGUMENT;
10859 	}
10860 
10861 	kdata = kalloc_data(len, Z_WAITOK);
10862 	if (kdata == NULL) {
10863 		return KERN_RESOURCE_SHORTAGE;
10864 	}
10865 	kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
10866 	if (kr != KERN_SUCCESS) {
10867 		kfree_data(kdata, len);
10868 		return kr;
10869 	}
10870 
10871 	copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
10872 	copy->cpy_kdata = kdata;
10873 	copy->size = len;
10874 	copy->offset = 0;
10875 
10876 	if (src_destroy) {
10877 		vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10878 
10879 		if (src_map == kernel_map) {
10880 			flags |= VM_MAP_REMOVE_KUNWIRE;
10881 		}
10882 
10883 		(void)vm_map_remove_guard(src_map,
10884 		    vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10885 		    vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10886 		    flags, KMEM_GUARD_NONE);
10887 	}
10888 
10889 	*copy_result = copy;
10890 	return KERN_SUCCESS;
10891 }
10892 
10893 /*
10894  *	Routine: vm_map_copyout_kernel_buffer	[internal use only]
10895  *
10896  *	Description:
10897  *		Copy out data from a kernel buffer into space in the
 *		destination map. The space may be optionally dynamically
10899  *		allocated.
10900  *
10901  *		If successful, consumes the copy object.
10902  *		Otherwise, the caller is responsible for it.
10903  *
10904  *		Callers of this function must call vm_map_copy_require on
10905  *		previously created vm_map_copy_t or pass a newly created
10906  *		one to ensure that it hasn't been forged.
10907  */
/* debug counter: copyout() failures seen on the cross-map path below */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure:
	 * kernel-buffer copies are always small and carry a zero offset
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		/* direct kernel_map allocations into the data range */
		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		/* tell KASAN about the freshly mapped kernel range */
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the caller's address space and drop our map ref */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11022 
11023 /*
11024  *	Routine:	vm_map_copy_insert      [internal use only]
11025  *
11026  *	Description:
11027  *		Link a copy chain ("copy") into a map at the
11028  *		specified location (after "where").
11029  *
11030  *		Callers of this function must call vm_map_copy_require on
11031  *		previously created vm_map_copy_t or pass a newly created
11032  *		one to ensure that it hasn't been forged.
11033  *	Side effects:
11034  *		The copy chain is destroyed.
11035  */
11036 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11037 vm_map_copy_insert(
11038 	vm_map_t        map,
11039 	vm_map_entry_t  after_where,
11040 	vm_map_copy_t   copy)
11041 {
11042 	vm_map_entry_t  entry;
11043 
11044 	while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11045 		entry = vm_map_copy_first_entry(copy);
11046 		vm_map_copy_entry_unlink(copy, entry);
11047 		vm_map_store_entry_link(map, after_where, entry,
11048 		    VM_MAP_KERNEL_FLAGS_NONE);
11049 		after_where = entry;
11050 	}
11051 	zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11052 }
11053 
11054 /*
11055  * Callers of this function must call vm_map_copy_require on
11056  * previously created vm_map_copy_t or pass a newly created
11057  * one to ensure that it hasn't been forged.
11058  */
11059 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11060 vm_map_copy_remap(
11061 	vm_map_t        map,
11062 	vm_map_entry_t  where,
11063 	vm_map_copy_t   copy,
11064 	vm_map_offset_t adjustment,
11065 	vm_prot_t       cur_prot,
11066 	vm_prot_t       max_prot,
11067 	vm_inherit_t    inheritance)
11068 {
11069 	vm_map_entry_t  copy_entry, new_entry;
11070 
11071 	for (copy_entry = vm_map_copy_first_entry(copy);
11072 	    copy_entry != vm_map_copy_to_entry(copy);
11073 	    copy_entry = copy_entry->vme_next) {
11074 		/* get a new VM map entry for the map */
11075 		new_entry = vm_map_entry_create(map);
11076 		/* copy the "copy entry" to the new entry */
11077 		vm_map_entry_copy(map, new_entry, copy_entry);
11078 		/* adjust "start" and "end" */
11079 		new_entry->vme_start += adjustment;
11080 		new_entry->vme_end += adjustment;
11081 		/* clear some attributes */
11082 		new_entry->inheritance = inheritance;
11083 		new_entry->protection = cur_prot;
11084 		new_entry->max_protection = max_prot;
11085 		new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11086 		/* take an extra reference on the entry's "object" */
11087 		if (new_entry->is_sub_map) {
11088 			assert(!new_entry->use_pmap); /* not nested */
11089 			vm_map_reference(VME_SUBMAP(new_entry));
11090 		} else {
11091 			vm_object_reference(VME_OBJECT(new_entry));
11092 		}
11093 		/* insert the new entry in the map */
11094 		vm_map_store_entry_link(map, where, new_entry,
11095 		    VM_MAP_KERNEL_FLAGS_NONE);
11096 		/* continue inserting the "copy entries" after the new entry */
11097 		where = new_entry;
11098 	}
11099 }
11100 
11101 
11102 /*
11103  * Returns true if *size matches (or is in the range of) copy->size.
11104  * Upon returning true, the *size field is updated with the actual size of the
11105  * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11106  */
11107 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11108 vm_map_copy_validate_size(
11109 	vm_map_t                dst_map,
11110 	vm_map_copy_t           copy,
11111 	vm_map_size_t           *size)
11112 {
11113 	if (copy == VM_MAP_COPY_NULL) {
11114 		return FALSE;
11115 	}
11116 
11117 	/*
11118 	 * Assert that the vm_map_copy is coming from the right
11119 	 * zone and hasn't been forged
11120 	 */
11121 	vm_map_copy_require(copy);
11122 
11123 	vm_map_size_t copy_sz = copy->size;
11124 	vm_map_size_t sz = *size;
11125 	switch (copy->type) {
11126 	case VM_MAP_COPY_KERNEL_BUFFER:
11127 		if (sz == copy_sz) {
11128 			return TRUE;
11129 		}
11130 		break;
11131 	case VM_MAP_COPY_ENTRY_LIST:
11132 		/*
11133 		 * potential page-size rounding prevents us from exactly
11134 		 * validating this flavor of vm_map_copy, but we can at least
11135 		 * assert that it's within a range.
11136 		 */
11137 		if (copy_sz >= sz &&
11138 		    copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11139 			*size = copy_sz;
11140 			return TRUE;
11141 		}
11142 		break;
11143 	default:
11144 		break;
11145 	}
11146 	return FALSE;
11147 }
11148 
11149 static kern_return_t
vm_map_copyout_internal(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size_u,boolean_t consume_on_success,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)11150 vm_map_copyout_internal(
11151 	vm_map_t                dst_map,
11152 	vm_map_address_t       *dst_addr,      /* OUT */
11153 	vm_map_copy_t           copy,
11154 	vm_map_size_ut          copy_size_u,
11155 	boolean_t               consume_on_success,
11156 	vm_prot_t               cur_protection,
11157 	vm_prot_t               max_protection,
11158 	vm_inherit_t            inheritance)
11159 {
11160 	vm_map_size_t           size, copy_size;
11161 	vm_map_size_t           adjustment;
11162 	vm_map_offset_t         start;
11163 	vm_object_offset_t      vm_copy_start;
11164 	vm_map_entry_t          last;
11165 	vm_map_entry_t          entry;
11166 	vm_map_copy_t           original_copy;
11167 	kern_return_t           kr;
11168 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11169 
11170 	/*
11171 	 *	Check for null copy object.
11172 	 */
11173 
11174 	if (copy == VM_MAP_COPY_NULL) {
11175 		*dst_addr = 0;
11176 		return KERN_SUCCESS;
11177 	}
11178 
11179 	/*
11180 	 * Assert that the vm_map_copy is coming from the right
11181 	 * zone and hasn't been forged
11182 	 */
11183 	vm_map_copy_require(copy);
11184 
11185 	if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11186 		*dst_addr = 0;
11187 		ktriage_record(thread_tid(current_thread()),
11188 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11189 		    KDBG_TRIAGE_RESERVED,
11190 		    KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11191 		    KERN_FAILURE /* arg */);
11192 		return KERN_FAILURE;
11193 	}
11194 	copy_size = copy->size;
11195 
11196 	/*
11197 	 *	Check for special kernel buffer allocated
11198 	 *	by new_ipc_kmsg_copyin.
11199 	 */
11200 
11201 	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11202 		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11203 		    copy, copy_size, FALSE,
11204 		    consume_on_success);
11205 		if (kr) {
11206 			ktriage_record(thread_tid(current_thread()),
11207 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11208 			    KDBG_TRIAGE_RESERVED,
11209 			    KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11210 		}
11211 		return kr;
11212 	}
11213 
11214 	original_copy = copy;
11215 	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11216 		vm_map_copy_t target_copy;
11217 		vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11218 
11219 		target_copy = VM_MAP_COPY_NULL;
11220 		DEBUG4K_ADJUST("adjusting...\n");
11221 		kr = vm_map_copy_adjust_to_target(
11222 			copy,
11223 			0, /* offset */
11224 			copy->size, /* size */
11225 			dst_map,
11226 			TRUE, /* copy */
11227 			&target_copy,
11228 			&overmap_start,
11229 			&overmap_end,
11230 			&trimmed_start);
11231 		if (kr != KERN_SUCCESS) {
11232 			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11233 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11234 			return kr;
11235 		}
11236 		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11237 		if (target_copy != copy) {
11238 			copy = target_copy;
11239 		}
11240 		copy_size = copy->size;
11241 	}
11242 
11243 	/*
11244 	 *	Find space for the data
11245 	 */
11246 
11247 	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11248 	    VM_MAP_COPY_PAGE_MASK(copy));
11249 	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11250 	    VM_MAP_COPY_PAGE_MASK(copy))
11251 	    - vm_copy_start;
11252 
11253 	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11254 
11255 	vm_map_lock(dst_map);
11256 	kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11257 	    &start, &last);
11258 	if (kr != KERN_SUCCESS) {
11259 		vm_map_unlock(dst_map);
11260 		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11261 		return kr;
11262 	}
11263 
11264 	adjustment = start - vm_copy_start;
11265 	if (!consume_on_success) {
11266 		/*
11267 		 * We're not allowed to consume "copy", so we'll have to
11268 		 * copy its map entries into the destination map below.
11269 		 * No need to re-allocate map entries from the correct
11270 		 * (pageable or not) zone, since we'll get new map entries
11271 		 * during the transfer.
11272 		 * We'll also adjust the map entries's "start" and "end"
11273 		 * during the transfer, to keep "copy"'s entries consistent
11274 		 * with its "offset".
11275 		 */
11276 		goto after_adjustments;
11277 	}
11278 
11279 	/*
11280 	 *	Since we're going to just drop the map
11281 	 *	entries from the copy into the destination
11282 	 *	map, they must come from the same pool.
11283 	 */
11284 
11285 	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11286 		/*
11287 		 * Mismatches occur when dealing with the default
11288 		 * pager.
11289 		 */
11290 		vm_map_entry_t  next, new;
11291 
11292 		/*
11293 		 * Find the zone that the copies were allocated from
11294 		 */
11295 
11296 		entry = vm_map_copy_first_entry(copy);
11297 
11298 		/*
11299 		 * Reinitialize the copy so that vm_map_copy_entry_link
11300 		 * will work.
11301 		 */
11302 		vm_map_store_copy_reset(copy, entry);
11303 		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11304 
11305 		/*
11306 		 * Copy each entry.
11307 		 */
11308 		while (entry != vm_map_copy_to_entry(copy)) {
11309 			new = vm_map_copy_entry_create(copy);
11310 			vm_map_entry_copy_full(new, entry);
11311 			new->vme_no_copy_on_read = FALSE;
11312 			assert(!new->iokit_acct);
11313 			if (new->is_sub_map) {
11314 				/* clr address space specifics */
11315 				new->use_pmap = FALSE;
11316 			}
11317 			vm_map_copy_entry_link(copy,
11318 			    vm_map_copy_last_entry(copy),
11319 			    new);
11320 			next = entry->vme_next;
11321 			vm_map_entry_dispose(entry);
11322 			entry = next;
11323 		}
11324 	}
11325 
11326 	/*
11327 	 *	Adjust the addresses in the copy chain, and
11328 	 *	reset the region attributes.
11329 	 */
11330 
11331 	for (entry = vm_map_copy_first_entry(copy);
11332 	    entry != vm_map_copy_to_entry(copy);
11333 	    entry = entry->vme_next) {
11334 		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11335 			/*
11336 			 * We're injecting this copy entry into a map that
11337 			 * has the standard page alignment, so clear
11338 			 * "map_aligned" (which might have been inherited
11339 			 * from the original map entry).
11340 			 */
11341 			entry->map_aligned = FALSE;
11342 		}
11343 
11344 		entry->vme_start += adjustment;
11345 		entry->vme_end += adjustment;
11346 
11347 		if (entry->map_aligned) {
11348 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11349 			    VM_MAP_PAGE_MASK(dst_map)));
11350 			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11351 			    VM_MAP_PAGE_MASK(dst_map)));
11352 		}
11353 
11354 		entry->inheritance = VM_INHERIT_DEFAULT;
11355 		entry->protection = VM_PROT_DEFAULT;
11356 		entry->max_protection = VM_PROT_ALL;
11357 		entry->behavior = VM_BEHAVIOR_DEFAULT;
11358 
11359 		/*
11360 		 * If the entry is now wired,
11361 		 * map the pages into the destination map.
11362 		 */
11363 		if (entry->wired_count != 0) {
11364 			vm_map_offset_t va;
11365 			vm_object_offset_t       offset;
11366 			vm_object_t object;
11367 			vm_prot_t prot;
11368 			int     type_of_fault;
11369 			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11370 
11371 			/* TODO4K would need to use actual page size */
11372 			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11373 
11374 			object = VME_OBJECT(entry);
11375 			offset = VME_OFFSET(entry);
11376 			va = entry->vme_start;
11377 
11378 			pmap_pageable(dst_map->pmap,
11379 			    entry->vme_start,
11380 			    entry->vme_end,
11381 			    TRUE);
11382 
11383 			while (va < entry->vme_end) {
11384 				vm_page_t       m;
11385 				struct vm_object_fault_info fault_info = {};
11386 
11387 				/*
11388 				 * Look up the page in the object.
11389 				 * Assert that the page will be found in the
11390 				 * top object:
11391 				 * either
11392 				 *	the object was newly created by
11393 				 *	vm_object_copy_slowly, and has
11394 				 *	copies of all of the pages from
11395 				 *	the source object
11396 				 * or
11397 				 *	the object was moved from the old
11398 				 *	map entry; because the old map
11399 				 *	entry was wired, all of the pages
11400 				 *	were in the top-level object.
11401 				 *	(XXX not true if we wire pages for
11402 				 *	 reading)
11403 				 */
11404 				vm_object_lock(object);
11405 
11406 				m = vm_page_lookup(object, offset);
11407 				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11408 				    m->vmp_absent) {
11409 					panic("vm_map_copyout: wiring %p", m);
11410 				}
11411 
11412 				prot = entry->protection;
11413 
11414 				if (override_nx(dst_map, VME_ALIAS(entry)) &&
11415 				    prot) {
11416 					prot |= VM_PROT_EXECUTE;
11417 				}
11418 
11419 				type_of_fault = DBG_CACHE_HIT_FAULT;
11420 
11421 				fault_info.user_tag = VME_ALIAS(entry);
11422 				fault_info.pmap_options = 0;
11423 				if (entry->iokit_acct ||
11424 				    (!entry->is_sub_map && !entry->use_pmap)) {
11425 					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11426 				}
11427 				if (entry->vme_xnu_user_debug &&
11428 				    !VM_PAGE_OBJECT(m)->code_signed) {
11429 					/*
11430 					 * Modified code-signed executable
11431 					 * region: this page does not belong
11432 					 * to a code-signed VM object, so it
11433 					 * must have been copied and should
11434 					 * therefore be typed XNU_USER_DEBUG
11435 					 * rather than XNU_USER_EXEC.
11436 					 */
11437 					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11438 				}
11439 
11440 				vm_fault_enter(m,
11441 				    dst_map->pmap,
11442 				    va,
11443 				    PAGE_SIZE, 0,
11444 				    prot,
11445 				    prot,
11446 				    VM_PAGE_WIRED(m),
11447 				    FALSE,            /* change_wiring */
11448 				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
11449 				    &fault_info,
11450 				    NULL,             /* need_retry */
11451 				    &type_of_fault,
11452 				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11453 
11454 				vm_object_unlock(object);
11455 
11456 				offset += PAGE_SIZE_64;
11457 				va += PAGE_SIZE;
11458 			}
11459 		}
11460 	}
11461 
11462 after_adjustments:
11463 
11464 	/*
11465 	 *	Correct the page alignment for the result
11466 	 */
11467 
11468 	*dst_addr = start + (copy->offset - vm_copy_start);
11469 
11470 #if KASAN
11471 	kasan_notify_address(*dst_addr, size);
11472 #endif
11473 
11474 	/*
11475 	 *	Update the hints and the map size
11476 	 */
11477 
11478 	if (consume_on_success) {
11479 		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11480 	} else {
11481 		SAVE_HINT_MAP_WRITE(dst_map, last);
11482 	}
11483 
11484 	dst_map->size += size;
11485 
11486 	/*
11487 	 *	Link in the copy
11488 	 */
11489 
11490 	if (consume_on_success) {
11491 		vm_map_copy_insert(dst_map, last, copy);
11492 		if (copy != original_copy) {
11493 			vm_map_copy_discard(original_copy);
11494 			original_copy = VM_MAP_COPY_NULL;
11495 		}
11496 	} else {
11497 		vm_map_copy_remap(dst_map, last, copy, adjustment,
11498 		    cur_protection, max_protection,
11499 		    inheritance);
11500 		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11501 			vm_map_copy_discard(copy);
11502 			copy = original_copy;
11503 		}
11504 	}
11505 
11506 
11507 	vm_map_unlock(dst_map);
11508 
11509 	/*
11510 	 * XXX	If wiring_required, call vm_map_pageable
11511 	 */
11512 
11513 	return KERN_SUCCESS;
11514 }
11515 
11516 /*
11517  *	Routine:	vm_map_copyout_size
11518  *
11519  *	Description:
11520  *		Copy out a copy chain ("copy") into newly-allocated
11521  *		space in the destination map. Uses a prevalidated
11522  *		size for the copy object (vm_map_copy_validate_size).
11523  *
11524  *		If successful, consumes the copy object.
11525  *		Otherwise, the caller is responsible for it.
11526  */
11527 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size)11528 vm_map_copyout_size(
11529 	vm_map_t                dst_map,
11530 	vm_map_address_t       *dst_addr,      /* OUT */
11531 	vm_map_copy_t           copy,
11532 	vm_map_size_ut          copy_size)
11533 {
11534 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11535 	           TRUE,                     /* consume_on_success */
11536 	           VM_PROT_DEFAULT,
11537 	           VM_PROT_ALL,
11538 	           VM_INHERIT_DEFAULT);
11539 }
11540 
11541 /*
11542  *	Routine:	vm_map_copyout
11543  *
11544  *	Description:
11545  *		Copy out a copy chain ("copy") into newly-allocated
11546  *		space in the destination map.
11547  *
11548  *		If successful, consumes the copy object.
11549  *		Otherwise, the caller is responsible for it.
11550  */
11551 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11552 vm_map_copyout(
11553 	vm_map_t                dst_map,
11554 	vm_map_address_t       *dst_addr,      /* OUT */
11555 	vm_map_copy_t           copy)
11556 {
11557 	return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11558 	           TRUE,                     /* consume_on_success */
11559 	           VM_PROT_DEFAULT,
11560 	           VM_PROT_ALL,
11561 	           VM_INHERIT_DEFAULT);
11562 }
11563 
11564 /*
11565  *	Routine:	vm_map_copyin
11566  *
11567  *	Description:
11568  *		see vm_map_copyin_common.  Exported via Unsupported.exports.
11569  *
11570  */
11571 
11572 #undef vm_map_copyin
11573 
11574 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,vm_map_copy_t * copy_result)11575 vm_map_copyin(
11576 	vm_map_t                src_map,
11577 	vm_map_address_ut       src_addr,
11578 	vm_map_size_ut          len,
11579 	boolean_t               src_destroy,
11580 	vm_map_copy_t          *copy_result)   /* OUT */
11581 {
11582 	return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11583 	           FALSE, copy_result, FALSE);
11584 }
11585 
11586 /*
11587  *	Routine:	vm_map_copyin_common
11588  *
11589  *	Description:
11590  *		Copy the specified region (src_addr, len) from the
11591  *		source address space (src_map), possibly removing
11592  *		the region from the source address space (src_destroy).
11593  *
11594  *	Returns:
11595  *		A vm_map_copy_t object (copy_result), suitable for
11596  *		insertion into another address space (using vm_map_copyout),
11597  *		copying over another address space region (using
11598  *		vm_map_copy_overwrite).  If the copy is unused, it
11599  *		should be destroyed (using vm_map_copy_discard).
11600  *
11601  *	In/out conditions:
11602  *		The source map should not be locked on entry.
11603  */
11604 
/*
 * One level of the submap traversal stack used by vm_map_copyin_internal:
 * while descending into submaps, each entry remembers the parent map and
 * the portion of the copied range it covers, so the walk can pop back up
 * when a submap's contribution has been fully copied.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map that contains the submap entry */
	vm_map_offset_t base_start;     /* start of the copied range in parent_map */
	vm_map_offset_t base_end;       /* end of the copied range in parent_map */
	vm_map_size_t   base_len;       /* length covered at this level */
	struct submap_map *next;        /* next (outer) level; NULL at the base map */
} submap_map_t;
11612 
11613 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11614 vm_map_copyin_common(
11615 	vm_map_t                src_map,
11616 	vm_map_address_ut       src_addr,
11617 	vm_map_size_ut          len,
11618 	boolean_t               src_destroy,
11619 	__unused boolean_t      src_volatile,
11620 	vm_map_copy_t          *copy_result,   /* OUT */
11621 	boolean_t               use_maxprot)
11622 {
11623 	int flags;
11624 
11625 	flags = 0;
11626 	if (src_destroy) {
11627 		flags |= VM_MAP_COPYIN_SRC_DESTROY;
11628 	}
11629 	if (use_maxprot) {
11630 		flags |= VM_MAP_COPYIN_USE_MAXPROT;
11631 	}
11632 	return vm_map_copyin_internal(src_map,
11633 	           src_addr,
11634 	           len,
11635 	           flags,
11636 	           copy_result);
11637 }
11638 
/*
 * Validate and normalize the user-supplied address/length for a copyin.
 *
 * On success:
 *   *src_addr_unaligned - the sanitized, unrounded start address
 *   *src_start          - start, truncated down to the map's page boundary
 *   *src_end            - end, rounded up to the map's page boundary
 *   *len                - the sanitized (unrounded) length
 *
 * A zero length is accepted (VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS), and
 * addresses destined for the kernel map are canonicalized.  Returns the
 * error from vm_sanitize_addr_size() on failure.
 */
static inline kern_return_t
vm_map_copyin_sanitize(
	vm_map_t                src_map,
	vm_map_address_ut       src_addr_u,
	vm_map_size_ut          len_u,
	vm_map_offset_t        *src_start,
	vm_map_offset_t        *src_end,
	vm_map_size_t          *len,
	vm_map_offset_t        *src_addr_unaligned)
{
	kern_return_t   kr;

	kr = vm_sanitize_addr_size(src_addr_u, len_u, VM_SANITIZE_CALLER_VM_MAP_COPYIN,
	    src_map,
	    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
	    (src_map->pmap == kernel_pmap ? VM_SANITIZE_FLAGS_CANONICALIZE : VM_SANITIZE_FLAGS_NONE),
	    src_start, src_end, len);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/*
	 *	Compute (page aligned) start and end of region
	 */
	*src_addr_unaligned  = *src_start; /* remember unaligned value */
	*src_start = vm_map_trunc_page(*src_addr_unaligned,
	    VM_MAP_PAGE_MASK(src_map));
	*src_end   = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
	return KERN_SUCCESS;
}
11669 
11670 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,int flags,vm_map_copy_t * copy_result)11671 vm_map_copyin_internal(
11672 	vm_map_t                src_map,
11673 	vm_map_address_ut       src_addr_u,
11674 	vm_map_size_ut          len_u,
11675 	int                     flags,
11676 	vm_map_copy_t          *copy_result)   /* OUT */
11677 {
11678 	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
11679 	                                 * in multi-level lookup, this
11680 	                                 * entry contains the actual
11681 	                                 * vm_object/offset.
11682 	                                 */
11683 	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */
11684 
11685 	vm_map_offset_t src_start;      /* Start of current entry --
11686 	                                 * where copy is taking place now
11687 	                                 */
11688 	vm_map_offset_t src_end;        /* End of entire region to be
11689 	                                 * copied */
11690 	vm_map_offset_t src_addr_unaligned;
11691 	vm_map_offset_t src_base;
11692 	vm_map_size_t   len;
11693 	vm_map_t        base_map = src_map;
11694 	boolean_t       map_share = FALSE;
11695 	submap_map_t    *parent_maps = NULL;
11696 
11697 	vm_map_copy_t   copy;           /* Resulting copy */
11698 	vm_map_address_t copy_addr;
11699 	vm_map_size_t   copy_size;
11700 	boolean_t       src_destroy;
11701 	boolean_t       use_maxprot;
11702 	boolean_t       preserve_purgeable;
11703 	boolean_t       entry_was_shared;
11704 	vm_map_entry_t  saved_src_entry;
11705 	kern_return_t   kr;
11706 
11707 
11708 	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11709 		return KERN_INVALID_ARGUMENT;
11710 	}
11711 
11712 	/*
11713 	 *	Check for copies of zero bytes.
11714 	 */
11715 	if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
11716 		*copy_result = VM_MAP_COPY_NULL;
11717 		return KERN_SUCCESS;
11718 	}
11719 
11720 	/*
11721 	 * Sanitize any input parameters that are addr/size/prot/inherit
11722 	 */
11723 	kr = vm_map_copyin_sanitize(
11724 		src_map,
11725 		src_addr_u,
11726 		len_u,
11727 		&src_start,
11728 		&src_end,
11729 		&len,
11730 		&src_addr_unaligned);
11731 	if (__improbable(kr != KERN_SUCCESS)) {
11732 		return vm_sanitize_get_kr(kr);
11733 	}
11734 
11735 	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11736 	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11737 	preserve_purgeable =
11738 	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11739 
11740 	/*
11741 	 * If the copy is sufficiently small, use a kernel buffer instead
11742 	 * of making a virtual copy.  The theory being that the cost of
11743 	 * setting up VM (and taking C-O-W faults) dominates the copy costs
11744 	 * for small regions.
11745 	 */
11746 	if ((len <= msg_ool_size_small) &&
11747 	    !use_maxprot &&
11748 	    !preserve_purgeable &&
11749 	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11750 	    /*
11751 	     * Since the "msg_ool_size_small" threshold was increased and
11752 	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11753 	     * address space limits, we revert to doing a virtual copy if the
11754 	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
11755 	     * of the commpage would now fail when it used to work.
11756 	     */
11757 	    (src_start >= vm_map_min(src_map) &&
11758 	    src_start < vm_map_max(src_map) &&
11759 	    src_end >= vm_map_min(src_map) &&
11760 	    src_end < vm_map_max(src_map))) {
11761 		return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
11762 		           src_destroy, copy_result);
11763 	}
11764 
11765 	/*
11766 	 *	Allocate a header element for the list.
11767 	 *
11768 	 *	Use the start and end in the header to
11769 	 *	remember the endpoints prior to rounding.
11770 	 */
11771 
11772 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
11773 	copy->cpy_hdr.entries_pageable = TRUE;
11774 	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11775 	copy->offset = src_addr_unaligned;
11776 	copy->size = len;
11777 
11778 	new_entry = vm_map_copy_entry_create(copy);
11779 
11780 #define RETURN(x)                                               \
11781 	MACRO_BEGIN                                             \
11782 	vm_map_unlock(src_map);                                 \
11783 	if(src_map != base_map)                                 \
11784 	        vm_map_deallocate(src_map);                     \
11785 	if (new_entry != VM_MAP_ENTRY_NULL)                     \
11786 	        vm_map_copy_entry_dispose(new_entry);           \
11787 	vm_map_copy_discard(copy);                              \
11788 	{                                                       \
11789 	        submap_map_t	*_ptr;                          \
11790                                                                 \
11791 	        for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
11792 	                parent_maps=parent_maps->next;          \
11793 	                if (_ptr->parent_map != base_map)       \
11794 	                        vm_map_deallocate(_ptr->parent_map);    \
11795 	                kfree_type(submap_map_t, _ptr);         \
11796 	        }                                               \
11797 	}                                                       \
11798 	MACRO_RETURN(x);                                        \
11799 	MACRO_END
11800 
11801 	/*
11802 	 *	Find the beginning of the region.
11803 	 */
11804 
11805 	vm_map_lock(src_map);
11806 
11807 	/*
11808 	 * Lookup the original "src_addr_unaligned" rather than the truncated
11809 	 * "src_start", in case "src_start" falls in a non-map-aligned
11810 	 * map entry *before* the map entry that contains "src_addr_unaligned"...
11811 	 */
11812 	if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
11813 		RETURN(KERN_INVALID_ADDRESS);
11814 	}
11815 	if (!tmp_entry->is_sub_map) {
11816 		/*
11817 		 * ... but clip to the map-rounded "src_start" rather than
11818 		 * "src_addr_unaligned" to preserve map-alignment.  We'll adjust the
11819 		 * first copy entry at the end, if needed.
11820 		 */
11821 		vm_map_clip_start(src_map, tmp_entry, src_start);
11822 	}
11823 	if (src_start < tmp_entry->vme_start) {
11824 		/*
11825 		 * Move "src_start" up to the start of the
11826 		 * first map entry to copy.
11827 		 */
11828 		src_start = tmp_entry->vme_start;
11829 	}
11830 	/* set for later submap fix-up */
11831 	copy_addr = src_start;
11832 
11833 	/*
11834 	 *	Go through entries until we get to the end.
11835 	 */
11836 
11837 	while (TRUE) {
11838 		vm_map_entry_t  src_entry = tmp_entry;  /* Top-level entry */
11839 		vm_map_size_t   src_size;               /* Size of source
11840 		                                         * map entry (in both
11841 		                                         * maps)
11842 		                                         */
11843 
11844 		vm_object_t             src_object;     /* Object to copy */
11845 		vm_object_offset_t      src_offset;
11846 
11847 		vm_object_t             new_copy_object;/* vm_object_copy_* result */
11848 
11849 		boolean_t       src_needs_copy;         /* Should source map
11850 		                                         * be made read-only
11851 		                                         * for copy-on-write?
11852 		                                         */
11853 
11854 		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */
11855 
11856 		boolean_t       was_wired;              /* Was source wired? */
11857 		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
11858 		vm_map_version_t version;               /* Version before locks
11859 		                                         * dropped to make copy
11860 		                                         */
11861 		kern_return_t   result;                 /* Return value from
11862 		                                         * copy_strategically.
11863 		                                         */
11864 		while (tmp_entry->is_sub_map) {
11865 			vm_map_size_t submap_len;
11866 			submap_map_t *ptr;
11867 
11868 			ptr = kalloc_type(submap_map_t, Z_WAITOK);
11869 			ptr->next = parent_maps;
11870 			parent_maps = ptr;
11871 			ptr->parent_map = src_map;
11872 			ptr->base_start = src_start;
11873 			ptr->base_end = src_end;
11874 			submap_len = tmp_entry->vme_end - src_start;
11875 			if (submap_len > (src_end - src_start)) {
11876 				submap_len = src_end - src_start;
11877 			}
11878 			ptr->base_len = submap_len;
11879 
11880 			src_start -= tmp_entry->vme_start;
11881 			src_start += VME_OFFSET(tmp_entry);
11882 			src_end = src_start + submap_len;
11883 			src_map = VME_SUBMAP(tmp_entry);
11884 			vm_map_lock(src_map);
11885 			/* keep an outstanding reference for all maps in */
11886 			/* the parents tree except the base map */
11887 			vm_map_reference(src_map);
11888 			vm_map_unlock(ptr->parent_map);
11889 			if (!vm_map_lookup_entry(
11890 				    src_map, src_start, &tmp_entry)) {
11891 				RETURN(KERN_INVALID_ADDRESS);
11892 			}
11893 			map_share = TRUE;
11894 			if (!tmp_entry->is_sub_map) {
11895 				vm_map_clip_start(src_map, tmp_entry, src_start);
11896 			}
11897 			src_entry = tmp_entry;
11898 		}
11899 		/* we are now in the lowest level submap... */
11900 
11901 		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
11902 		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
11903 			/* This is not supported for now. In future */
11904 			/* we will need to detect the phys_contig   */
11905 			/* condition and then upgrade copy_slowly   */
11906 			/* to do physical copy from the device mem  */
11907 			/* based object. We can piggy-back off of   */
11908 			/* the was wired boolean to set-up the      */
11909 			/* proper handling */
11910 			RETURN(KERN_PROTECTION_FAILURE);
11911 		}
11912 		/*
11913 		 *	Create a new address map entry to hold the result.
11914 		 *	Fill in the fields from the appropriate source entries.
11915 		 *	We must unlock the source map to do this if we need
11916 		 *	to allocate a map entry.
11917 		 */
11918 		if (new_entry == VM_MAP_ENTRY_NULL) {
11919 			version.main_timestamp = src_map->timestamp;
11920 			vm_map_unlock(src_map);
11921 
11922 			new_entry = vm_map_copy_entry_create(copy);
11923 
11924 			vm_map_lock(src_map);
11925 			if ((version.main_timestamp + 1) != src_map->timestamp) {
11926 				if (!vm_map_lookup_entry(src_map, src_start,
11927 				    &tmp_entry)) {
11928 					RETURN(KERN_INVALID_ADDRESS);
11929 				}
11930 				if (!tmp_entry->is_sub_map) {
11931 					vm_map_clip_start(src_map, tmp_entry, src_start);
11932 				}
11933 				continue; /* restart w/ new tmp_entry */
11934 			}
11935 		}
11936 
11937 		/*
11938 		 *	Verify that the region can be read.
11939 		 */
11940 		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
11941 		    !use_maxprot) ||
11942 		    (src_entry->max_protection & VM_PROT_READ) == 0) {
11943 			RETURN(KERN_PROTECTION_FAILURE);
11944 		}
11945 
11946 		/*
11947 		 *	Clip against the endpoints of the entire region.
11948 		 */
11949 
11950 		vm_map_clip_end(src_map, src_entry, src_end);
11951 
11952 		src_size = src_entry->vme_end - src_start;
11953 		src_object = VME_OBJECT(src_entry);
11954 		src_offset = VME_OFFSET(src_entry);
11955 		was_wired = (src_entry->wired_count != 0);
11956 
11957 		vm_map_entry_copy(src_map, new_entry, src_entry);
11958 		if (new_entry->is_sub_map) {
11959 			/* clr address space specifics */
11960 			new_entry->use_pmap = FALSE;
11961 		} else {
11962 			/*
11963 			 * We're dealing with a copy-on-write operation,
11964 			 * so the resulting mapping should not inherit the
11965 			 * original mapping's accounting settings.
11966 			 * "iokit_acct" should have been cleared in
11967 			 * vm_map_entry_copy().
11968 			 * "use_pmap" should be reset to its default (TRUE)
11969 			 * so that the new mapping gets accounted for in
11970 			 * the task's memory footprint.
11971 			 */
11972 			assert(!new_entry->iokit_acct);
11973 			new_entry->use_pmap = TRUE;
11974 		}
11975 
11976 		/*
11977 		 *	Attempt non-blocking copy-on-write optimizations.
11978 		 */
11979 
11980 		/*
11981 		 * If we are destroying the source, and the object
11982 		 * is internal, we could move the object reference
11983 		 * from the source to the copy.  The copy is
11984 		 * copy-on-write only if the source is.
11985 		 * We make another reference to the object, because
11986 		 * destroying the source entry will deallocate it.
11987 		 *
11988 		 * This memory transfer has to be atomic, (to prevent
11989 		 * the VM object from being shared or copied while
11990 		 * it's being moved here), so we could only do this
11991 		 * if we won't have to unlock the VM map until the
11992 		 * original mapping has been fully removed.
11993 		 */
11994 
11995 RestartCopy:
11996 		if ((src_object == VM_OBJECT_NULL ||
11997 		    (!was_wired && !map_share && !tmp_entry->is_shared
11998 		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
11999 		    vm_object_copy_quickly(
12000 			    VME_OBJECT(new_entry),
12001 			    src_offset,
12002 			    src_size,
12003 			    &src_needs_copy,
12004 			    &new_entry_needs_copy)) {
12005 			new_entry->needs_copy = new_entry_needs_copy;
12006 
12007 			/*
12008 			 *	Handle copy-on-write obligations
12009 			 */
12010 
12011 			if (src_needs_copy && !tmp_entry->needs_copy) {
12012 				vm_prot_t prot;
12013 
12014 				prot = src_entry->protection & ~VM_PROT_WRITE;
12015 
12016 				if (override_nx(src_map, VME_ALIAS(src_entry))
12017 				    && prot) {
12018 					prot |= VM_PROT_EXECUTE;
12019 				}
12020 
12021 				vm_object_pmap_protect(
12022 					src_object,
12023 					src_offset,
12024 					src_size,
12025 					(src_entry->is_shared ?
12026 					PMAP_NULL
12027 					: src_map->pmap),
12028 					VM_MAP_PAGE_SIZE(src_map),
12029 					src_entry->vme_start,
12030 					prot);
12031 
12032 				assert(tmp_entry->wired_count == 0);
12033 				tmp_entry->needs_copy = TRUE;
12034 			}
12035 
12036 			/*
12037 			 *	The map has never been unlocked, so it's safe
12038 			 *	to move to the next entry rather than doing
12039 			 *	another lookup.
12040 			 */
12041 
12042 			goto CopySuccessful;
12043 		}
12044 
12045 		entry_was_shared = tmp_entry->is_shared;
12046 
12047 		/*
12048 		 *	Take an object reference, so that we may
12049 		 *	release the map lock(s).
12050 		 */
12051 
12052 		assert(src_object != VM_OBJECT_NULL);
12053 		vm_object_reference(src_object);
12054 
12055 		/*
12056 		 *	Record the timestamp for later verification.
12057 		 *	Unlock the map.
12058 		 */
12059 
12060 		version.main_timestamp = src_map->timestamp;
12061 		vm_map_unlock(src_map); /* Increments timestamp once! */
12062 		saved_src_entry = src_entry;
12063 		tmp_entry = VM_MAP_ENTRY_NULL;
12064 		src_entry = VM_MAP_ENTRY_NULL;
12065 
12066 		/*
12067 		 *	Perform the copy
12068 		 */
12069 
12070 		if (was_wired ||
12071 		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12072 		    !(flags & VM_MAP_COPYIN_FORK)) ||
12073 		    (debug4k_no_cow_copyin &&
12074 		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12075 CopySlowly:
12076 			vm_object_lock(src_object);
12077 			result = vm_object_copy_slowly(
12078 				src_object,
12079 				src_offset,
12080 				src_size,
12081 				THREAD_UNINT,
12082 				&new_copy_object);
12083 			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12084 			saved_used_for_jit = new_entry->used_for_jit;
12085 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12086 			new_entry->used_for_jit = saved_used_for_jit;
12087 			VME_OFFSET_SET(new_entry,
12088 			    src_offset - vm_object_trunc_page(src_offset));
12089 			new_entry->needs_copy = FALSE;
12090 		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12091 		    (entry_was_shared || map_share)) {
12092 			vm_object_t new_object;
12093 
12094 			vm_object_lock_shared(src_object);
12095 			new_object = vm_object_copy_delayed(
12096 				src_object,
12097 				src_offset,
12098 				src_size,
12099 				TRUE);
12100 			if (new_object == VM_OBJECT_NULL) {
12101 				goto CopySlowly;
12102 			}
12103 
12104 			VME_OBJECT_SET(new_entry, new_object, false, 0);
12105 			assert(new_entry->wired_count == 0);
12106 			new_entry->needs_copy = TRUE;
12107 			assert(!new_entry->iokit_acct);
12108 			assert(new_object->purgable == VM_PURGABLE_DENY);
12109 			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12110 			result = KERN_SUCCESS;
12111 		} else {
12112 			vm_object_offset_t new_offset;
12113 			new_offset = VME_OFFSET(new_entry);
12114 			result = vm_object_copy_strategically(src_object,
12115 			    src_offset,
12116 			    src_size,
12117 			    (flags & VM_MAP_COPYIN_FORK),
12118 			    &new_copy_object,
12119 			    &new_offset,
12120 			    &new_entry_needs_copy);
12121 			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12122 			saved_used_for_jit = new_entry->used_for_jit;
12123 			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12124 			new_entry->used_for_jit = saved_used_for_jit;
12125 			if (new_offset != VME_OFFSET(new_entry)) {
12126 				VME_OFFSET_SET(new_entry, new_offset);
12127 			}
12128 
12129 			new_entry->needs_copy = new_entry_needs_copy;
12130 		}
12131 
12132 		if (result == KERN_SUCCESS &&
12133 		    ((preserve_purgeable &&
12134 		    src_object->purgable != VM_PURGABLE_DENY) ||
12135 		    new_entry->used_for_jit)) {
12136 			/*
12137 			 * Purgeable objects should be COPY_NONE, true share;
12138 			 * this should be propagated to the copy.
12139 			 *
12140 			 * Also force mappings the pmap specially protects to
12141 			 * be COPY_NONE; trying to COW these mappings would
12142 			 * change the effective protections, which could have
12143 			 * side effects if the pmap layer relies on the
12144 			 * specified protections.
12145 			 */
12146 
12147 			vm_object_t     new_object;
12148 
12149 			new_object = VME_OBJECT(new_entry);
12150 			assert(new_object != src_object);
12151 			vm_object_lock(new_object);
12152 			assert(new_object->ref_count == 1);
12153 			assert(new_object->shadow == VM_OBJECT_NULL);
12154 			assert(new_object->vo_copy == VM_OBJECT_NULL);
12155 			assert(new_object->vo_owner == NULL);
12156 
12157 			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12158 
12159 			if (preserve_purgeable &&
12160 			    src_object->purgable != VM_PURGABLE_DENY) {
12161 				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12162 
12163 				/* start as non-volatile with no owner... */
12164 				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12165 				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12166 				/* ... and move to src_object's purgeable state */
12167 				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12168 					int state;
12169 					state = src_object->purgable;
12170 					vm_object_purgable_control(
12171 						new_object,
12172 						VM_PURGABLE_SET_STATE_FROM_KERNEL,
12173 						&state);
12174 				}
12175 				/* no pmap accounting for purgeable objects */
12176 				new_entry->use_pmap = FALSE;
12177 			}
12178 
12179 			vm_object_unlock(new_object);
12180 			new_object = VM_OBJECT_NULL;
12181 		}
12182 
12183 		if (result != KERN_SUCCESS &&
12184 		    result != KERN_MEMORY_RESTART_COPY) {
12185 			vm_map_lock(src_map);
12186 			RETURN(result);
12187 		}
12188 
12189 		/*
12190 		 *	Throw away the extra reference
12191 		 */
12192 
12193 		vm_object_deallocate(src_object);
12194 
12195 		/*
12196 		 *	Verify that the map has not substantially
12197 		 *	changed while the copy was being made.
12198 		 */
12199 
12200 		vm_map_lock(src_map);
12201 
12202 		if ((version.main_timestamp + 1) == src_map->timestamp) {
12203 			/* src_map hasn't changed: src_entry is still valid */
12204 			src_entry = saved_src_entry;
12205 			goto VerificationSuccessful;
12206 		}
12207 
12208 		/*
12209 		 *	Simple version comparison failed.
12210 		 *
12211 		 *	Retry the lookup and verify that the
12212 		 *	same object/offset are still present.
12213 		 *
12214 		 *	[Note: a memory manager that colludes with
12215 		 *	the calling task can detect that we have
12216 		 *	cheated.  While the map was unlocked, the
12217 		 *	mapping could have been changed and restored.]
12218 		 */
12219 
12220 		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12221 			if (result != KERN_MEMORY_RESTART_COPY) {
12222 				vm_object_deallocate(VME_OBJECT(new_entry));
12223 				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12224 				/* reset accounting state */
12225 				new_entry->iokit_acct = FALSE;
12226 				new_entry->use_pmap = TRUE;
12227 			}
12228 			RETURN(KERN_INVALID_ADDRESS);
12229 		}
12230 
12231 		src_entry = tmp_entry;
12232 		vm_map_clip_start(src_map, src_entry, src_start);
12233 
12234 		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12235 		    !use_maxprot) ||
12236 		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12237 			goto VerificationFailed;
12238 		}
12239 
12240 		if (src_entry->vme_end < new_entry->vme_end) {
12241 			/*
12242 			 * This entry might have been shortened
12243 			 * (vm_map_clip_end) or been replaced with
12244 			 * an entry that ends closer to "src_start"
12245 			 * than before.
12246 			 * Adjust "new_entry" accordingly; copying
12247 			 * less memory would be correct but we also
12248 			 * redo the copy (see below) if the new entry
12249 			 * no longer points at the same object/offset.
12250 			 */
12251 			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12252 			    VM_MAP_COPY_PAGE_MASK(copy)));
12253 			new_entry->vme_end = src_entry->vme_end;
12254 			src_size = new_entry->vme_end - src_start;
12255 		} else if (src_entry->vme_end > new_entry->vme_end) {
12256 			/*
12257 			 * This entry might have been extended
12258 			 * (vm_map_entry_simplify() or coalesce)
12259 			 * or been replaced with an entry that ends farther
12260 			 * from "src_start" than before.
12261 			 *
12262 			 * We've called vm_object_copy_*() only on
12263 			 * the previous <start:end> range, so we can't
12264 			 * just extend new_entry.  We have to re-do
12265 			 * the copy based on the new entry as if it was
12266 			 * pointing at a different object/offset (see
12267 			 * "Verification failed" below).
12268 			 */
12269 		}
12270 
12271 		if ((VME_OBJECT(src_entry) != src_object) ||
12272 		    (VME_OFFSET(src_entry) != src_offset) ||
12273 		    (src_entry->vme_end > new_entry->vme_end)) {
12274 			/*
12275 			 *	Verification failed.
12276 			 *
12277 			 *	Start over with this top-level entry.
12278 			 */
12279 
12280 VerificationFailed:     ;
12281 
12282 			vm_object_deallocate(VME_OBJECT(new_entry));
12283 			tmp_entry = src_entry;
12284 			continue;
12285 		}
12286 
12287 		/*
12288 		 *	Verification succeeded.
12289 		 */
12290 
12291 VerificationSuccessful:;
12292 
12293 		if (result == KERN_MEMORY_RESTART_COPY) {
12294 			goto RestartCopy;
12295 		}
12296 
12297 		/*
12298 		 *	Copy succeeded.
12299 		 */
12300 
12301 CopySuccessful: ;
12302 
12303 		/*
12304 		 *	Link in the new copy entry.
12305 		 */
12306 
12307 		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12308 		    new_entry);
12309 
12310 		/*
12311 		 *	Determine whether the entire region
12312 		 *	has been copied.
12313 		 */
12314 		src_base = src_start;
12315 		src_start = new_entry->vme_end;
12316 		new_entry = VM_MAP_ENTRY_NULL;
12317 		while ((src_start >= src_end) && (src_end != 0)) {
12318 			submap_map_t    *ptr;
12319 
12320 			if (src_map == base_map) {
12321 				/* back to the top */
12322 				break;
12323 			}
12324 
12325 			ptr = parent_maps;
12326 			assert(ptr != NULL);
12327 			parent_maps = parent_maps->next;
12328 
12329 			/* fix up the damage we did in that submap */
12330 			vm_map_simplify_range(src_map,
12331 			    src_base,
12332 			    src_end);
12333 
12334 			vm_map_unlock(src_map);
12335 			vm_map_deallocate(src_map);
12336 			vm_map_lock(ptr->parent_map);
12337 			src_map = ptr->parent_map;
12338 			src_base = ptr->base_start;
12339 			src_start = ptr->base_start + ptr->base_len;
12340 			src_end = ptr->base_end;
12341 			if (!vm_map_lookup_entry(src_map,
12342 			    src_start,
12343 			    &tmp_entry) &&
12344 			    (src_end > src_start)) {
12345 				RETURN(KERN_INVALID_ADDRESS);
12346 			}
12347 			kfree_type(submap_map_t, ptr);
12348 			if (parent_maps == NULL) {
12349 				map_share = FALSE;
12350 			}
12351 			src_entry = tmp_entry->vme_prev;
12352 		}
12353 
12354 		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12355 		    (src_start >= src_addr_unaligned + len) &&
12356 		    (src_addr_unaligned + len != 0)) {
12357 			/*
12358 			 * Stop copying now, even though we haven't reached
12359 			 * "src_end".  We'll adjust the end of the last copy
12360 			 * entry at the end, if needed.
12361 			 *
12362 			 * If src_map's aligment is different from the
12363 			 * system's page-alignment, there could be
12364 			 * extra non-map-aligned map entries between
12365 			 * the original (non-rounded) "src_addr_unaligned + len"
12366 			 * and the rounded "src_end".
12367 			 * We do not want to copy those map entries since
12368 			 * they're not part of the copied range.
12369 			 */
12370 			break;
12371 		}
12372 
12373 		if ((src_start >= src_end) && (src_end != 0)) {
12374 			break;
12375 		}
12376 
12377 		/*
12378 		 *	Verify that there are no gaps in the region
12379 		 */
12380 
12381 		tmp_entry = src_entry->vme_next;
12382 		if ((tmp_entry->vme_start != src_start) ||
12383 		    (tmp_entry == vm_map_to_entry(src_map))) {
12384 			RETURN(KERN_INVALID_ADDRESS);
12385 		}
12386 	}
12387 
12388 	/*
12389 	 * If the source should be destroyed, do it now, since the
12390 	 * copy was successful.
12391 	 */
12392 	if (src_destroy) {
12393 		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12394 
12395 		if (src_map == kernel_map) {
12396 			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12397 		}
12398 		(void)vm_map_remove_and_unlock(src_map,
12399 		    vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12400 		    src_end,
12401 		    remove_flags,
12402 		    KMEM_GUARD_NONE);
12403 	} else {
12404 		/* fix up the damage we did in the base map */
12405 		vm_map_simplify_range(
12406 			src_map,
12407 			vm_map_trunc_page(src_addr_unaligned,
12408 			VM_MAP_PAGE_MASK(src_map)),
12409 			vm_map_round_page(src_end,
12410 			VM_MAP_PAGE_MASK(src_map)));
12411 		vm_map_unlock(src_map);
12412 	}
12413 
12414 	tmp_entry = VM_MAP_ENTRY_NULL;
12415 
12416 	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12417 	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12418 		vm_map_offset_t original_start, original_offset, original_end;
12419 
12420 		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12421 
12422 		/* adjust alignment of first copy_entry's "vme_start" */
12423 		tmp_entry = vm_map_copy_first_entry(copy);
12424 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12425 			vm_map_offset_t adjustment;
12426 
12427 			original_start = tmp_entry->vme_start;
12428 			original_offset = VME_OFFSET(tmp_entry);
12429 
12430 			/* map-align the start of the first copy entry... */
12431 			adjustment = (tmp_entry->vme_start -
12432 			    vm_map_trunc_page(
12433 				    tmp_entry->vme_start,
12434 				    VM_MAP_PAGE_MASK(src_map)));
12435 			tmp_entry->vme_start -= adjustment;
12436 			VME_OFFSET_SET(tmp_entry,
12437 			    VME_OFFSET(tmp_entry) - adjustment);
12438 			copy_addr -= adjustment;
12439 			assert(tmp_entry->vme_start < tmp_entry->vme_end);
12440 			/* ... adjust for mis-aligned start of copy range */
12441 			adjustment =
12442 			    (vm_map_trunc_page(copy->offset,
12443 			    PAGE_MASK) -
12444 			    vm_map_trunc_page(copy->offset,
12445 			    VM_MAP_PAGE_MASK(src_map)));
12446 			if (adjustment) {
12447 				assert(page_aligned(adjustment));
12448 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12449 				tmp_entry->vme_start += adjustment;
12450 				VME_OFFSET_SET(tmp_entry,
12451 				    (VME_OFFSET(tmp_entry) +
12452 				    adjustment));
12453 				copy_addr += adjustment;
12454 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12455 			}
12456 
12457 			/*
12458 			 * Assert that the adjustments haven't exposed
12459 			 * more than was originally copied...
12460 			 */
12461 			assert(tmp_entry->vme_start >= original_start);
12462 			assert(VME_OFFSET(tmp_entry) >= original_offset);
12463 			/*
12464 			 * ... and that it did not adjust outside of a
12465 			 * a single 16K page.
12466 			 */
12467 			assert(vm_map_trunc_page(tmp_entry->vme_start,
12468 			    VM_MAP_PAGE_MASK(src_map)) ==
12469 			    vm_map_trunc_page(original_start,
12470 			    VM_MAP_PAGE_MASK(src_map)));
12471 		}
12472 
12473 		/* adjust alignment of last copy_entry's "vme_end" */
12474 		tmp_entry = vm_map_copy_last_entry(copy);
12475 		if (tmp_entry != vm_map_copy_to_entry(copy)) {
12476 			vm_map_offset_t adjustment;
12477 
12478 			original_end = tmp_entry->vme_end;
12479 
12480 			/* map-align the end of the last copy entry... */
12481 			tmp_entry->vme_end =
12482 			    vm_map_round_page(tmp_entry->vme_end,
12483 			    VM_MAP_PAGE_MASK(src_map));
12484 			/* ... adjust for mis-aligned end of copy range */
12485 			adjustment =
12486 			    (vm_map_round_page((copy->offset +
12487 			    copy->size),
12488 			    VM_MAP_PAGE_MASK(src_map)) -
12489 			    vm_map_round_page((copy->offset +
12490 			    copy->size),
12491 			    PAGE_MASK));
12492 			if (adjustment) {
12493 				assert(page_aligned(adjustment));
12494 				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12495 				tmp_entry->vme_end -= adjustment;
12496 				assert(tmp_entry->vme_start < tmp_entry->vme_end);
12497 			}
12498 
12499 			/*
12500 			 * Assert that the adjustments haven't exposed
12501 			 * more than was originally copied...
12502 			 */
12503 			assert(tmp_entry->vme_end <= original_end);
12504 			/*
12505 			 * ... and that it did not adjust outside of a
12506 			 * a single 16K page.
12507 			 */
12508 			assert(vm_map_round_page(tmp_entry->vme_end,
12509 			    VM_MAP_PAGE_MASK(src_map)) ==
12510 			    vm_map_round_page(original_end,
12511 			    VM_MAP_PAGE_MASK(src_map)));
12512 		}
12513 	}
12514 
12515 	/* Fix-up start and end points in copy.  This is necessary */
12516 	/* when the various entries in the copy object were picked */
12517 	/* up from different sub-maps */
12518 
12519 	tmp_entry = vm_map_copy_first_entry(copy);
12520 	copy_size = 0; /* compute actual size */
12521 	while (tmp_entry != vm_map_copy_to_entry(copy)) {
12522 		assert(VM_MAP_PAGE_ALIGNED(
12523 			    copy_addr + (tmp_entry->vme_end -
12524 			    tmp_entry->vme_start),
12525 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12526 		assert(VM_MAP_PAGE_ALIGNED(
12527 			    copy_addr,
12528 			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12529 
12530 		/*
12531 		 * The copy_entries will be injected directly into the
12532 		 * destination map and might not be "map aligned" there...
12533 		 */
12534 		tmp_entry->map_aligned = FALSE;
12535 
12536 		tmp_entry->vme_end = copy_addr +
12537 		    (tmp_entry->vme_end - tmp_entry->vme_start);
12538 		tmp_entry->vme_start = copy_addr;
12539 		assert(tmp_entry->vme_start < tmp_entry->vme_end);
12540 		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12541 		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12542 		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12543 	}
12544 
12545 	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12546 	    copy_size < copy->size) {
12547 		/*
12548 		 * The actual size of the VM map copy is smaller than what
12549 		 * was requested by the caller.  This must be because some
12550 		 * PAGE_SIZE-sized pages are missing at the end of the last
12551 		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12552 		 * The caller might not have been aware of those missing
12553 		 * pages and might not want to be aware of it, which is
12554 		 * fine as long as they don't try to access (and crash on)
12555 		 * those missing pages.
12556 		 * Let's adjust the size of the "copy", to avoid failing
12557 		 * in vm_map_copyout() or vm_map_copy_overwrite().
12558 		 */
12559 		assert(vm_map_round_page(copy_size,
12560 		    VM_MAP_PAGE_MASK(src_map)) ==
12561 		    vm_map_round_page(copy->size,
12562 		    VM_MAP_PAGE_MASK(src_map)));
12563 		copy->size = copy_size;
12564 	}
12565 
12566 	*copy_result = copy;
12567 	return KERN_SUCCESS;
12568 
12569 #undef  RETURN
12570 }
12571 
12572 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12573 vm_map_copy_extract(
12574 	vm_map_t                src_map,
12575 	vm_map_address_t        src_addr,
12576 	vm_map_size_t           len,
12577 	boolean_t               do_copy,
12578 	vm_map_copy_t           *copy_result,   /* OUT */
12579 	vm_prot_t               *cur_prot,      /* IN/OUT */
12580 	vm_prot_t               *max_prot,      /* IN/OUT */
12581 	vm_inherit_t            inheritance,
12582 	vm_map_kernel_flags_t   vmk_flags)
12583 {
12584 	vm_map_copy_t   copy;
12585 	kern_return_t   kr;
12586 	vm_prot_t required_cur_prot, required_max_prot;
12587 
12588 	/*
12589 	 *	Check for copies of zero bytes.
12590 	 */
12591 
12592 	if (len == 0) {
12593 		*copy_result = VM_MAP_COPY_NULL;
12594 		return KERN_SUCCESS;
12595 	}
12596 
12597 	/*
12598 	 *	Check that the end address doesn't overflow
12599 	 */
12600 	if (src_addr + len < src_addr) {
12601 		return KERN_INVALID_ADDRESS;
12602 	}
12603 	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12604 		return KERN_INVALID_ADDRESS;
12605 	}
12606 
12607 	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12608 		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12609 	}
12610 
12611 	required_cur_prot = *cur_prot;
12612 	required_max_prot = *max_prot;
12613 
12614 	/*
12615 	 *	Allocate a header element for the list.
12616 	 *
12617 	 *	Use the start and end in the header to
12618 	 *	remember the endpoints prior to rounding.
12619 	 */
12620 
12621 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12622 	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12623 	copy->offset = 0;
12624 	copy->size = len;
12625 
12626 	kr = vm_map_remap_extract(src_map,
12627 	    src_addr,
12628 	    len,
12629 	    do_copy,             /* copy */
12630 	    copy,
12631 	    cur_prot,            /* IN/OUT */
12632 	    max_prot,            /* IN/OUT */
12633 	    inheritance,
12634 	    vmk_flags);
12635 	if (kr != KERN_SUCCESS) {
12636 		vm_map_copy_discard(copy);
12637 		if ((kr == KERN_INVALID_ADDRESS ||
12638 		    kr == KERN_INVALID_ARGUMENT) &&
12639 		    src_map->terminated) {
12640 			/* tell the caller that this address space is gone */
12641 			kr = KERN_TERMINATED;
12642 		}
12643 		return kr;
12644 	}
12645 	if (required_cur_prot != VM_PROT_NONE) {
12646 		assert((*cur_prot & required_cur_prot) == required_cur_prot);
12647 		assert((*max_prot & required_max_prot) == required_max_prot);
12648 	}
12649 
12650 	*copy_result = copy;
12651 	return KERN_SUCCESS;
12652 }
12653 
/*
 *	vm_map_fork_share:
 *
 *	Share "old_entry" of "old_map" with "new_map": clone the entry,
 *	mark both entries "is_shared", take a reference on the backing
 *	object (or submap), and link the clone at the end of "new_map".
 *	Symmetric-copy objects are switched to MEMORY_OBJECT_COPY_DELAY,
 *	creating a shadow object first when a deferred or future
 *	symmetric copy would otherwise be broken.
 *
 *	NOTE(review): called from vm_map_fork(), which appears to hold
 *	both map locks across the entry loop — confirm at call sites.
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 *	New sharing code.  New map entry
	 *	references original object.  Internal
	 *	objects use asynchronous copy algorithm for
	 *	future copies.  First make sure we have
	 *	the right object.  If we need a shadow,
	 *	or someone else already has one, then
	 *	make a new shadow and share it.
	 */

	/* "object" is only meaningful (and only read) for non-submap entries */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		if (old_entry->use_pmap) {
			kern_return_t   result;

			/* nest the submap's pmap into the child's pmap */
			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif  /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one covering the entry */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 *	We are already using an asymmetric
		 *	copy, and therefore we already have
		 *	the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||       /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		bool is_writable;

		/*
		 *	We need to create a shadow.
		 *	There are three cases here.
		 *	In the first case, we need to
		 *	complete a deferred symmetrical
		 *	copy that we participated in.
		 *	In the second and third cases,
		 *	we need to create the shadow so
		 *	that changes that we make to the
		 *	object do not interfere with
		 *	any symmetrical copies which
		 *	have occurred (case 2) or which
		 *	might occur (case 3).
		 *
		 *	The first case is when we had
		 *	deferred shadow object creation
		 *	via the entry->needs_copy mechanism.
		 *	This mechanism only works when
		 *	only one entry points to the source
		 *	object, and we are about to create
		 *	a second entry pointing to the
		 *	same object. The problem is that
		 *	there is no way of mapping from
		 *	an object to the entries pointing
		 *	to it. (Deferred shadow creation
		 *	works with one entry because occurs
		 *	at fault time, and we walk from the
		 *	entry to the object when handling
		 *	the fault.)
		 *
		 *	The second case is when the object
		 *	to be shared has already been copied
		 *	with a symmetric copy, but we point
		 *	directly to the object without
		 *	needs_copy set in our entry. (This
		 *	can happen because different ranges
		 *	of an object can be pointed to by
		 *	different entries. In particular,
		 *	a single entry pointing to an object
		 *	can be split by a call to vm_inherit,
		 *	which, combined with task_create, can
		 *	result in the different entries
		 *	having different needs_copy values.)
		 *	The shadowed flag in the object allows
		 *	us to detect this case. The problem
		 *	with this case is that if this object
		 *	has or will have shadows, then we
		 *	must not perform an asymmetric copy
		 *	of this object, since such a copy
		 *	allows the object to be changed, which
		 *	will break the previous symmetrical
		 *	copies (which rely upon the object
		 *	not changing). In a sense, the shadowed
		 *	flag says "don't change this object".
		 *	We fix this by creating a shadow
		 *	object for this object, and sharing
		 *	that. This works because we are free
		 *	to change the shadow object (and thus
		 *	to use an asymmetric copy strategy);
		 *	this is also semantically correct,
		 *	since this object is temporary, and
		 *	therefore a copy of the object is
		 *	as good as the object itself. (This
		 *	is not true for permanent objects,
		 *	since the pager needs to see changes,
		 *	which won't happen if the changes
		 *	are made to a copy.)
		 *
		 *	The third case is when the object
		 *	to be shared has parts sticking
		 *	outside of the entry we're working
		 *	with, and thus may in the future
		 *	be subject to a symmetrical copy.
		 *	(This is a preemptive version of
		 *	case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 *	If we're making a shadow for other than
		 *	copy on write reasons, then we have
		 *	to remove write permission.
		 */

		is_writable = false;
		if (old_entry->protection & VM_PROT_WRITE) {
			is_writable = true;
#if __arm64e__
		} else if (old_entry->used_for_tpro) {
			/* TPRO mappings are writable through other means */
			is_writable = true;
#endif /* __arm64e__ */
		}
		if (!old_entry->needs_copy && is_writable) {
			vm_prot_t prot;

			/*
			 * pmap-enforced protection policies must not be
			 * silently modified; panic rather than downgrade.
			 */
			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
				    __FUNCTION__, old_map, old_map->pmap,
				    old_entry,
				    (uint64_t)old_entry->vme_start,
				    (uint64_t)old_entry->vme_end,
				    old_entry->protection);
			}

			prot = old_entry->protection & ~VM_PROT_WRITE;

			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
				    __FUNCTION__, old_map, old_map->pmap,
				    old_entry,
				    (uint64_t)old_entry->vme_start,
				    (uint64_t)old_entry->vme_end,
				    prot);
			}

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			if (old_map->mapped_in_other_pmaps) {
				/* object may be mapped elsewhere: protect via the object */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 *	If object was using a symmetric copy strategy,
	 *	change its copy strategy to the default
	 *	asymmetric copy strategy, which is copy_delay
	 *	in the non-norma case and copy_call in the
	 *	norma case. Bump the reference count for the
	 *	new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 *	Clone the entry, using object ref from above.
	 *	Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 *	If old entry's inheritance is VM_INHERIT_NONE,
	 *	the new entry is for corpse fork, remove the
	 *	write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 *	Insert the entry into the new map -- we
	 *	know we're inserting at the end of the new
	 *	map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 *	Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
12935 
/*
 *	vm_map_fork_copy:
 *
 *	Copy the region covered by *old_entry_p from "old_map" into
 *	"new_map" using vm_map_copyin_internal() with
 *	VM_MAP_COPYIN_USE_MAXPROT.  "old_map" is unlocked for the
 *	duration of the copyin and re-locked before returning, so the
 *	map may have changed; *old_entry_p is always re-derived from a
 *	fresh lookup.
 *
 *	Returns TRUE on success (copy inserted at the end of "new_map",
 *	*old_entry_p advanced past the copied region), FALSE on copyin
 *	failure (*old_entry_p advanced past the offending region so the
 *	caller can continue).
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	/* remember the insertion point before dropping the old map's lock */
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 *	Use maxprot version of copyin because we
	 *	care about whether this memory can ever
	 *	be accessed, not just whether it's accessible
	 *	right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 *	The map might have changed while it
		 *	was unlocked, check it again.  Skip
		 *	any blank space or permanently
		 *	unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 *	Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 *	Pick up the traversal at the end of
	 *	the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* "start" falls in a hole: resume at the next entry */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
13018 
13019 #if PMAP_FORK_NEST
13020 #define PMAP_FORK_NEST_DEBUG 0
13021 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)13022 vm_map_fork_unnest(
13023 	pmap_t new_pmap,
13024 	vm_map_offset_t pre_nested_start,
13025 	vm_map_offset_t pre_nested_end,
13026 	vm_map_offset_t start,
13027 	vm_map_offset_t end)
13028 {
13029 	kern_return_t kr;
13030 	vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13031 
13032 	assertf(pre_nested_start <= pre_nested_end,
13033 	    "pre_nested start 0x%llx end 0x%llx",
13034 	    (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13035 	assertf(start <= end,
13036 	    "start 0x%llx end 0x%llx",
13037 	    (uint64_t) start, (uint64_t)end);
13038 
13039 	if (pre_nested_start == pre_nested_end) {
13040 		/* nothing was pre-nested: done */
13041 		return;
13042 	}
13043 	if (end <= pre_nested_start) {
13044 		/* fully before pre-nested range: done */
13045 		return;
13046 	}
13047 	if (start >= pre_nested_end) {
13048 		/* fully after pre-nested range: done */
13049 		return;
13050 	}
13051 	/* ignore parts of range outside of pre_nested range */
13052 	if (start < pre_nested_start) {
13053 		start = pre_nested_start;
13054 	}
13055 	if (end > pre_nested_end) {
13056 		end = pre_nested_end;
13057 	}
13058 	nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13059 	start_unnest = start & ~nesting_mask;
13060 	end_unnest = (end + nesting_mask) & ~nesting_mask;
13061 	kr = pmap_unnest(new_pmap,
13062 	    (addr64_t)start_unnest,
13063 	    (uint64_t)(end_unnest - start_unnest));
13064 #if PMAP_FORK_NEST_DEBUG
13065 	printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13066 #endif /* PMAP_FORK_NEST_DEBUG */
13067 	assertf(kr == KERN_SUCCESS,
13068 	    "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13069 	    (uint64_t)start, (uint64_t)end, new_pmap,
13070 	    (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13071 	    kr);
13072 }
13073 #endif /* PMAP_FORK_NEST */
13074 
13075 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13076 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13077 {
13078 	new_map->size_limit = old_map->size_limit;
13079 	new_map->data_limit = old_map->data_limit;
13080 	new_map->user_wire_limit = old_map->user_wire_limit;
13081 	new_map->reserved_regions = old_map->reserved_regions;
13082 }
13083 
13084 /*
13085  *	vm_map_fork:
13086  *
13087  *	Create and return a new map based on the old
13088  *	map, according to the inheritance values on the
13089  *	regions in that map and the options.
13090  *
13091  *	The source map must not be locked.
13092  */
13093 vm_map_t
vm_map_fork(ledger_t ledger,vm_map_t old_map,int options)13094 vm_map_fork(
13095 	ledger_t        ledger,
13096 	vm_map_t        old_map,
13097 	int             options)
13098 {
13099 	pmap_t          new_pmap;
13100 	vm_map_t        new_map;
13101 	vm_map_entry_t  old_entry;
13102 	vm_map_size_t   new_size = 0, entry_size;
13103 	vm_map_entry_t  new_entry;
13104 	boolean_t       src_needs_copy;
13105 	boolean_t       new_entry_needs_copy;
13106 	boolean_t       pmap_is64bit;
13107 	int             vm_map_copyin_flags;
13108 	vm_inherit_t    old_entry_inheritance;
13109 	int             map_create_options;
13110 	kern_return_t   footprint_collect_kr;
13111 
13112 	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13113 	    VM_MAP_FORK_PRESERVE_PURGEABLE |
13114 	    VM_MAP_FORK_CORPSE_FOOTPRINT |
13115 	    VM_MAP_FORK_SHARE_IF_OWNED)) {
13116 		/* unsupported option */
13117 		return VM_MAP_NULL;
13118 	}
13119 
13120 	pmap_is64bit =
13121 #if defined(__i386__) || defined(__x86_64__)
13122 	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13123 #elif defined(__arm64__)
13124 	    old_map->pmap->is_64bit;
13125 #else
13126 #error Unknown architecture.
13127 #endif
13128 
13129 	unsigned int pmap_flags = 0;
13130 	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13131 #if defined(HAS_APPLE_PAC)
13132 	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13133 #endif
13134 #if CONFIG_ROSETTA
13135 	pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13136 #endif
13137 #if PMAP_CREATE_FORCE_4K_PAGES
13138 	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13139 	    PAGE_SIZE != FOURK_PAGE_SIZE) {
13140 		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13141 	}
13142 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13143 	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13144 	if (new_pmap == NULL) {
13145 		return VM_MAP_NULL;
13146 	}
13147 
13148 	vm_map_reference(old_map);
13149 	vm_map_lock(old_map);
13150 
13151 	map_create_options = 0;
13152 	if (old_map->hdr.entries_pageable) {
13153 		map_create_options |= VM_MAP_CREATE_PAGEABLE;
13154 	}
13155 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13156 		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13157 		footprint_collect_kr = KERN_SUCCESS;
13158 	}
13159 	new_map = vm_map_create_options(new_pmap,
13160 	    old_map->min_offset,
13161 	    old_map->max_offset,
13162 	    map_create_options);
13163 
13164 	/* inherit cs_enforcement */
13165 	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13166 
13167 	vm_map_lock(new_map);
13168 	vm_commit_pagezero_status(new_map);
13169 	/* inherit the parent map's page size */
13170 	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13171 
13172 	/* inherit the parent rlimits */
13173 	vm_map_inherit_limits(new_map, old_map);
13174 
13175 #if CONFIG_MAP_RANGES
13176 	/* inherit the parent map's VM ranges */
13177 	vm_map_range_fork(new_map, old_map);
13178 #endif
13179 
13180 #if CODE_SIGNING_MONITOR
13181 	/* Prepare the monitor for the fork */
13182 	csm_fork_prepare(old_map->pmap, new_pmap);
13183 #endif
13184 
13185 #if PMAP_FORK_NEST
13186 	/*
13187 	 * Pre-nest the shared region's pmap.
13188 	 */
13189 	vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13190 	pmap_fork_nest(old_map->pmap, new_pmap,
13191 	    &pre_nested_start, &pre_nested_end);
13192 #if PMAP_FORK_NEST_DEBUG
13193 	printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13194 #endif /* PMAP_FORK_NEST_DEBUG */
13195 #endif /* PMAP_FORK_NEST */
13196 
13197 	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13198 		/*
13199 		 * Abort any corpse collection if the system is shutting down.
13200 		 */
13201 		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13202 		    get_system_inshutdown()) {
13203 #if PMAP_FORK_NEST
13204 			new_entry = vm_map_last_entry(new_map);
13205 			if (new_entry == vm_map_to_entry(new_map)) {
13206 				/* unnest all that was pre-nested */
13207 				vm_map_fork_unnest(new_pmap,
13208 				    pre_nested_start, pre_nested_end,
13209 				    vm_map_min(new_map), vm_map_max(new_map));
13210 			} else if (new_entry->vme_end < vm_map_max(new_map)) {
13211 				/* unnest hole at the end, if pre-nested */
13212 				vm_map_fork_unnest(new_pmap,
13213 				    pre_nested_start, pre_nested_end,
13214 				    new_entry->vme_end, vm_map_max(new_map));
13215 			}
13216 #endif /* PMAP_FORK_NEST */
13217 			vm_map_corpse_footprint_collect_done(new_map);
13218 			vm_map_unlock(new_map);
13219 			vm_map_unlock(old_map);
13220 			vm_map_deallocate(new_map);
13221 			vm_map_deallocate(old_map);
13222 			printf("Aborting corpse map due to system shutdown\n");
13223 			return VM_MAP_NULL;
13224 		}
13225 
13226 		entry_size = old_entry->vme_end - old_entry->vme_start;
13227 
13228 #if PMAP_FORK_NEST
13229 		/*
13230 		 * Undo any unnecessary pre-nesting.
13231 		 */
13232 		vm_map_offset_t prev_end;
13233 		if (old_entry == vm_map_first_entry(old_map)) {
13234 			prev_end = vm_map_min(old_map);
13235 		} else {
13236 			prev_end = old_entry->vme_prev->vme_end;
13237 		}
13238 		if (prev_end < old_entry->vme_start) {
13239 			/* unnest hole before this entry, if pre-nested */
13240 			vm_map_fork_unnest(new_pmap,
13241 			    pre_nested_start, pre_nested_end,
13242 			    prev_end, old_entry->vme_start);
13243 		}
13244 		if (old_entry->is_sub_map && old_entry->use_pmap) {
13245 			/* keep this entry nested in the child */
13246 #if PMAP_FORK_NEST_DEBUG
13247 			printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13248 #endif /* PMAP_FORK_NEST_DEBUG */
13249 		} else {
13250 			/* undo nesting for this entry, if pre-nested */
13251 			vm_map_fork_unnest(new_pmap,
13252 			    pre_nested_start, pre_nested_end,
13253 			    old_entry->vme_start, old_entry->vme_end);
13254 		}
13255 #endif /* PMAP_FORK_NEST */
13256 
13257 		old_entry_inheritance = old_entry->inheritance;
13258 		/*
13259 		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
13260 		 * share VM_INHERIT_NONE entries that are not backed by a
13261 		 * device pager.
13262 		 */
13263 		if (old_entry_inheritance == VM_INHERIT_NONE &&
13264 		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13265 		    (old_entry->protection & VM_PROT_READ) &&
13266 		    !(!old_entry->is_sub_map &&
13267 		    VME_OBJECT(old_entry) != NULL &&
13268 		    VME_OBJECT(old_entry)->pager != NULL &&
13269 		    is_device_pager_ops(
13270 			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13271 			old_entry_inheritance = VM_INHERIT_SHARE;
13272 		}
13273 		if (old_entry_inheritance == VM_INHERIT_COPY &&
13274 		    (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13275 		    !old_entry->is_sub_map &&
13276 		    VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13277 			vm_object_t object;
13278 			task_t owner;
13279 			object = VME_OBJECT(old_entry);
13280 			owner = VM_OBJECT_OWNER(object);
13281 			if (owner != TASK_NULL &&
13282 			    owner->map == old_map) {
13283 				/*
13284 				 * This mapping points at a VM object owned
13285 				 * by the task being forked.
13286 				 * Some tools reporting memory accounting
13287 				 * info rely on the object ID, so share this
13288 				 * mapping instead of copying, to make the
13289 				 * corpse look exactly like the original
13290 				 * task in that respect.
13291 				 */
13292 				assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13293 				old_entry_inheritance = VM_INHERIT_SHARE;
13294 			}
13295 		}
13296 
13297 		if (old_entry_inheritance != VM_INHERIT_NONE &&
13298 		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13299 		    footprint_collect_kr == KERN_SUCCESS) {
13300 			/*
13301 			 * The corpse won't have old_map->pmap to query
13302 			 * footprint information, so collect that data now
13303 			 * and store it in new_map->vmmap_corpse_footprint
13304 			 * for later autopsy.
13305 			 */
13306 			footprint_collect_kr =
13307 			    vm_map_corpse_footprint_collect(old_map,
13308 			    old_entry,
13309 			    new_map);
13310 		}
13311 
13312 		switch (old_entry_inheritance) {
13313 		case VM_INHERIT_NONE:
13314 			break;
13315 
13316 		case VM_INHERIT_SHARE:
13317 			vm_map_fork_share(old_map, old_entry, new_map);
13318 			new_size += entry_size;
13319 			break;
13320 
13321 		case VM_INHERIT_COPY:
13322 
13323 			/*
13324 			 *	Inline the copy_quickly case;
13325 			 *	upon failure, fall back on call
13326 			 *	to vm_map_fork_copy.
13327 			 */
13328 
13329 			if (old_entry->is_sub_map) {
13330 				break;
13331 			}
13332 			if ((old_entry->wired_count != 0) ||
13333 			    ((VME_OBJECT(old_entry) != NULL) &&
13334 			    (VME_OBJECT(old_entry)->true_share))) {
13335 				goto slow_vm_map_fork_copy;
13336 			}
13337 
13338 			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13339 			vm_map_entry_copy(old_map, new_entry, old_entry);
13340 			if (old_entry->vme_permanent) {
13341 				/* inherit "permanent" on fork() */
13342 				new_entry->vme_permanent = TRUE;
13343 			}
13344 
13345 			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13346 				new_map->jit_entry_exists = TRUE;
13347 			}
13348 
13349 			if (new_entry->is_sub_map) {
13350 				/* clear address space specifics */
13351 				new_entry->use_pmap = FALSE;
13352 			} else {
13353 				/*
13354 				 * We're dealing with a copy-on-write operation,
13355 				 * so the resulting mapping should not inherit
13356 				 * the original mapping's accounting settings.
13357 				 * "iokit_acct" should have been cleared in
13358 				 * vm_map_entry_copy().
13359 				 * "use_pmap" should be reset to its default
13360 				 * (TRUE) so that the new mapping gets
13361 				 * accounted for in the task's memory footprint.
13362 				 */
13363 				assert(!new_entry->iokit_acct);
13364 				new_entry->use_pmap = TRUE;
13365 			}
13366 
13367 			if (!vm_object_copy_quickly(
13368 				    VME_OBJECT(new_entry),
13369 				    VME_OFFSET(old_entry),
13370 				    (old_entry->vme_end -
13371 				    old_entry->vme_start),
13372 				    &src_needs_copy,
13373 				    &new_entry_needs_copy)) {
13374 				vm_map_entry_dispose(new_entry);
13375 				goto slow_vm_map_fork_copy;
13376 			}
13377 
13378 			/*
13379 			 *	Handle copy-on-write obligations
13380 			 */
13381 
13382 			if (src_needs_copy && !old_entry->needs_copy) {
13383 				vm_prot_t prot;
13384 
13385 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13386 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13387 					    __FUNCTION__,
13388 					    old_map, old_map->pmap, old_entry,
13389 					    (uint64_t)old_entry->vme_start,
13390 					    (uint64_t)old_entry->vme_end,
13391 					    old_entry->protection);
13392 				}
13393 
13394 				prot = old_entry->protection & ~VM_PROT_WRITE;
13395 
13396 				if (override_nx(old_map, VME_ALIAS(old_entry))
13397 				    && prot) {
13398 					prot |= VM_PROT_EXECUTE;
13399 				}
13400 
13401 				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13402 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13403 					    __FUNCTION__,
13404 					    old_map, old_map->pmap, old_entry,
13405 					    (uint64_t)old_entry->vme_start,
13406 					    (uint64_t)old_entry->vme_end,
13407 					    prot);
13408 				}
13409 
13410 				vm_object_pmap_protect(
13411 					VME_OBJECT(old_entry),
13412 					VME_OFFSET(old_entry),
13413 					(old_entry->vme_end -
13414 					old_entry->vme_start),
13415 					((old_entry->is_shared
13416 					|| old_map->mapped_in_other_pmaps)
13417 					? PMAP_NULL :
13418 					old_map->pmap),
13419 					VM_MAP_PAGE_SIZE(old_map),
13420 					old_entry->vme_start,
13421 					prot);
13422 
13423 				assert(old_entry->wired_count == 0);
13424 				old_entry->needs_copy = TRUE;
13425 			}
13426 			new_entry->needs_copy = new_entry_needs_copy;
13427 
13428 			/*
13429 			 *	Insert the entry at the end
13430 			 *	of the map.
13431 			 */
13432 
13433 			vm_map_store_entry_link(new_map,
13434 			    vm_map_last_entry(new_map),
13435 			    new_entry,
13436 			    VM_MAP_KERNEL_FLAGS_NONE);
13437 			new_size += entry_size;
13438 			break;
13439 
13440 slow_vm_map_fork_copy:
13441 			vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13442 			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13443 				vm_map_copyin_flags |=
13444 				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13445 			}
13446 			if (vm_map_fork_copy(old_map,
13447 			    &old_entry,
13448 			    new_map,
13449 			    vm_map_copyin_flags)) {
13450 				new_size += entry_size;
13451 			}
13452 			continue;
13453 		}
13454 		old_entry = old_entry->vme_next;
13455 	}
13456 
13457 #if PMAP_FORK_NEST
13458 	new_entry = vm_map_last_entry(new_map);
13459 	if (new_entry == vm_map_to_entry(new_map)) {
13460 		/* unnest all that was pre-nested */
13461 		vm_map_fork_unnest(new_pmap,
13462 		    pre_nested_start, pre_nested_end,
13463 		    vm_map_min(new_map), vm_map_max(new_map));
13464 	} else if (new_entry->vme_end < vm_map_max(new_map)) {
13465 		/* unnest hole at the end, if pre-nested */
13466 		vm_map_fork_unnest(new_pmap,
13467 		    pre_nested_start, pre_nested_end,
13468 		    new_entry->vme_end, vm_map_max(new_map));
13469 	}
13470 #endif /* PMAP_FORK_NEST */
13471 
13472 #if defined(__arm64__)
13473 	pmap_insert_commpage(new_map->pmap);
13474 #endif /* __arm64__ */
13475 
13476 	new_map->size = new_size;
13477 
13478 	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13479 		vm_map_corpse_footprint_collect_done(new_map);
13480 	}
13481 
13482 	/* Propagate JIT entitlement for the pmap layer. */
13483 	if (pmap_get_jit_entitled(old_map->pmap)) {
13484 		/* Tell the pmap that it supports JIT. */
13485 		pmap_set_jit_entitled(new_map->pmap);
13486 	}
13487 
13488 	/* Propagate TPRO settings for the pmap layer */
13489 	if (pmap_get_tpro(old_map->pmap)) {
13490 		/* Tell the pmap that it supports TPRO */
13491 		pmap_set_tpro(new_map->pmap);
13492 	}
13493 
13494 
13495 	vm_map_unlock(new_map);
13496 	vm_map_unlock(old_map);
13497 	vm_map_deallocate(old_map);
13498 
13499 	return new_map;
13500 }
13501 
13502 /*
13503  * vm_map_exec:
13504  *
13505  *      Setup the "new_map" with the proper execution environment according
13506  *	to the type of executable (platform, 64bit, chroot environment).
13507  *	Map the comm page and shared region, etc...
13508  */
13509 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13510 vm_map_exec(
13511 	vm_map_t        new_map,
13512 	task_t          task,
13513 	boolean_t       is64bit,
13514 	void            *fsroot,
13515 	cpu_type_t      cpu,
13516 	cpu_subtype_t   cpu_subtype,
13517 	boolean_t       reslide,
13518 	boolean_t       is_driverkit,
13519 	uint32_t        rsr_version)
13520 {
13521 	SHARED_REGION_TRACE_DEBUG(
13522 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13523 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13524 		(void *)VM_KERNEL_ADDRPERM(new_map),
13525 		(void *)VM_KERNEL_ADDRPERM(task),
13526 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13527 		cpu,
13528 		cpu_subtype));
13529 	(void) vm_commpage_enter(new_map, task, is64bit);
13530 
13531 	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13532 
13533 	SHARED_REGION_TRACE_DEBUG(
13534 		("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13535 		(void *)VM_KERNEL_ADDRPERM(current_task()),
13536 		(void *)VM_KERNEL_ADDRPERM(new_map),
13537 		(void *)VM_KERNEL_ADDRPERM(task),
13538 		(void *)VM_KERNEL_ADDRPERM(fsroot),
13539 		cpu,
13540 		cpu_subtype));
13541 
13542 	/*
13543 	 * Some devices have region(s) of memory that shouldn't get allocated by
13544 	 * user processes. The following code creates dummy vm_map_entry_t's for each
13545 	 * of the regions that needs to be reserved to prevent any allocations in
13546 	 * those regions.
13547 	 */
13548 	kern_return_t kr = KERN_FAILURE;
13549 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13550 	vmk_flags.vmkf_beyond_max = true;
13551 
13552 	const struct vm_reserved_region *regions = NULL;
13553 	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13554 	assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13555 
13556 	for (size_t i = 0; i < num_regions; ++i) {
13557 		vm_map_offset_t address = regions[i].vmrr_addr;
13558 
13559 		kr = vm_map_enter(
13560 			new_map,
13561 			&address,
13562 			regions[i].vmrr_size,
13563 			(vm_map_offset_t)0,
13564 			vmk_flags,
13565 			VM_OBJECT_NULL,
13566 			(vm_object_offset_t)0,
13567 			FALSE,
13568 			VM_PROT_NONE,
13569 			VM_PROT_NONE,
13570 			VM_INHERIT_COPY);
13571 
13572 		if (kr != KERN_SUCCESS) {
13573 			panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13574 		}
13575 	}
13576 
13577 	new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13578 
13579 	return KERN_SUCCESS;
13580 }
13581 
/*
 * Telemetry counters for vm_map_lookup_and_lock_object()'s copy-on-write
 * handling of submap entries.  For each of the three COW resolution paths
 * (vm_object_copy_slowly, vm_object_copy_strategically, shadow setup):
 * "_count" is how many times the path was taken, "_size" accumulates the
 * bytes copied, and "_max" records the largest single copy.  "_restart"
 * counts lookups retried after the map changed under us, and "_error"
 * counts copies that failed outright.
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13595 /*
13596  *	vm_map_lookup_and_lock_object:
13597  *
13598  *	Finds the VM object, offset, and
13599  *	protection for a given virtual address in the
13600  *	specified map, assuming a page fault of the
13601  *	type specified.
13602  *
13603  *	Returns the (object, offset, protection) for
13604  *	this address, whether it is wired down, and whether
13605  *	this map has the only reference to the data in question.
13606  *	In order to later verify this lookup, a "version"
13607  *	is returned.
13608  *	If contended != NULL, *contended will be set to
13609  *	true iff the thread had to spin or block to acquire
13610  *	an exclusive lock.
13611  *
13612  *	The map MUST be locked by the caller and WILL be
13613  *	locked on exit.  In order to guarantee the
13614  *	existence of the returned object, it is returned
13615  *	locked.
13616  *
13617  *	If a lookup is requested with "write protection"
13618  *	specified, the map may be changed to perform virtual
13619  *	copying operations, although the data referenced will
13620  *	remain the same.
13621  */
13622 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13623 vm_map_lookup_and_lock_object(
13624 	vm_map_t                *var_map,       /* IN/OUT */
13625 	vm_map_offset_t         vaddr,
13626 	vm_prot_t               fault_type,
13627 	int                     object_lock_type,
13628 	vm_map_version_t        *out_version,   /* OUT */
13629 	vm_object_t             *object,        /* OUT */
13630 	vm_object_offset_t      *offset,        /* OUT */
13631 	vm_prot_t               *out_prot,      /* OUT */
13632 	boolean_t               *wired,         /* OUT */
13633 	vm_object_fault_info_t  fault_info,     /* OUT */
13634 	vm_map_t                *real_map,      /* OUT */
13635 	bool                    *contended)     /* OUT */
13636 {
13637 	vm_map_entry_t                  entry;
13638 	vm_map_t                        map = *var_map;
13639 	vm_map_t                        old_map = *var_map;
13640 	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
13641 	vm_map_offset_t                 cow_parent_vaddr = 0;
13642 	vm_map_offset_t                 old_start = 0;
13643 	vm_map_offset_t                 old_end = 0;
13644 	vm_prot_t                       prot;
13645 	boolean_t                       mask_protections;
13646 	boolean_t                       force_copy;
13647 	boolean_t                       no_force_copy_if_executable;
13648 	boolean_t                       submap_needed_copy;
13649 	vm_prot_t                       original_fault_type;
13650 	vm_map_size_t                   fault_page_mask;
13651 
13652 	/*
13653 	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13654 	 * as a mask against the mapping's actual protections, not as an
13655 	 * absolute value.
13656 	 */
13657 	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13658 	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13659 	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13660 	fault_type &= VM_PROT_ALL;
13661 	original_fault_type = fault_type;
13662 	if (contended) {
13663 		*contended = false;
13664 	}
13665 
13666 	*real_map = map;
13667 
13668 	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13669 	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13670 
13671 RetryLookup:
13672 	fault_type = original_fault_type;
13673 
13674 	/*
13675 	 *	If the map has an interesting hint, try it before calling
13676 	 *	full blown lookup routine.
13677 	 */
13678 	entry = map->hint;
13679 
13680 	if ((entry == vm_map_to_entry(map)) ||
13681 	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13682 		vm_map_entry_t  tmp_entry;
13683 
13684 		/*
13685 		 *	Entry was either not a valid hint, or the vaddr
13686 		 *	was not contained in the entry, so do a full lookup.
13687 		 */
13688 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13689 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13690 				vm_map_unlock(cow_sub_map_parent);
13691 			}
13692 			if ((*real_map != map)
13693 			    && (*real_map != cow_sub_map_parent)) {
13694 				vm_map_unlock(*real_map);
13695 			}
13696 			return KERN_INVALID_ADDRESS;
13697 		}
13698 
13699 		entry = tmp_entry;
13700 	}
13701 	if (map == old_map) {
13702 		old_start = entry->vme_start;
13703 		old_end = entry->vme_end;
13704 	}
13705 
13706 	/*
13707 	 *	Handle submaps.  Drop lock on upper map, submap is
13708 	 *	returned locked.
13709 	 */
13710 
13711 	submap_needed_copy = FALSE;
13712 submap_recurse:
13713 	if (entry->is_sub_map) {
13714 		vm_map_offset_t         local_vaddr;
13715 		vm_map_offset_t         end_delta;
13716 		vm_map_offset_t         start_delta;
13717 		vm_map_offset_t         top_entry_saved_start;
13718 		vm_object_offset_t      top_entry_saved_offset;
13719 		vm_map_entry_t          submap_entry, saved_submap_entry;
13720 		vm_object_offset_t      submap_entry_offset;
13721 		vm_object_size_t        submap_entry_size;
13722 		vm_prot_t               subentry_protection;
13723 		vm_prot_t               subentry_max_protection;
13724 		boolean_t               subentry_no_copy_on_read;
13725 		boolean_t               subentry_permanent;
13726 		boolean_t               subentry_csm_associated;
13727 #if __arm64e__
13728 		boolean_t               subentry_used_for_tpro;
13729 #endif /* __arm64e__ */
13730 		boolean_t               mapped_needs_copy = FALSE;
13731 		vm_map_version_t        version;
13732 
13733 		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13734 		    "map %p (%d) entry %p submap %p (%d)\n",
13735 		    map, VM_MAP_PAGE_SHIFT(map), entry,
13736 		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13737 
13738 		local_vaddr = vaddr;
13739 		top_entry_saved_start = entry->vme_start;
13740 		top_entry_saved_offset = VME_OFFSET(entry);
13741 
13742 		if ((entry->use_pmap &&
13743 		    !((fault_type & VM_PROT_WRITE) ||
13744 		    force_copy))) {
13745 			/* if real_map equals map we unlock below */
13746 			if ((*real_map != map) &&
13747 			    (*real_map != cow_sub_map_parent)) {
13748 				vm_map_unlock(*real_map);
13749 			}
13750 			*real_map = VME_SUBMAP(entry);
13751 		}
13752 
13753 		if (entry->needs_copy &&
13754 		    ((fault_type & VM_PROT_WRITE) ||
13755 		    force_copy)) {
13756 			if (!mapped_needs_copy) {
13757 				if (vm_map_lock_read_to_write(map)) {
13758 					vm_map_lock_read(map);
13759 					*real_map = map;
13760 					goto RetryLookup;
13761 				}
13762 				vm_map_lock_read(VME_SUBMAP(entry));
13763 				*var_map = VME_SUBMAP(entry);
13764 				cow_sub_map_parent = map;
13765 				/* reset base to map before cow object */
13766 				/* this is the map which will accept   */
13767 				/* the new cow object */
13768 				old_start = entry->vme_start;
13769 				old_end = entry->vme_end;
13770 				cow_parent_vaddr = vaddr;
13771 				mapped_needs_copy = TRUE;
13772 			} else {
13773 				vm_map_lock_read(VME_SUBMAP(entry));
13774 				*var_map = VME_SUBMAP(entry);
13775 				if ((cow_sub_map_parent != map) &&
13776 				    (*real_map != map)) {
13777 					vm_map_unlock(map);
13778 				}
13779 			}
13780 		} else {
13781 			if (entry->needs_copy) {
13782 				submap_needed_copy = TRUE;
13783 			}
13784 			vm_map_lock_read(VME_SUBMAP(entry));
13785 			*var_map = VME_SUBMAP(entry);
13786 			/* leave map locked if it is a target */
13787 			/* cow sub_map above otherwise, just  */
13788 			/* follow the maps down to the object */
13789 			/* here we unlock knowing we are not  */
13790 			/* revisiting the map.  */
13791 			if ((*real_map != map) && (map != cow_sub_map_parent)) {
13792 				vm_map_unlock_read(map);
13793 			}
13794 		}
13795 
13796 		entry = NULL;
13797 		map = *var_map;
13798 
13799 		/* calculate the offset in the submap for vaddr */
13800 		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
13801 		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13802 		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13803 		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
13804 
13805 RetrySubMap:
13806 		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13807 			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13808 				vm_map_unlock(cow_sub_map_parent);
13809 			}
13810 			if ((*real_map != map)
13811 			    && (*real_map != cow_sub_map_parent)) {
13812 				vm_map_unlock(*real_map);
13813 			}
13814 			*real_map = map;
13815 			return KERN_INVALID_ADDRESS;
13816 		}
13817 
13818 		/* find the attenuated shadow of the underlying object */
13819 		/* on our target map */
13820 
13821 		/* in english the submap object may extend beyond the     */
13822 		/* region mapped by the entry or, may only fill a portion */
13823 		/* of it.  For our purposes, we only care if the object   */
13824 		/* doesn't fill.  In this case the area which will        */
13825 		/* ultimately be clipped in the top map will only need    */
13826 		/* to be as big as the portion of the underlying entry    */
13827 		/* which is mapped */
13828 		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
13829 		    submap_entry->vme_start - top_entry_saved_offset : 0;
13830 
13831 		end_delta =
13832 		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
13833 		    submap_entry->vme_end ?
13834 		    0 : (top_entry_saved_offset +
13835 		    (old_end - old_start))
13836 		    - submap_entry->vme_end;
13837 
13838 		old_start += start_delta;
13839 		old_end -= end_delta;
13840 
13841 		if (submap_entry->is_sub_map) {
13842 			entry = submap_entry;
13843 			vaddr = local_vaddr;
13844 			goto submap_recurse;
13845 		}
13846 
13847 		if (((fault_type & VM_PROT_WRITE) ||
13848 		    force_copy)
13849 		    && cow_sub_map_parent) {
13850 			vm_object_t     sub_object, copy_object;
13851 			vm_object_offset_t copy_offset;
13852 			vm_map_offset_t local_start;
13853 			vm_map_offset_t local_end;
13854 			boolean_t       object_copied = FALSE;
13855 			vm_object_offset_t object_copied_offset = 0;
13856 			boolean_t       object_copied_needs_copy = FALSE;
13857 			kern_return_t   kr = KERN_SUCCESS;
13858 
13859 			if (vm_map_lock_read_to_write(map)) {
13860 				vm_map_lock_read(map);
13861 				old_start -= start_delta;
13862 				old_end += end_delta;
13863 				goto RetrySubMap;
13864 			}
13865 
13866 
13867 			sub_object = VME_OBJECT(submap_entry);
13868 			if (sub_object == VM_OBJECT_NULL) {
13869 				sub_object =
13870 				    vm_object_allocate(
13871 					(vm_map_size_t)
13872 					(submap_entry->vme_end -
13873 					submap_entry->vme_start));
13874 				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13875 				VME_OFFSET_SET(submap_entry, 0);
13876 				assert(!submap_entry->is_sub_map);
13877 				assert(submap_entry->use_pmap);
13878 			}
13879 			local_start =  local_vaddr -
13880 			    (cow_parent_vaddr - old_start);
13881 			local_end = local_vaddr +
13882 			    (old_end - cow_parent_vaddr);
13883 			vm_map_clip_start(map, submap_entry, local_start);
13884 			vm_map_clip_end(map, submap_entry, local_end);
13885 			if (submap_entry->is_sub_map) {
13886 				/* unnesting was done when clipping */
13887 				assert(!submap_entry->use_pmap);
13888 			}
13889 
13890 			/* This is the COW case, lets connect */
13891 			/* an entry in our space to the underlying */
13892 			/* object in the submap, bypassing the  */
13893 			/* submap. */
13894 			submap_entry_offset = VME_OFFSET(submap_entry);
13895 			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13896 
13897 			if ((submap_entry->wired_count != 0 ||
13898 			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13899 			    (submap_entry->protection & VM_PROT_EXECUTE) &&
13900 			    no_force_copy_if_executable) {
13901 //				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13902 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13903 					vm_map_unlock(cow_sub_map_parent);
13904 				}
13905 				if ((*real_map != map)
13906 				    && (*real_map != cow_sub_map_parent)) {
13907 					vm_map_unlock(*real_map);
13908 				}
13909 				*real_map = map;
13910 				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13911 				vm_map_lock_write_to_read(map);
13912 				kr = KERN_PROTECTION_FAILURE;
13913 				DTRACE_VM4(submap_no_copy_executable,
13914 				    vm_map_t, map,
13915 				    vm_object_offset_t, submap_entry_offset,
13916 				    vm_object_size_t, submap_entry_size,
13917 				    int, kr);
13918 				return kr;
13919 			}
13920 
13921 			if (submap_entry->wired_count != 0) {
13922 				vm_object_reference(sub_object);
13923 
13924 				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13925 				    "submap_entry %p offset 0x%llx\n",
13926 				    submap_entry, VME_OFFSET(submap_entry));
13927 
13928 				DTRACE_VM6(submap_copy_slowly,
13929 				    vm_map_t, cow_sub_map_parent,
13930 				    vm_map_offset_t, vaddr,
13931 				    vm_map_t, map,
13932 				    vm_object_size_t, submap_entry_size,
13933 				    int, submap_entry->wired_count,
13934 				    int, sub_object->copy_strategy);
13935 
13936 				saved_submap_entry = submap_entry;
13937 				version.main_timestamp = map->timestamp;
13938 				vm_map_unlock(map); /* Increments timestamp by 1 */
13939 				submap_entry = VM_MAP_ENTRY_NULL;
13940 
13941 				vm_object_lock(sub_object);
13942 				kr = vm_object_copy_slowly(sub_object,
13943 				    submap_entry_offset,
13944 				    submap_entry_size,
13945 				    FALSE,
13946 				    &copy_object);
13947 				object_copied = TRUE;
13948 				object_copied_offset = 0;
13949 				/* 4k: account for extra offset in physical page */
13950 				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13951 				object_copied_needs_copy = FALSE;
13952 				vm_object_deallocate(sub_object);
13953 
13954 				vm_map_lock(map);
13955 
13956 				if (kr != KERN_SUCCESS &&
13957 				    kr != KERN_MEMORY_RESTART_COPY) {
13958 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13959 						vm_map_unlock(cow_sub_map_parent);
13960 					}
13961 					if ((*real_map != map)
13962 					    && (*real_map != cow_sub_map_parent)) {
13963 						vm_map_unlock(*real_map);
13964 					}
13965 					*real_map = map;
13966 					vm_object_deallocate(copy_object);
13967 					copy_object = VM_OBJECT_NULL;
13968 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13969 					vm_map_lock_write_to_read(map);
13970 					DTRACE_VM4(submap_copy_error_slowly,
13971 					    vm_object_t, sub_object,
13972 					    vm_object_offset_t, submap_entry_offset,
13973 					    vm_object_size_t, submap_entry_size,
13974 					    int, kr);
13975 					vm_map_lookup_and_lock_object_copy_slowly_error++;
13976 					return kr;
13977 				}
13978 
13979 				if ((kr == KERN_SUCCESS) &&
13980 				    (version.main_timestamp + 1) == map->timestamp) {
13981 					submap_entry = saved_submap_entry;
13982 				} else {
13983 					saved_submap_entry = NULL;
13984 					old_start -= start_delta;
13985 					old_end += end_delta;
13986 					vm_object_deallocate(copy_object);
13987 					copy_object = VM_OBJECT_NULL;
13988 					vm_map_lock_write_to_read(map);
13989 					vm_map_lookup_and_lock_object_copy_slowly_restart++;
13990 					goto RetrySubMap;
13991 				}
13992 				vm_map_lookup_and_lock_object_copy_slowly_count++;
13993 				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
13994 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
13995 					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
13996 				}
13997 			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
13998 				submap_entry_offset = VME_OFFSET(submap_entry);
13999 				copy_object = VM_OBJECT_NULL;
14000 				object_copied_offset = submap_entry_offset;
14001 				object_copied_needs_copy = FALSE;
14002 				DTRACE_VM6(submap_copy_strategically,
14003 				    vm_map_t, cow_sub_map_parent,
14004 				    vm_map_offset_t, vaddr,
14005 				    vm_map_t, map,
14006 				    vm_object_size_t, submap_entry_size,
14007 				    int, submap_entry->wired_count,
14008 				    int, sub_object->copy_strategy);
14009 				kr = vm_object_copy_strategically(
14010 					sub_object,
14011 					submap_entry_offset,
14012 					submap_entry->vme_end - submap_entry->vme_start,
14013 					false, /* forking */
14014 					&copy_object,
14015 					&object_copied_offset,
14016 					&object_copied_needs_copy);
14017 				if (kr == KERN_MEMORY_RESTART_COPY) {
14018 					old_start -= start_delta;
14019 					old_end += end_delta;
14020 					vm_object_deallocate(copy_object);
14021 					copy_object = VM_OBJECT_NULL;
14022 					vm_map_lock_write_to_read(map);
14023 					vm_map_lookup_and_lock_object_copy_strategically_restart++;
14024 					goto RetrySubMap;
14025 				}
14026 				if (kr != KERN_SUCCESS) {
14027 					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14028 						vm_map_unlock(cow_sub_map_parent);
14029 					}
14030 					if ((*real_map != map)
14031 					    && (*real_map != cow_sub_map_parent)) {
14032 						vm_map_unlock(*real_map);
14033 					}
14034 					*real_map = map;
14035 					vm_object_deallocate(copy_object);
14036 					copy_object = VM_OBJECT_NULL;
14037 					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14038 					vm_map_lock_write_to_read(map);
14039 					DTRACE_VM4(submap_copy_error_strategically,
14040 					    vm_object_t, sub_object,
14041 					    vm_object_offset_t, submap_entry_offset,
14042 					    vm_object_size_t, submap_entry_size,
14043 					    int, kr);
14044 					vm_map_lookup_and_lock_object_copy_strategically_error++;
14045 					return kr;
14046 				}
14047 				assert(copy_object != VM_OBJECT_NULL);
14048 				assert(copy_object != sub_object);
14049 				object_copied = TRUE;
14050 				vm_map_lookup_and_lock_object_copy_strategically_count++;
14051 				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14052 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14053 					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14054 				}
14055 			} else {
14056 				/* set up shadow object */
14057 				object_copied = FALSE;
14058 				copy_object = sub_object;
14059 				vm_object_lock(sub_object);
14060 				vm_object_reference_locked(sub_object);
14061 				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14062 				vm_object_unlock(sub_object);
14063 
14064 				assert(submap_entry->wired_count == 0);
14065 				submap_entry->needs_copy = TRUE;
14066 
14067 				prot = submap_entry->protection;
14068 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14069 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14070 					    __FUNCTION__,
14071 					    map, map->pmap, submap_entry,
14072 					    (uint64_t)submap_entry->vme_start,
14073 					    (uint64_t)submap_entry->vme_end,
14074 					    prot);
14075 				}
14076 				prot = prot & ~VM_PROT_WRITE;
14077 				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14078 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14079 					    __FUNCTION__,
14080 					    map, map->pmap, submap_entry,
14081 					    (uint64_t)submap_entry->vme_start,
14082 					    (uint64_t)submap_entry->vme_end,
14083 					    prot);
14084 				}
14085 
14086 				if (override_nx(old_map,
14087 				    VME_ALIAS(submap_entry))
14088 				    && prot) {
14089 					prot |= VM_PROT_EXECUTE;
14090 				}
14091 
14092 				vm_object_pmap_protect(
14093 					sub_object,
14094 					VME_OFFSET(submap_entry),
14095 					submap_entry->vme_end -
14096 					submap_entry->vme_start,
14097 					(submap_entry->is_shared
14098 					|| map->mapped_in_other_pmaps) ?
14099 					PMAP_NULL : map->pmap,
14100 					VM_MAP_PAGE_SIZE(map),
14101 					submap_entry->vme_start,
14102 					prot);
14103 				vm_map_lookup_and_lock_object_copy_shadow_count++;
14104 				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14105 				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14106 					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14107 				}
14108 			}
14109 
14110 			/*
14111 			 * Adjust the fault offset to the submap entry.
14112 			 */
14113 			copy_offset = (local_vaddr -
14114 			    submap_entry->vme_start +
14115 			    VME_OFFSET(submap_entry));
14116 
			/* This works differently than  */
14118 			/* normal submap case. We go back  */
14119 			/* to the parent of the cow map and*/
14120 			/* clip out the target portion of  */
14121 			/* the sub_map, substituting the   */
14122 			/* new copy object,                */
14123 
14124 			subentry_protection = submap_entry->protection;
14125 			subentry_max_protection = submap_entry->max_protection;
14126 			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14127 			subentry_permanent = submap_entry->vme_permanent;
14128 			subentry_csm_associated = submap_entry->csm_associated;
14129 #if __arm64e__
14130 			subentry_used_for_tpro = submap_entry->used_for_tpro;
14131 #endif // __arm64e__
14132 			vm_map_unlock(map);
14133 			submap_entry = NULL; /* not valid after map unlock */
14134 
14135 			local_start = old_start;
14136 			local_end = old_end;
14137 			map = cow_sub_map_parent;
14138 			*var_map = cow_sub_map_parent;
14139 			vaddr = cow_parent_vaddr;
14140 			cow_sub_map_parent = NULL;
14141 
14142 			if (!vm_map_lookup_entry(map,
14143 			    vaddr, &entry)) {
14144 				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14145 					vm_map_unlock(cow_sub_map_parent);
14146 				}
14147 				if ((*real_map != map)
14148 				    && (*real_map != cow_sub_map_parent)) {
14149 					vm_map_unlock(*real_map);
14150 				}
14151 				*real_map = map;
14152 				vm_object_deallocate(
14153 					copy_object);
14154 				copy_object = VM_OBJECT_NULL;
14155 				vm_map_lock_write_to_read(map);
14156 				DTRACE_VM4(submap_lookup_post_unlock,
14157 				    uint64_t, (uint64_t)entry->vme_start,
14158 				    uint64_t, (uint64_t)entry->vme_end,
14159 				    vm_map_offset_t, vaddr,
14160 				    int, object_copied);
14161 				return KERN_INVALID_ADDRESS;
14162 			}
14163 
14164 			/* clip out the portion of space */
14165 			/* mapped by the sub map which   */
14166 			/* corresponds to the underlying */
14167 			/* object */
14168 
14169 			/*
14170 			 * Clip (and unnest) the smallest nested chunk
14171 			 * possible around the faulting address...
14172 			 */
14173 			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14174 			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14175 			/*
14176 			 * ... but don't go beyond the "old_start" to "old_end"
14177 			 * range, to avoid spanning over another VM region
14178 			 * with a possibly different VM object and/or offset.
14179 			 */
14180 			if (local_start < old_start) {
14181 				local_start = old_start;
14182 			}
14183 			if (local_end > old_end) {
14184 				local_end = old_end;
14185 			}
14186 			/*
14187 			 * Adjust copy_offset to the start of the range.
14188 			 */
14189 			copy_offset -= (vaddr - local_start);
14190 
14191 			vm_map_clip_start(map, entry, local_start);
14192 			vm_map_clip_end(map, entry, local_end);
14193 			if (entry->is_sub_map) {
14194 				/* unnesting was done when clipping */
14195 				assert(!entry->use_pmap);
14196 			}
14197 
14198 			/* substitute copy object for */
14199 			/* shared map entry           */
14200 			vm_map_deallocate(VME_SUBMAP(entry));
14201 			assert(!entry->iokit_acct);
14202 			entry->use_pmap = TRUE;
14203 			VME_OBJECT_SET(entry, copy_object, false, 0);
14204 
14205 			/* propagate the submap entry's protections */
14206 			if (entry->protection != VM_PROT_READ) {
14207 				/*
14208 				 * Someone has already altered the top entry's
14209 				 * protections via vm_protect(VM_PROT_COPY).
14210 				 * Respect these new values and ignore the
14211 				 * submap entry's protections.
14212 				 */
14213 			} else {
14214 				/*
14215 				 * Regular copy-on-write: propagate the submap
14216 				 * entry's protections to the top map entry.
14217 				 */
14218 				entry->protection |= subentry_protection;
14219 			}
14220 			entry->max_protection |= subentry_max_protection;
14221 			/* propagate some attributes from subentry */
14222 			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14223 			entry->vme_permanent = subentry_permanent;
14224 			entry->csm_associated = subentry_csm_associated;
14225 #if __arm64e__
14226 			/* propagate TPRO iff the destination map has TPRO enabled */
14227 			if (subentry_used_for_tpro && vm_map_tpro(map)) {
14228 				entry->used_for_tpro = subentry_used_for_tpro;
14229 			}
#endif /* __arm64e__ */
14231 			if ((entry->protection & VM_PROT_WRITE) &&
14232 			    (entry->protection & VM_PROT_EXECUTE) &&
14233 #if XNU_TARGET_OS_OSX
14234 			    map->pmap != kernel_pmap &&
14235 			    (vm_map_cs_enforcement(map)
14236 #if __arm64__
14237 			    || !VM_MAP_IS_EXOTIC(map)
14238 #endif /* __arm64__ */
14239 			    ) &&
14240 #endif /* XNU_TARGET_OS_OSX */
14241 #if CODE_SIGNING_MONITOR
14242 			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14243 #endif
14244 			    !(entry->used_for_jit) &&
14245 			    VM_MAP_POLICY_WX_STRIP_X(map)) {
14246 				DTRACE_VM3(cs_wx,
14247 				    uint64_t, (uint64_t)entry->vme_start,
14248 				    uint64_t, (uint64_t)entry->vme_end,
14249 				    vm_prot_t, entry->protection);
14250 				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14251 				    proc_selfpid(),
14252 				    (get_bsdtask_info(current_task())
14253 				    ? proc_name_address(get_bsdtask_info(current_task()))
14254 				    : "?"),
14255 				    __FUNCTION__, __LINE__,
14256 #if DEVELOPMENT || DEBUG
14257 				    (uint64_t)entry->vme_start,
14258 				    (uint64_t)entry->vme_end,
14259 #else /* DEVELOPMENT || DEBUG */
14260 				    (uint64_t)0,
14261 				    (uint64_t)0,
14262 #endif /* DEVELOPMENT || DEBUG */
14263 				    entry->protection);
14264 				entry->protection &= ~VM_PROT_EXECUTE;
14265 			}
14266 
14267 			if (object_copied) {
14268 				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14269 				entry->needs_copy = object_copied_needs_copy;
14270 				entry->is_shared = FALSE;
14271 			} else {
14272 				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14273 				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14274 				assert(entry->wired_count == 0);
14275 				VME_OFFSET_SET(entry, copy_offset);
14276 				entry->needs_copy = TRUE;
14277 				if (map != old_map) {
14278 					entry->is_shared = TRUE;
14279 				}
14280 			}
14281 			if (entry->inheritance == VM_INHERIT_SHARE) {
14282 				entry->inheritance = VM_INHERIT_COPY;
14283 			}
14284 
14285 			vm_map_lock_write_to_read(map);
14286 		} else {
14287 			if ((cow_sub_map_parent)
14288 			    && (cow_sub_map_parent != *real_map)
14289 			    && (cow_sub_map_parent != map)) {
14290 				vm_map_unlock(cow_sub_map_parent);
14291 			}
14292 			entry = submap_entry;
14293 			vaddr = local_vaddr;
14294 		}
14295 	}
14296 
14297 	/*
14298 	 *	Check whether this task is allowed to have
14299 	 *	this page.
14300 	 */
14301 
14302 	prot = entry->protection;
14303 
14304 	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14305 		/*
14306 		 * HACK -- if not a stack, then allow execution
14307 		 */
14308 		prot |= VM_PROT_EXECUTE;
14309 	}
14310 
14311 #if __arm64e__
14312 	/*
14313 	 * If the entry we're dealing with is TPRO and we have a write
14314 	 * fault, inject VM_PROT_WRITE into protections. This allows us
14315 	 * to maintain RO permissions when not marked as TPRO.
14316 	 */
14317 	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14318 		prot |= VM_PROT_WRITE;
14319 	}
14320 #endif /* __arm64e__ */
14321 	if (mask_protections) {
14322 		fault_type &= prot;
14323 		if (fault_type == VM_PROT_NONE) {
14324 			goto protection_failure;
14325 		}
14326 	}
14327 	if (((fault_type & prot) != fault_type)
14328 #if __arm64__
14329 	    /* prefetch abort in execute-only page */
14330 	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14331 #elif defined(__x86_64__)
14332 	    /* Consider the UEXEC bit when handling an EXECUTE fault */
14333 	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14334 #endif
14335 	    ) {
14336 protection_failure:
14337 		if (*real_map != map) {
14338 			vm_map_unlock(*real_map);
14339 		}
14340 		*real_map = map;
14341 
14342 		if ((fault_type & VM_PROT_EXECUTE) && prot) {
14343 			log_stack_execution_failure((addr64_t)vaddr, prot);
14344 		}
14345 
14346 		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14347 		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14348 		/*
14349 		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14350 		 *
14351 		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14352 		 */
14353 		return KERN_PROTECTION_FAILURE;
14354 	}
14355 
14356 	/*
14357 	 *	If this page is not pageable, we have to get
14358 	 *	it for all possible accesses.
14359 	 */
14360 
14361 	*wired = (entry->wired_count != 0);
14362 	if (*wired) {
14363 		fault_type = prot;
14364 	}
14365 
14366 	/*
14367 	 *	If the entry was copy-on-write, we either ...
14368 	 */
14369 
14370 	if (entry->needs_copy) {
14371 		/*
14372 		 *	If we want to write the page, we may as well
14373 		 *	handle that now since we've got the map locked.
14374 		 *
14375 		 *	If we don't need to write the page, we just
14376 		 *	demote the permissions allowed.
14377 		 */
14378 
14379 		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14380 			/*
14381 			 *	Make a new object, and place it in the
14382 			 *	object chain.  Note that no new references
14383 			 *	have appeared -- one just moved from the
14384 			 *	map to the new object.
14385 			 */
14386 
14387 			if (vm_map_lock_read_to_write(map)) {
14388 				vm_map_lock_read(map);
14389 				goto RetryLookup;
14390 			}
14391 
14392 			if (VME_OBJECT(entry)->shadowed == FALSE) {
14393 				vm_object_lock(VME_OBJECT(entry));
14394 				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14395 				vm_object_unlock(VME_OBJECT(entry));
14396 			}
14397 			VME_OBJECT_SHADOW(entry,
14398 			    (vm_map_size_t) (entry->vme_end -
14399 			    entry->vme_start),
14400 			    vm_map_always_shadow(map));
14401 			entry->needs_copy = FALSE;
14402 
14403 			vm_map_lock_write_to_read(map);
14404 		}
14405 		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14406 			/*
14407 			 *	We're attempting to read a copy-on-write
14408 			 *	page -- don't allow writes.
14409 			 */
14410 
14411 			prot &= (~VM_PROT_WRITE);
14412 		}
14413 	}
14414 
14415 	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14416 		/*
14417 		 * We went through a "needs_copy" submap without triggering
14418 		 * a copy, so granting write access to the page would bypass
14419 		 * that submap's "needs_copy".
14420 		 */
14421 		assert(!(fault_type & VM_PROT_WRITE));
14422 		assert(!*wired);
14423 		assert(!force_copy);
14424 		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14425 		prot &= ~VM_PROT_WRITE;
14426 	}
14427 
14428 	/*
14429 	 *	Create an object if necessary.
14430 	 */
14431 	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14432 		if (vm_map_lock_read_to_write(map)) {
14433 			vm_map_lock_read(map);
14434 			goto RetryLookup;
14435 		}
14436 
14437 		VME_OBJECT_SET(entry,
14438 		    vm_object_allocate(
14439 			    (vm_map_size_t)(entry->vme_end -
14440 			    entry->vme_start)), false, 0);
14441 		VME_OFFSET_SET(entry, 0);
14442 		assert(entry->use_pmap);
14443 		vm_map_lock_write_to_read(map);
14444 	}
14445 
14446 	/*
14447 	 *	Return the object/offset from this entry.  If the entry
14448 	 *	was copy-on-write or empty, it has been fixed up.  Also
14449 	 *	return the protection.
14450 	 */
14451 
14452 	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14453 	*object = VME_OBJECT(entry);
14454 	*out_prot = prot;
14455 	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14456 
14457 	if (fault_info) {
14458 		/* ... the caller will change "interruptible" if needed */
14459 		fault_info->user_tag = VME_ALIAS(entry);
14460 		fault_info->pmap_options = 0;
14461 		if (entry->iokit_acct ||
14462 		    (!entry->is_sub_map && !entry->use_pmap)) {
14463 			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14464 		}
14465 		if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14466 			fault_info->behavior = entry->behavior;
14467 		}
14468 		fault_info->lo_offset = VME_OFFSET(entry);
14469 		fault_info->hi_offset =
14470 		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14471 		fault_info->no_cache  = entry->no_cache;
14472 		fault_info->stealth = FALSE;
14473 		fault_info->io_sync = FALSE;
14474 		if (entry->used_for_jit ||
14475 #if CODE_SIGNING_MONITOR
14476 		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14477 #endif
14478 		    entry->vme_resilient_codesign) {
14479 			fault_info->cs_bypass = TRUE;
14480 		} else {
14481 			fault_info->cs_bypass = FALSE;
14482 		}
14483 		fault_info->csm_associated = FALSE;
14484 #if CODE_SIGNING_MONITOR
14485 		if (entry->csm_associated) {
14486 			/*
14487 			 * The pmap layer will validate this page
14488 			 * before allowing it to be executed from.
14489 			 */
14490 			fault_info->csm_associated = TRUE;
14491 		}
14492 #endif
14493 		fault_info->mark_zf_absent = FALSE;
14494 		fault_info->batch_pmap_op = FALSE;
14495 		fault_info->resilient_media = entry->vme_resilient_media;
14496 		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14497 		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14498 #if __arm64e__
14499 		fault_info->fi_used_for_tpro = entry->used_for_tpro;
14500 #else /* __arm64e__ */
14501 		fault_info->fi_used_for_tpro = FALSE;
14502 #endif
14503 		if (entry->translated_allow_execute) {
14504 			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14505 		}
14506 	}
14507 
14508 	/*
14509 	 *	Lock the object to prevent it from disappearing
14510 	 */
14511 	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14512 		if (contended == NULL) {
14513 			vm_object_lock(*object);
14514 		} else {
14515 			*contended = vm_object_lock_check_contended(*object);
14516 		}
14517 	} else {
14518 		vm_object_lock_shared(*object);
14519 	}
14520 
14521 	/*
14522 	 *	Save the version number
14523 	 */
14524 
14525 	out_version->main_timestamp = map->timestamp;
14526 
14527 	return KERN_SUCCESS;
14528 }
14529 
14530 
14531 /*
14532  *	vm_map_verify:
14533  *
14534  *	Verifies that the map in question has not changed
14535  *	since the given version. The map has to be locked
14536  *	("shared" mode is fine) before calling this function
14537  *	and it will be returned locked too.
14538  */
14539 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14540 vm_map_verify(
14541 	vm_map_t                map,
14542 	vm_map_version_t        *version)       /* REF */
14543 {
14544 	boolean_t       result;
14545 
14546 	vm_map_lock_assert_held(map);
14547 	result = (map->timestamp == version->main_timestamp);
14548 
14549 	return result;
14550 }
14551 
14552 /*
14553  *	TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14554  *	Goes away after regular vm_region_recurse function migrates to
14555  *	64 bits
14556  *	vm_region_recurse: A form of vm_region which follows the
14557  *	submaps in a target map
14558  *
14559  */
14560 
14561 kern_return_t
vm_map_region_recurse_64(vm_map_t map,vm_map_offset_t * address,vm_map_size_t * size,natural_t * nesting_depth,vm_region_submap_info_64_t submap_info,mach_msg_type_number_t * count)14562 vm_map_region_recurse_64(
14563 	vm_map_t                 map,
14564 	vm_map_offset_t *address,               /* IN/OUT */
14565 	vm_map_size_t           *size,                  /* OUT */
14566 	natural_t               *nesting_depth, /* IN/OUT */
14567 	vm_region_submap_info_64_t      submap_info,    /* IN/OUT */
14568 	mach_msg_type_number_t  *count) /* IN/OUT */
14569 {
14570 	mach_msg_type_number_t  original_count;
14571 	vm_region_extended_info_data_t  extended;
14572 	vm_map_entry_t                  tmp_entry;
14573 	vm_map_offset_t                 user_address;
14574 	unsigned int                    user_max_depth;
14575 
14576 	/*
14577 	 * "curr_entry" is the VM map entry preceding or including the
14578 	 * address we're looking for.
14579 	 * "curr_map" is the map or sub-map containing "curr_entry".
14580 	 * "curr_address" is the equivalent of the top map's "user_address"
14581 	 * in the current map.
14582 	 * "curr_offset" is the cumulated offset of "curr_map" in the
14583 	 * target task's address space.
14584 	 * "curr_depth" is the depth of "curr_map" in the chain of
14585 	 * sub-maps.
14586 	 *
14587 	 * "curr_max_below" and "curr_max_above" limit the range (around
14588 	 * "curr_address") we should take into account in the current (sub)map.
14589 	 * They limit the range to what's visible through the map entries
14590 	 * we've traversed from the top map to the current map.
14591 	 *
14592 	 */
14593 	vm_map_entry_t                  curr_entry;
14594 	vm_map_address_t                curr_address;
14595 	vm_map_offset_t                 curr_offset;
14596 	vm_map_t                        curr_map;
14597 	unsigned int                    curr_depth;
14598 	vm_map_offset_t                 curr_max_below, curr_max_above;
14599 	vm_map_offset_t                 curr_skip;
14600 
14601 	/*
14602 	 * "next_" is the same as "curr_" but for the VM region immediately
14603 	 * after the address we're looking for.  We need to keep track of this
14604 	 * too because we want to return info about that region if the
14605 	 * address we're looking for is not mapped.
14606 	 */
14607 	vm_map_entry_t                  next_entry;
14608 	vm_map_offset_t                 next_offset;
14609 	vm_map_offset_t                 next_address;
14610 	vm_map_t                        next_map;
14611 	unsigned int                    next_depth;
14612 	vm_map_offset_t                 next_max_below, next_max_above;
14613 	vm_map_offset_t                 next_skip;
14614 
14615 	boolean_t                       look_for_pages;
14616 	vm_region_submap_short_info_64_t short_info;
14617 	boolean_t                       do_region_footprint;
14618 	int                             effective_page_size, effective_page_shift;
14619 	boolean_t                       submap_needed_copy;
14620 
14621 	if (map == VM_MAP_NULL) {
14622 		/* no address space to work on */
14623 		return KERN_INVALID_ARGUMENT;
14624 	}
14625 
14626 	effective_page_shift = vm_self_region_page_shift(map);
14627 	effective_page_size = (1 << effective_page_shift);
14628 
14629 	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14630 		/*
14631 		 * "info" structure is not big enough and
14632 		 * would overflow
14633 		 */
14634 		return KERN_INVALID_ARGUMENT;
14635 	}
14636 
14637 	do_region_footprint = task_self_region_footprint();
14638 	original_count = *count;
14639 
14640 	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14641 		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14642 		look_for_pages = FALSE;
14643 		short_info = (vm_region_submap_short_info_64_t) submap_info;
14644 		submap_info = NULL;
14645 	} else {
14646 		look_for_pages = TRUE;
14647 		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14648 		short_info = NULL;
14649 
14650 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14651 			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14652 		}
14653 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14654 			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14655 		}
14656 	}
14657 
14658 	user_address = *address;
14659 	user_max_depth = *nesting_depth;
14660 	submap_needed_copy = FALSE;
14661 
14662 	if (not_in_kdp) {
14663 		vm_map_lock_read(map);
14664 	}
14665 
14666 recurse_again:
14667 	curr_entry = NULL;
14668 	curr_map = map;
14669 	curr_address = user_address;
14670 	curr_offset = 0;
14671 	curr_skip = 0;
14672 	curr_depth = 0;
14673 	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14674 	curr_max_below = curr_address;
14675 
14676 	next_entry = NULL;
14677 	next_map = NULL;
14678 	next_address = 0;
14679 	next_offset = 0;
14680 	next_skip = 0;
14681 	next_depth = 0;
14682 	next_max_above = (vm_map_offset_t) -1;
14683 	next_max_below = (vm_map_offset_t) -1;
14684 
14685 	for (;;) {
14686 		if (vm_map_lookup_entry(curr_map,
14687 		    curr_address,
14688 		    &tmp_entry)) {
14689 			/* tmp_entry contains the address we're looking for */
14690 			curr_entry = tmp_entry;
14691 		} else {
14692 			vm_map_offset_t skip;
14693 			/*
14694 			 * The address is not mapped.  "tmp_entry" is the
14695 			 * map entry preceding the address.  We want the next
14696 			 * one, if it exists.
14697 			 */
14698 			curr_entry = tmp_entry->vme_next;
14699 
14700 			if (curr_entry == vm_map_to_entry(curr_map) ||
14701 			    (curr_entry->vme_start >=
14702 			    curr_address + curr_max_above)) {
14703 				/* no next entry at this level: stop looking */
14704 				if (not_in_kdp) {
14705 					vm_map_unlock_read(curr_map);
14706 				}
14707 				curr_entry = NULL;
14708 				curr_map = NULL;
14709 				curr_skip = 0;
14710 				curr_offset = 0;
14711 				curr_depth = 0;
14712 				curr_max_above = 0;
14713 				curr_max_below = 0;
14714 				break;
14715 			}
14716 
14717 			/* adjust current address and offset */
14718 			skip = curr_entry->vme_start - curr_address;
14719 			curr_address = curr_entry->vme_start;
14720 			curr_skip += skip;
14721 			curr_offset += skip;
14722 			curr_max_above -= skip;
14723 			curr_max_below = 0;
14724 		}
14725 
14726 		/*
14727 		 * Is the next entry at this level closer to the address (or
14728 		 * deeper in the submap chain) than the one we had
14729 		 * so far ?
14730 		 */
14731 		tmp_entry = curr_entry->vme_next;
14732 		if (tmp_entry == vm_map_to_entry(curr_map)) {
14733 			/* no next entry at this level */
14734 		} else if (tmp_entry->vme_start >=
14735 		    curr_address + curr_max_above) {
14736 			/*
14737 			 * tmp_entry is beyond the scope of what we mapped of
14738 			 * this submap in the upper level: ignore it.
14739 			 */
14740 		} else if ((next_entry == NULL) ||
14741 		    (tmp_entry->vme_start + curr_offset <=
14742 		    next_entry->vme_start + next_offset)) {
14743 			/*
14744 			 * We didn't have a "next_entry" or this one is
14745 			 * closer to the address we're looking for:
14746 			 * use this "tmp_entry" as the new "next_entry".
14747 			 */
14748 			if (next_entry != NULL) {
14749 				/* unlock the last "next_map" */
14750 				if (next_map != curr_map && not_in_kdp) {
14751 					vm_map_unlock_read(next_map);
14752 				}
14753 			}
14754 			next_entry = tmp_entry;
14755 			next_map = curr_map;
14756 			next_depth = curr_depth;
14757 			next_address = next_entry->vme_start;
14758 			next_skip = curr_skip;
14759 			next_skip += (next_address - curr_address);
14760 			next_offset = curr_offset;
14761 			next_offset += (next_address - curr_address);
14762 			next_max_above = MIN(next_max_above, curr_max_above);
14763 			next_max_above = MIN(next_max_above,
14764 			    next_entry->vme_end - next_address);
14765 			next_max_below = MIN(next_max_below, curr_max_below);
14766 			next_max_below = MIN(next_max_below,
14767 			    next_address - next_entry->vme_start);
14768 		}
14769 
14770 		/*
14771 		 * "curr_max_{above,below}" allow us to keep track of the
14772 		 * portion of the submap that is actually mapped at this level:
14773 		 * the rest of that submap is irrelevant to us, since it's not
14774 		 * mapped here.
14775 		 * The relevant portion of the map starts at
14776 		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
14777 		 */
14778 		curr_max_above = MIN(curr_max_above,
14779 		    curr_entry->vme_end - curr_address);
14780 		curr_max_below = MIN(curr_max_below,
14781 		    curr_address - curr_entry->vme_start);
14782 
14783 		if (!curr_entry->is_sub_map ||
14784 		    curr_depth >= user_max_depth) {
14785 			/*
14786 			 * We hit a leaf map or we reached the maximum depth
14787 			 * we could, so stop looking.  Keep the current map
14788 			 * locked.
14789 			 */
14790 			break;
14791 		}
14792 
14793 		/*
14794 		 * Get down to the next submap level.
14795 		 */
14796 
14797 		if (curr_entry->needs_copy) {
14798 			/* everything below this is effectively copy-on-write */
14799 			submap_needed_copy = TRUE;
14800 		}
14801 
14802 		/*
14803 		 * Lock the next level and unlock the current level,
14804 		 * unless we need to keep it locked to access the "next_entry"
14805 		 * later.
14806 		 */
14807 		if (not_in_kdp) {
14808 			vm_map_lock_read(VME_SUBMAP(curr_entry));
14809 		}
14810 		if (curr_map == next_map) {
14811 			/* keep "next_map" locked in case we need it */
14812 		} else {
14813 			/* release this map */
14814 			if (not_in_kdp) {
14815 				vm_map_unlock_read(curr_map);
14816 			}
14817 		}
14818 
14819 		/*
14820 		 * Adjust the offset.  "curr_entry" maps the submap
14821 		 * at relative address "curr_entry->vme_start" in the
14822 		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
14823 		 * bytes of the submap.
14824 		 * "curr_offset" always represents the offset of a virtual
14825 		 * address in the curr_map relative to the absolute address
14826 		 * space (i.e. the top-level VM map).
14827 		 */
14828 		curr_offset +=
14829 		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
14830 		curr_address = user_address + curr_offset;
14831 		/* switch to the submap */
14832 		curr_map = VME_SUBMAP(curr_entry);
14833 		curr_depth++;
14834 		curr_entry = NULL;
14835 	}
14836 
14837 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
14838 // so probably should be a real 32b ID vs. ptr.
14839 // Current users just check for equality
14840 
14841 	if (curr_entry == NULL) {
14842 		/* no VM region contains the address... */
14843 
14844 		if (do_region_footprint && /* we want footprint numbers */
14845 		    next_entry == NULL && /* & there are no more regions */
14846 		    /* & we haven't already provided our fake region: */
14847 		    user_address <= vm_map_last_entry(map)->vme_end) {
14848 			ledger_amount_t ledger_resident, ledger_compressed;
14849 
14850 			/*
14851 			 * Add a fake memory region to account for
14852 			 * purgeable and/or ledger-tagged memory that
14853 			 * counts towards this task's memory footprint,
14854 			 * i.e. the resident/compressed pages of non-volatile
14855 			 * objects owned by that task.
14856 			 */
14857 			task_ledgers_footprint(map->pmap->ledger,
14858 			    &ledger_resident,
14859 			    &ledger_compressed);
14860 			if (ledger_resident + ledger_compressed == 0) {
14861 				/* no purgeable memory usage to report */
14862 				return KERN_INVALID_ADDRESS;
14863 			}
14864 			/* fake region to show nonvolatile footprint */
14865 			if (look_for_pages) {
14866 				submap_info->protection = VM_PROT_DEFAULT;
14867 				submap_info->max_protection = VM_PROT_DEFAULT;
14868 				submap_info->inheritance = VM_INHERIT_DEFAULT;
14869 				submap_info->offset = 0;
14870 				submap_info->user_tag = -1;
14871 				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
14872 				submap_info->pages_shared_now_private = 0;
14873 				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
14874 				submap_info->pages_dirtied = submap_info->pages_resident;
14875 				submap_info->ref_count = 1;
14876 				submap_info->shadow_depth = 0;
14877 				submap_info->external_pager = 0;
14878 				submap_info->share_mode = SM_PRIVATE;
14879 				if (submap_needed_copy) {
14880 					submap_info->share_mode = SM_COW;
14881 				}
14882 				submap_info->is_submap = 0;
14883 				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
14884 				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14885 				submap_info->user_wired_count = 0;
14886 				submap_info->pages_reusable = 0;
14887 			} else {
14888 				short_info->user_tag = -1;
14889 				short_info->offset = 0;
14890 				short_info->protection = VM_PROT_DEFAULT;
14891 				short_info->inheritance = VM_INHERIT_DEFAULT;
14892 				short_info->max_protection = VM_PROT_DEFAULT;
14893 				short_info->behavior = VM_BEHAVIOR_DEFAULT;
14894 				short_info->user_wired_count = 0;
14895 				short_info->is_submap = 0;
14896 				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14897 				short_info->external_pager = 0;
14898 				short_info->shadow_depth = 0;
14899 				short_info->share_mode = SM_PRIVATE;
14900 				if (submap_needed_copy) {
14901 					short_info->share_mode = SM_COW;
14902 				}
14903 				short_info->ref_count = 1;
14904 			}
14905 			*nesting_depth = 0;
14906 			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
14907 //			*address = user_address;
14908 			*address = vm_map_last_entry(map)->vme_end;
14909 			return KERN_SUCCESS;
14910 		}
14911 
14912 		if (next_entry == NULL) {
14913 			/* ... and no VM region follows it either */
14914 			return KERN_INVALID_ADDRESS;
14915 		}
14916 		/* ... gather info about the next VM region */
14917 		curr_entry = next_entry;
14918 		curr_map = next_map;    /* still locked ... */
14919 		curr_address = next_address;
14920 		curr_skip = next_skip;
14921 		curr_offset = next_offset;
14922 		curr_depth = next_depth;
14923 		curr_max_above = next_max_above;
14924 		curr_max_below = next_max_below;
14925 	} else {
14926 		/* we won't need "next_entry" after all */
14927 		if (next_entry != NULL) {
14928 			/* release "next_map" */
14929 			if (next_map != curr_map && not_in_kdp) {
14930 				vm_map_unlock_read(next_map);
14931 			}
14932 		}
14933 	}
14934 	next_entry = NULL;
14935 	next_map = NULL;
14936 	next_offset = 0;
14937 	next_skip = 0;
14938 	next_depth = 0;
14939 	next_max_below = -1;
14940 	next_max_above = -1;
14941 
14942 	if (curr_entry->is_sub_map &&
14943 	    curr_depth < user_max_depth) {
14944 		/*
14945 		 * We're not as deep as we could be:  we must have
14946 		 * gone back up after not finding anything mapped
14947 		 * below the original top-level map entry's.
14948 		 * Let's move "curr_address" forward and recurse again.
14949 		 */
14950 		user_address = curr_address;
14951 		goto recurse_again;
14952 	}
14953 
14954 	*nesting_depth = curr_depth;
14955 	*size = curr_max_above + curr_max_below;
14956 	*address = user_address + curr_skip - curr_max_below;
14957 
14958 	if (look_for_pages) {
14959 		submap_info->user_tag = VME_ALIAS(curr_entry);
14960 		submap_info->offset = VME_OFFSET(curr_entry);
14961 		submap_info->protection = curr_entry->protection;
14962 		submap_info->inheritance = curr_entry->inheritance;
14963 		submap_info->max_protection = curr_entry->max_protection;
14964 		submap_info->behavior = curr_entry->behavior;
14965 		submap_info->user_wired_count = curr_entry->user_wired_count;
14966 		submap_info->is_submap = curr_entry->is_sub_map;
14967 		if (curr_entry->is_sub_map) {
14968 			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14969 		} else {
14970 			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14971 		}
14972 	} else {
14973 		short_info->user_tag = VME_ALIAS(curr_entry);
14974 		short_info->offset = VME_OFFSET(curr_entry);
14975 		short_info->protection = curr_entry->protection;
14976 		short_info->inheritance = curr_entry->inheritance;
14977 		short_info->max_protection = curr_entry->max_protection;
14978 		short_info->behavior = curr_entry->behavior;
14979 		short_info->user_wired_count = curr_entry->user_wired_count;
14980 		short_info->is_submap = curr_entry->is_sub_map;
14981 		if (curr_entry->is_sub_map) {
14982 			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14983 		} else {
14984 			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14985 		}
14986 	}
14987 
14988 	extended.pages_resident = 0;
14989 	extended.pages_swapped_out = 0;
14990 	extended.pages_shared_now_private = 0;
14991 	extended.pages_dirtied = 0;
14992 	extended.pages_reusable = 0;
14993 	extended.external_pager = 0;
14994 	extended.shadow_depth = 0;
14995 	extended.share_mode = SM_EMPTY;
14996 	extended.ref_count = 0;
14997 
14998 	if (not_in_kdp) {
14999 		if (!curr_entry->is_sub_map) {
15000 			vm_map_offset_t range_start, range_end;
15001 			range_start = MAX((curr_address - curr_max_below),
15002 			    curr_entry->vme_start);
15003 			range_end = MIN((curr_address + curr_max_above),
15004 			    curr_entry->vme_end);
15005 			vm_map_region_walk(curr_map,
15006 			    range_start,
15007 			    curr_entry,
15008 			    (VME_OFFSET(curr_entry) +
15009 			    (range_start -
15010 			    curr_entry->vme_start)),
15011 			    range_end - range_start,
15012 			    &extended,
15013 			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15014 			if (submap_needed_copy) {
15015 				extended.share_mode = SM_COW;
15016 			}
15017 		} else {
15018 			if (curr_entry->use_pmap) {
15019 				extended.share_mode = SM_TRUESHARED;
15020 			} else {
15021 				extended.share_mode = SM_PRIVATE;
15022 			}
15023 			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15024 		}
15025 	}
15026 
15027 	if (look_for_pages) {
15028 		submap_info->pages_resident = extended.pages_resident;
15029 		submap_info->pages_swapped_out = extended.pages_swapped_out;
15030 		submap_info->pages_shared_now_private =
15031 		    extended.pages_shared_now_private;
15032 		submap_info->pages_dirtied = extended.pages_dirtied;
15033 		submap_info->external_pager = extended.external_pager;
15034 		submap_info->shadow_depth = extended.shadow_depth;
15035 		submap_info->share_mode = extended.share_mode;
15036 		submap_info->ref_count = extended.ref_count;
15037 
15038 		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15039 			submap_info->pages_reusable = extended.pages_reusable;
15040 		}
15041 		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15042 			if (curr_entry->is_sub_map) {
15043 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15044 			} else if (VME_OBJECT(curr_entry)) {
15045 				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15046 			} else {
15047 				submap_info->object_id_full = 0ull;
15048 			}
15049 		}
15050 	} else {
15051 		short_info->external_pager = extended.external_pager;
15052 		short_info->shadow_depth = extended.shadow_depth;
15053 		short_info->share_mode = extended.share_mode;
15054 		short_info->ref_count = extended.ref_count;
15055 	}
15056 
15057 	if (not_in_kdp) {
15058 		vm_map_unlock_read(curr_map);
15059 	}
15060 
15061 	return KERN_SUCCESS;
15062 }
15063 
15064 /*
15065  *	vm_region:
15066  *
15067  *	User call to obtain information about a region in
15068  *	a task's address map. Currently, only one flavor is
15069  *	supported.
15070  *
15071  *	XXX The reserved and behavior fields cannot be filled
15072  *	    in until the vm merge from the IK is completed, and
15073  *	    vm_reserve is implemented.
15074  */
15075 
kern_return_t
vm_map_region(
	vm_map_t                 map,
	vm_map_offset_t *address,               /* IN/OUT */
	vm_map_size_t           *size,                  /* OUT */
	vm_region_flavor_t       flavor,                /* IN */
	vm_region_info_t         info,                  /* OUT */
	mach_msg_type_number_t  *count, /* IN/OUT */
	mach_port_t             *object_name)           /* OUT */
{
	vm_map_entry_t          tmp_entry;
	vm_map_entry_t          entry;
	vm_map_offset_t         start;

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	switch (flavor) {
	case VM_REGION_BASIC_INFO:
		/* legacy for old 32-bit objects info */
	{
		vm_region_basic_info_t  basic;

		/* caller must provide at least the flavor's struct size */
		if (*count < VM_REGION_BASIC_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/*
			 * Nothing mapped at "start": report the next
			 * populated region instead, or fail if there is none.
			 */
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		/* snap the reported range to the entry's actual bounds */
		start = entry->vme_start;

		/* legacy 32-bit flavor: offset is truncated to 32 bits */
		basic->offset = (uint32_t)VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		/* "reserved" doubles as the is-submap flag for this flavor */
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}

	case VM_REGION_BASIC_INFO_64:
	{
		vm_region_basic_info_64_t       basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_64_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT_64;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			/* no mapping at "start": fall forward to the next region */
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* 64-bit flavor: full-width offset, otherwise same as above */
		basic->offset = VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	case VM_REGION_EXTENDED_INFO:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}
		/* modern flavor must also satisfy the legacy minimum below */
		OS_FALLTHROUGH;
	case VM_REGION_EXTENDED_INFO__legacy:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
			return KERN_INVALID_ARGUMENT;
		}

		{
			vm_region_extended_info_t       extended;
			mach_msg_type_number_t original_count;
			int effective_page_size, effective_page_shift;

			extended = (vm_region_extended_info_t) info;

			/* page size as seen by this map (may differ from kernel's) */
			effective_page_shift = vm_self_region_page_shift(map);
			effective_page_size = (1 << effective_page_shift);

			vm_map_lock_read(map);

			start = *address;
			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
					vm_map_unlock_read(map);
					return KERN_INVALID_ADDRESS;
				}
			} else {
				entry = tmp_entry;
			}
			start = entry->vme_start;

			extended->protection = entry->protection;
			extended->user_tag = VME_ALIAS(entry);
			extended->pages_resident = 0;
			extended->pages_swapped_out = 0;
			extended->pages_shared_now_private = 0;
			extended->pages_dirtied = 0;
			extended->external_pager = 0;
			extended->shadow_depth = 0;

			/*
			 * The legacy struct lacks "pages_reusable"; report
			 * the count the caller's struct can actually hold.
			 */
			original_count = *count;
			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
			} else {
				extended->pages_reusable = 0;
				*count = VM_REGION_EXTENDED_INFO_COUNT;
			}

			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);

			if (object_name) {
				*object_name = IP_NULL;
			}
			*address = start;
			*size = (entry->vme_end - start);

			vm_map_unlock_read(map);
			return KERN_SUCCESS;
		}
	case VM_REGION_TOP_INFO:
	{
		vm_region_top_info_t    top;

		if (*count < VM_REGION_TOP_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		top = (vm_region_top_info_t) info;
		*count = VM_REGION_TOP_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}
		start = entry->vme_start;

		top->private_pages_resident = 0;
		top->shared_pages_resident = 0;

		/* walk the top object's shadow chain to fill in the counters */
		vm_map_region_top_walk(entry, top);

		if (object_name) {
			*object_name = IP_NULL;
		}
		*address = start;
		*size = (entry->vme_end - start);

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
15294 
/*
 * Number of resident pages of "obj" attributable to a mapping of
 * "entry_size" pages: an all-reusable object only counts its wired
 * pages, otherwise resident minus reusable pages; in either case
 * capped at the size of the mapping.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                             \
	MIN((entry_size),                                               \
	    ((obj)->all_reusable ?                                      \
	     (obj)->wired_page_count :                                  \
	     (obj)->resident_page_count - (obj)->reusable_page_count))
15300 
/*
 * Fill in "top" (share mode, ref count, private/shared resident page
 * counts, object id) for the VM object chain backing "entry".
 * Locks each object in the shadow chain hand-over-hand.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t             entry,
	vm_region_top_info_t       top)
{
	/* submaps and entries with no object have nothing to report */
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct  vm_object *obj, *tmp_obj;
		int             ref_count;
		uint32_t        entry_size;

		/* size of the mapping, in pages */
		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/* ignore the transient reference held while paging is in progress */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/*
			 * Shadowed object => copy-on-write: pages in the top
			 * object are private iff we hold its only reference;
			 * everything further down the chain is shared.
			 */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count  = ref_count;
			top->share_mode = SM_COW;

			/* walk the shadow chain, hand-over-hand locking */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* each level's own shadow reference doesn't count */
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/* a named object carries one extra reference */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}

		vm_object_unlock(obj);

		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
	}
}
15386 
/*
 * Gather extended region info (resident/dirty/swapped/reusable page
 * counts, share mode, shadow depth, ref count) for the range
 * [offset, offset + range) of the object backing "entry".
 * "count" selects legacy vs. modern extended-info struct;
 * "look_for_pages" enables the per-page scan.
 */
void
vm_map_region_walk(
	vm_map_t                        map,
	vm_map_offset_t                 va,
	vm_map_entry_t                  entry,
	vm_object_offset_t              offset,
	vm_object_size_t                range,
	vm_region_extended_info_t       extended,
	boolean_t                       look_for_pages,
	mach_msg_type_number_t count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t       last_offset;
	int               i;
	int               ref_count;
	struct vm_object        *shadow_object;
	unsigned short          shadow_depth;
	boolean_t         do_region_footprint;
	int                     effective_page_size, effective_page_shift;
	vm_map_offset_t         effective_page_mask;

	do_region_footprint = task_self_region_footprint();

	/* submaps, object-less entries and (non-superpage) physically
	 * contiguous objects have no per-page info to report */
	if ((entry->is_sub_map) ||
	    (VME_OBJECT(entry) == 0) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	if (entry->superpage_size) {
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0;
		return;
	}

	/* page size as seen by this map (may differ from kernel's) */
	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* ignore the transient reference held while paging is in progress */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			/* probe this page down the object's shadow chain */
			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/*
			 * Footprint mode skipped the shadow-chain probes,
			 * so the object-level info (external pager, shadow
			 * depth) still needs collecting: jump into the
			 * "else" arm below.
			 */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		/* measure the shadow chain's depth, hand-over-hand locking */
		if (shadow_object != VM_OBJECT_NULL) {
			vm_object_lock(shadow_object);
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t     next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	/* each shadow level accounts for one of the references */
	extended->ref_count = ref_count - extended->shadow_depth;

	/* add up the references held at every level of the chain */
	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
		extended->share_mode = SM_PRIVATE;
	} else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
		/*
		 * A "shared" object might in fact only be referenced by
		 * multiple entries of this same map: count how many of
		 * this map's entries reference it to detect aliasing.
		 */
		vm_map_entry_t       cur;
		vm_map_entry_t       last;
		int      my_refs;

		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			if (vm_map_region_has_obj_ref(cur, obj)) {
				my_refs++;
			}
		}

		if (my_refs == ref_count) {
			/* all references come from this map */
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
15584 
15585 
15586 /* object is locked on entry and locked on return */
15587 
15588 
/*
 * Look for the page at "offset" in "object" and down its shadow chain,
 * updating the counters in "extended" (resident, dirty, reusable,
 * swapped-out, shared-now-private) and the observed shadow depth.
 * "object" is locked on entry and locked on return; intermediate
 * shadow objects are locked hand-over-hand and unlocked before return.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/*
			 * Page found at this level: a page in a shadowed,
			 * singly-referenced chain would become private on
			 * a copy-on-write fault.
			 */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* only the modern struct has "pages_reusable" */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (vm_object_compressor_pager_state_get(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		/* not at this level: descend into the shadow object */
		if (shadow) {
			vm_object_lock(shadow);

			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* account for the shadow mapping's offset shift */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15681 
/*
 * Returns TRUE if "entry" references "object" directly or anywhere
 * along its shadow chain.  Used by vm_map_region_walk() to count how
 * many of an object's references originate from the same map when
 * deciding between SM_PRIVATE_ALIASED and SM_SHARED_ALIASED.
 */
static inline boolean_t
vm_map_region_has_obj_ref(
	vm_map_entry_t    entry,
	vm_object_t       object)
{
	vm_object_t cur_obj;
	vm_object_t shadow_obj;

	if (entry->is_sub_map) {
		return FALSE;
	}

	cur_obj = VME_OBJECT(entry);
	if (cur_obj == VM_OBJECT_NULL) {
		return FALSE;
	} else if (cur_obj == object) {
		return TRUE;
	}

	/*
	 * Avoid locks for first shadow check, otherwise diagnostic tools will
	 * spend most of their time obtaining locks in this function when analyzing
	 * processes with many VM entries which may commonly have no shadow chain.
	 *
	 * This is acceptable because:
	 *  - Shadow's fields are not accessed outside of its lock
	 *  - Objects are unlikely to be modified due to:
	 *	  - Many diagnostic tools suspend the task
	 *	  - VM map is locked
	 *	- The rare incorrect return from this function turns a guess into a
	 *	  slightly worse guess
	 *	- Entire shadow chain is not locked as a whole, so can still change
	 *	  while traversing, resulting in incorrect guess even with locking
	 */
	shadow_obj = cur_obj->shadow;
	if (shadow_obj == VM_OBJECT_NULL) {
		return FALSE;
	} else if (shadow_obj == object) {
		return TRUE;
	}

	vm_object_lock(cur_obj);

	/* walk the rest of the chain, hand-over-hand locking */
	while ((shadow_obj = cur_obj->shadow)) {
		/* check if object was found before grabbing a lock */
		if (shadow_obj == object) {
			vm_object_unlock(cur_obj);
			return TRUE;
		}

		vm_object_lock(shadow_obj);
		vm_object_unlock(cur_obj);
		cur_obj = shadow_obj;
	}

	/* exhausted the shadow chain */
	vm_object_unlock(cur_obj);
	return FALSE;
}
15741 
15742 
15743 /*
15744  *	Routine:	vm_map_simplify
15745  *
15746  *	Description:
15747  *		Attempt to simplify the map representation in
15748  *		the vicinity of the given starting address.
15749  *	Note:
15750  *		This routine is intended primarily to keep the
15751  *		kernel maps more compact -- they generally don't
15752  *		benefit from the "expand a map entry" technology
15753  *		at allocation time because the adjacent entry
15754  *		is often wired down.
15755  */
/*
 *	vm_map_simplify_entry:
 *
 *	If "this_entry" and the entry immediately before it are adjacent
 *	and agree in every attribute that affects the mapping, merge the
 *	predecessor into "this_entry" and dispose of it.  The map must be
 *	locked for writing (see vm_map_simplify / vm_map_simplify_range).
 */
void
vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	/*
	 * The entries can only be coalesced if they are contiguous in
	 * both address space and backing-object offset, and identical
	 * in every attribute below.  A mismatch in any one of these
	 * makes the merge unsafe, so the whole predicate must fail.
	 */
	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    (prev_entry->vme_end == this_entry->vme_start) &&

	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    /* object offsets must line up across the boundary */
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
#if __arm64e__
	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
#endif
	    (prev_entry->csm_associated == this_entry->csm_associated) &&
	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&

	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* atomic entries may never be merged */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		/*
		 * Both entries are permanent (they matched above); drop
		 * the doomed predecessor's flag before unlinking it —
		 * "this_entry" keeps the attribute for the merged range.
		 */
		if (prev_entry->vme_permanent) {
			assert(this_entry->vme_permanent);
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* grow "this_entry" backwards over the predecessor's range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* drop the merged entry's reference on its submap/object */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15840 
15841 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15842 vm_map_simplify(
15843 	vm_map_t        map,
15844 	vm_map_offset_t start)
15845 {
15846 	vm_map_entry_t  this_entry;
15847 
15848 	vm_map_lock(map);
15849 	if (vm_map_lookup_entry(map, start, &this_entry)) {
15850 		vm_map_simplify_entry(map, this_entry);
15851 		vm_map_simplify_entry(map, this_entry->vme_next);
15852 	}
15853 	vm_map_unlock(map);
15854 }
15855 
15856 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15857 vm_map_simplify_range(
15858 	vm_map_t        map,
15859 	vm_map_offset_t start,
15860 	vm_map_offset_t end)
15861 {
15862 	vm_map_entry_t  entry;
15863 
15864 	/*
15865 	 * The map should be locked (for "write") by the caller.
15866 	 */
15867 
15868 	if (start >= end) {
15869 		/* invalid address range */
15870 		return;
15871 	}
15872 
15873 	start = vm_map_trunc_page(start,
15874 	    VM_MAP_PAGE_MASK(map));
15875 	end = vm_map_round_page(end,
15876 	    VM_MAP_PAGE_MASK(map));
15877 
15878 	if (!vm_map_lookup_entry(map, start, &entry)) {
15879 		/* "start" is not mapped and "entry" ends before "start" */
15880 		if (entry == vm_map_to_entry(map)) {
15881 			/* start with first entry in the map */
15882 			entry = vm_map_first_entry(map);
15883 		} else {
15884 			/* start with next entry */
15885 			entry = entry->vme_next;
15886 		}
15887 	}
15888 
15889 	while (entry != vm_map_to_entry(map) &&
15890 	    entry->vme_start <= end) {
15891 		/* try and coalesce "entry" with its previous entry */
15892 		vm_map_simplify_entry(map, entry);
15893 		entry = entry->vme_next;
15894 	}
15895 }
15896 
15897 
15898 /*
15899  *	Routine:	vm_map_machine_attribute
15900  *	Purpose:
15901  *		Provide machine-specific attributes to mappings,
15902  *		such as cachability etc. for machines that provide
15903  *		them.  NUMA architectures and machines with big/strange
15904  *		caches will use this.
15905  *	Note:
15906  *		Responsibilities for locking and checking are handled here,
15907  *		everything else in the pmap module. If any non-volatile
15908  *		information must be kept, the pmap module should handle
15909  *		it itself. [This assumes that attributes do not
15910  *		need to be inherited, which seems ok to me]
15911  */
kern_return_t
vm_map_machine_attribute(
	vm_map_t                        map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_machine_attribute_t  attribute,
	vm_machine_attribute_val_t* value)              /* IN/OUT */
{
	kern_return_t   ret;
	vm_map_size_t sync_size;
	vm_map_entry_t entry;

	/* reject ranges outside the map's addressable limits */
	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}
	/* reject ranges whose size computation would wrap */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here.    */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS;                                                                             /* Assume it all worked */

	/*
	 * MATTR_CACHE: walk the range entry by entry so the resident
	 * physical pages can be located and synced individually.
	 */
	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t   sub_size;
			/* clamp this pass to the end of the current entry */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/* recurse into the submap for this portion */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t               m;
				vm_object_t             object;
				vm_object_t             base_object;
				vm_object_t             last_object;
				vm_object_offset_t      offset;
				vm_object_offset_t      base_offset;
				vm_map_size_t           range;
				range = sub_size;
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						/* resident page: sync its cache lines */
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						/*
						 * Page not resident here: descend the
						 * shadow chain, hand-over-hand locking
						 * so we never hold two object locks
						 * except during the hand-off.
						 */
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					/*
					 * Return to the top of the shadow chain
					 * before looking up the next page.
					 */
					if (base_object != object) {
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the range: give up */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
16034 
16035 /*
16036  *	vm_map_behavior_set:
16037  *
16038  *	Sets the paging reference behavior of the specified address
16039  *	range in the target map.  Paging reference behavior affects
16040  *	how pagein operations resulting from faults on the map will be
16041  *	clustered.
16042  */
kern_return_t
vm_map_behavior_set(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_behavior_t   new_behavior)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;

	/* sanity-check the range against the map's limits */
	if (start > end ||
	    start < vm_map_min(map) ||
	    end > vm_map_max(map)) {
		return KERN_NO_SPACE;
	}
	/* reject ranges whose size computation would wrap */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	switch (new_behavior) {
	/*
	 * This first block of behaviors all set a persistent state on the specified
	 * memory range.  All we have to do here is to record the desired behavior
	 * in the vm_map_entry_t's.
	 */

	case VM_BEHAVIOR_DEFAULT:
	case VM_BEHAVIOR_RANDOM:
	case VM_BEHAVIOR_SEQUENTIAL:
	case VM_BEHAVIOR_RSEQNTL:
	case VM_BEHAVIOR_ZERO_WIRED_PAGES:
		vm_map_lock(map);

		/*
		 *	The entire address range must be valid for the map.
		 *      Note that vm_map_range_check() does a
		 *	vm_map_lookup_entry() internally and returns the
		 *	entry containing the start of the address range if
		 *	the entire range is valid.
		 */
		if (vm_map_range_check(map, start, end, &temp_entry)) {
			entry = temp_entry;
			/* split the first entry so only [start, ...) is tagged */
			vm_map_clip_start(map, entry, start);
		} else {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
			/* split the last entry so only [..., end) is tagged */
			vm_map_clip_end(map, entry, end);
			if (entry->is_sub_map) {
				assert(!entry->use_pmap);
			}

			if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
				/* this behavior is a separate flag, not a "behavior" value */
				entry->zero_wired_pages = TRUE;
			} else {
				entry->behavior = new_behavior;
			}
			entry = entry->vme_next;
		}

		vm_map_unlock(map);
		break;

	/*
	 * The rest of these are different from the above in that they cause
	 * an immediate action to take place as opposed to setting a behavior that
	 * affects future actions.
	 */

	case VM_BEHAVIOR_WILLNEED:
		return vm_map_willneed(map, start, end);

	case VM_BEHAVIOR_DONTNEED:
		return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_FREE:
		return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);

	case VM_BEHAVIOR_REUSABLE:
		return vm_map_reusable_pages(map, start, end);

	case VM_BEHAVIOR_REUSE:
		return vm_map_reuse_pages(map, start, end);

	case VM_BEHAVIOR_CAN_REUSE:
		return vm_map_can_reuse(map, start, end);

#if MACH_ASSERT
	case VM_BEHAVIOR_PAGEOUT:
		return vm_map_pageout(map, start, end);
#endif /* MACH_ASSERT */

	case VM_BEHAVIOR_ZERO:
		return vm_map_zero(map, start, end);

	default:
		return KERN_INVALID_ARGUMENT;
	}

	return KERN_SUCCESS;
}
16146 
16147 
16148 /*
16149  * Internals for madvise(MADV_WILLNEED) system call.
16150  *
16151  * The implementation is to do:-
16152  * a) read-ahead if the mapping corresponds to a mapped regular file
16153  * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16154  */
16155 
16156 
/*
 * vm_map_willneed:
 *
 * Implementation of madvise(MADV_WILLNEED): for each entry in
 * [start, end), either pre-fault anonymous memory or issue an
 * asynchronous read-ahead request to the backing pager.  The range
 * must be fully mapped (no holes), otherwise KERN_INVALID_ADDRESS
 * is returned.  Per madvise semantics, I/O failures are not reported
 * to the caller.
 */
static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
	    task_pid(current_task()), start, end);
	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
	fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth       = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
		    task_pid(current_task()), start, KERN_INVALID_ADDRESS);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset    = offset;
		fault_info.hi_offset    = offset + len;
		fault_info.user_tag     = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			/* entry uses alternate accounting */
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			/* drop the map lock: pre-faulting can block */
			vm_map_unlock_read(map);

			while (region_size) {
				/* fault in one page at a time */
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/* keep the object's pager alive across the unlocked request */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
				    task_pid(current_task()), start, kr);
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
			    task_pid(current_task()), start, KERN_SUCCESS);
			return KERN_SUCCESS;
		}

		/* look up next entry */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
			    task_pid(current_task()), start, KERN_INVALID_ADDRESS);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
	    task_pid(current_task()), start, KERN_SUCCESS);
	return KERN_SUCCESS;
}
16360 
16361 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16362 vm_map_entry_is_reusable(
16363 	vm_map_entry_t entry)
16364 {
16365 	/* Only user map entries */
16366 
16367 	vm_object_t object;
16368 
16369 	if (entry->is_sub_map) {
16370 		return FALSE;
16371 	}
16372 
16373 	switch (VME_ALIAS(entry)) {
16374 	case VM_MEMORY_MALLOC:
16375 	case VM_MEMORY_MALLOC_SMALL:
16376 	case VM_MEMORY_MALLOC_LARGE:
16377 	case VM_MEMORY_REALLOC:
16378 	case VM_MEMORY_MALLOC_TINY:
16379 	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16380 	case VM_MEMORY_MALLOC_LARGE_REUSED:
16381 		/*
16382 		 * This is a malloc() memory region: check if it's still
16383 		 * in its original state and can be re-used for more
16384 		 * malloc() allocations.
16385 		 */
16386 		break;
16387 	default:
16388 		/*
16389 		 * Not a malloc() memory region: let the caller decide if
16390 		 * it's re-usable.
16391 		 */
16392 		return TRUE;
16393 	}
16394 
16395 	if (/*entry->is_shared ||*/
16396 		entry->is_sub_map ||
16397 		entry->in_transition ||
16398 		entry->protection != VM_PROT_DEFAULT ||
16399 		entry->max_protection != VM_PROT_ALL ||
16400 		entry->inheritance != VM_INHERIT_DEFAULT ||
16401 		entry->no_cache ||
16402 		entry->vme_permanent ||
16403 		entry->superpage_size != FALSE ||
16404 		entry->zero_wired_pages ||
16405 		entry->wired_count != 0 ||
16406 		entry->user_wired_count != 0) {
16407 		return FALSE;
16408 	}
16409 
16410 	object = VME_OBJECT(entry);
16411 	if (object == VM_OBJECT_NULL) {
16412 		return TRUE;
16413 	}
16414 	if (
16415 #if 0
16416 		/*
16417 		 * Let's proceed even if the VM object is potentially
16418 		 * shared.
16419 		 * We check for this later when processing the actual
16420 		 * VM pages, so the contents will be safe if shared.
16421 		 *
16422 		 * But we can still mark this memory region as "reusable" to
16423 		 * acknowledge that the caller did let us know that the memory
16424 		 * could be re-used and should not be penalized for holding
16425 		 * on to it.  This allows its "resident size" to not include
16426 		 * the reusable range.
16427 		 */
16428 		object->ref_count == 1 &&
16429 #endif
16430 		object->vo_copy == VM_OBJECT_NULL &&
16431 		object->shadow == VM_OBJECT_NULL &&
16432 		object->internal &&
16433 		object->purgable == VM_PURGABLE_DENY &&
16434 		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16435 		!object->code_signed) {
16436 		return TRUE;
16437 	}
16438 	return FALSE;
16439 }
16440 
/*
 * vm_map_reuse_pages:
 *
 * Implementation of madvise(MADV_FREE_REUSE): tell the VM that the
 * pages in [start, end) are in use again, undoing a prior "reusable"
 * marking on the backing objects.  The range must be fully mapped and
 * every entry must pass vm_map_entry_is_reusable(), otherwise
 * KERN_INVALID_ADDRESS is returned.
 */
static kern_return_t
vm_map_reuse_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;

	/*
	 * The MADV_REUSE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a
		 * portion of a native page.
		 */
		return KERN_SUCCESS;
	}

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reuse_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reuse_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
		} else {
			start_offset = 0;
		}
		/* clamp to the end of the entry or the end of the range */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		/* translate map offsets into object offsets */
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object != VM_OBJECT_NULL) {
			/* un-mark the pages in this object range as reusable */
			vm_object_lock(object);
			vm_object_reuse_pages(object, start_offset, end_offset,
			    TRUE);
			vm_object_unlock(object);
		}

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reuse_pages_success++;
	return KERN_SUCCESS;
}
16533 
16534 
/*
 * vm_map_reusable_pages:
 *
 * Implementation of madvise(MADV_FREE_REUSABLE): mark the pages in
 * [start, end) as re-usable, allowing the VM to discard their contents
 * and exclude them from the task's resident footprint.  The range must
 * be fully mapped, every entry must pass vm_map_entry_is_reusable(),
 * and the mappings must be writable (the contents will be discarded).
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;
	vm_map_offset_t                 pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* -1: skip deactivation; 1: deactivate and discard contents */
		int kill_pages = 0;
		boolean_t reusable_no_write = FALSE;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
#if __arm64e__
		    && !entry->used_for_tpro
#endif
		    ) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		/* clamp to the end of the entry or the end of the range */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		/* translate map offsets into object offsets */
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing backing this entry: skip it */
			continue;
		}

		if (entry->protection & VM_PROT_EXECUTE) {
			/*
			 * Executable mappings might be write-protected by
			 * hardware, so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		if (entry->vme_xnu_user_debug) {
			/*
			 * User debug pages might be write-protected by hardware,
			 * so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		vm_object_lock(object);
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->vo_copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				/* object shared, but safe to discard: count it */
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			/* deactivate (and possibly discard) the pages in range */
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    reusable_no_write,
			    map->pmap,
			    pmap_offset);
		} else {
			/* object too entangled to discard: just record the event */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16700 
16701 
16702 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16703 vm_map_can_reuse(
16704 	vm_map_t        map,
16705 	vm_map_offset_t start,
16706 	vm_map_offset_t end)
16707 {
16708 	vm_map_entry_t                  entry;
16709 
16710 	/*
16711 	 * The MADV_REUSABLE operation doesn't require any changes to the
16712 	 * vm_map_entry_t's, so the read lock is sufficient.
16713 	 */
16714 
16715 	vm_map_lock_read(map);
16716 	assert(map->pmap != kernel_pmap);       /* protect alias access */
16717 
16718 	/*
16719 	 * The madvise semantics require that the address range be fully
16720 	 * allocated with no holes.  Otherwise, we're required to return
16721 	 * an error.
16722 	 */
16723 
16724 	if (!vm_map_range_check(map, start, end, &entry)) {
16725 		vm_map_unlock_read(map);
16726 		vm_page_stats_reusable.can_reuse_failure++;
16727 		return KERN_INVALID_ADDRESS;
16728 	}
16729 
16730 	/*
16731 	 * Examine each vm_map_entry_t in the range.
16732 	 */
16733 	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16734 	    entry = entry->vme_next) {
16735 		/*
16736 		 * Sanity check on the VM map entry.
16737 		 */
16738 		if (!vm_map_entry_is_reusable(entry)) {
16739 			vm_map_unlock_read(map);
16740 			vm_page_stats_reusable.can_reuse_failure++;
16741 			return KERN_INVALID_ADDRESS;
16742 		}
16743 	}
16744 
16745 	vm_map_unlock_read(map);
16746 	vm_page_stats_reusable.can_reuse_success++;
16747 	return KERN_SUCCESS;
16748 }
16749 
16750 
#if MACH_ASSERT
/*
 * vm_map_pageout:
 *
 * (Debug builds only.)  Request pageout of the internal VM objects
 * backing every entry in [start, end); descends one level into
 * submaps.  The range must be fully mapped, otherwise
 * KERN_INVALID_ADDRESS is returned.
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t     object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t submap_entry;

			/* translate this entry's span into the submap's space */
			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			/* nested read lock: submap lock taken while map is held */
			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			if (submap_entry->is_sub_map) {
				/* don't recurse more than one submap level */
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				/* only internal (anonymous) objects are paged out */
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL || !object->internal) {
			/* only internal (anonymous) objects are paged out */
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
#endif /* MACH_ASSERT */
16841 
16842 /*
16843  * This function determines if the zero operation can be run on the
16844  * respective entry. Additional checks on the object are in
16845  * vm_object_zero_preflight.
16846  */
16847 static kern_return_t
vm_map_zero_entry_preflight(vm_map_entry_t entry)16848 vm_map_zero_entry_preflight(vm_map_entry_t entry)
16849 {
16850 	/*
16851 	 * Zeroing is restricted to writable non-executable entries and non-JIT
16852 	 * regions.
16853 	 */
16854 	if (!(entry->protection & VM_PROT_WRITE) ||
16855 	    (entry->protection & VM_PROT_EXECUTE) ||
16856 	    entry->used_for_jit ||
16857 	    entry->vme_xnu_user_debug) {
16858 		return KERN_PROTECTION_FAILURE;
16859 	}
16860 
16861 	/*
16862 	 * Zeroing for copy on write isn't yet supported. Zeroing is also not
16863 	 * allowed for submaps.
16864 	 */
16865 	if (entry->needs_copy || entry->is_sub_map) {
16866 		return KERN_NO_ACCESS;
16867 	}
16868 
16869 	return KERN_SUCCESS;
16870 }
16871 
16872 /*
16873  * This function translates entry's start and end to offsets in the object
16874  */
16875 static void
vm_map_get_bounds_in_object(vm_map_entry_t entry,vm_map_offset_t start,vm_map_offset_t end,vm_map_offset_t * start_offset,vm_map_offset_t * end_offset)16876 vm_map_get_bounds_in_object(
16877 	vm_map_entry_t      entry,
16878 	vm_map_offset_t     start,
16879 	vm_map_offset_t     end,
16880 	vm_map_offset_t    *start_offset,
16881 	vm_map_offset_t    *end_offset)
16882 {
16883 	if (entry->vme_start < start) {
16884 		*start_offset = start - entry->vme_start;
16885 	} else {
16886 		*start_offset = 0;
16887 	}
16888 	*end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16889 	*start_offset += VME_OFFSET(entry);
16890 	*end_offset += VME_OFFSET(entry);
16891 }
16892 
/*
 * This function iterates through the entries in the requested range
 * and zeroes any resident pages in the corresponding objects. Compressed
 * pages are dropped instead of being faulted in and zeroed.
 *
 * The map read lock is dropped around each vm_object_zero() call; the
 * map timestamp is used to detect concurrent map changes, in which case
 * the range is re-validated and re-looked-up from the current cursor.
 *
 * Returns:
 *	KERN_SUCCESS		entire range processed
 *	KERN_NO_ACCESS		map page size < HW page size, or an entry
 *				is a submap / needs-copy (via preflight)
 *	KERN_PROTECTION_FAILURE	an entry is non-writable, executable,
 *				JIT or user-debug (via preflight)
 *	KERN_INVALID_ADDRESS	range has holes, either initially or after
 *				the map changed while unlocked
 *	other			whatever vm_object_zero() returns on failure
 */
static kern_return_t
vm_map_zero(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_map_offset_t                 cur = start;    /* progress cursor into [start, end) */
	kern_return_t                   ret;

	/*
	 * This operation isn't supported where the map page size is less than
	 * the hardware page size. Caller will need to handle error and
	 * explicitly zero memory if needed.
	 */
	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		return KERN_NO_ACCESS;
	}

	/*
	 * The MADV_ZERO operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */
	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap);       /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes. Otherwise, we're required to return
	 * an error. This check needs to be redone if the map has changed.
	 */
	if (!vm_map_range_check(map, cur, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
		vm_map_offset_t cur_offset;
		vm_map_offset_t end_offset;
		/* snapshot the timestamp so map changes can be detected below */
		unsigned int last_timestamp = map->timestamp;
		vm_object_t object = VME_OBJECT(entry);

		/* reject entries that must not be zeroed (RO/exec/JIT/COW/submap) */
		ret = vm_map_zero_entry_preflight(entry);
		if (ret != KERN_SUCCESS) {
			vm_map_unlock_read(map);
			return ret;
		}

		/* no backing object: nothing resident to zero, move on */
		if (object == VM_OBJECT_NULL) {
			entry = entry->vme_next;
			continue;
		}

		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
		vm_object_lock(object);
		/*
		 * Take a reference on the object as vm_object_zero will drop the object
		 * lock when it encounters a busy page.
		 */
		vm_object_reference_locked(object);
		vm_map_unlock_read(map);

		ret = vm_object_zero(object, cur_offset, end_offset);
		vm_object_unlock(object);
		vm_object_deallocate(object);
		if (ret != KERN_SUCCESS) {
			return ret;
		}
		/*
		 * Update cur as vm_object_zero has succeeded.
		 */
		cur += (end_offset - cur_offset);
		if (cur == end) {
			/* whole range done; map lock is not held here */
			return KERN_SUCCESS;
		}

		/*
		 * If the map timestamp has changed, restart by relooking up cur in the
		 * map
		 */
		vm_map_lock_read(map);
		if (last_timestamp != map->timestamp) {
			/*
			 * Relookup cur in the map
			 */
			if (!vm_map_range_check(map, cur, end, &entry)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
			continue;
		}
		/*
		 * If the map hasn't changed proceed with the next entry
		 */
		entry = entry->vme_next;
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
17001 
17002 
/*
 *	Routine:	vm_map_entry_insert
 *
 *	Description:	This routine inserts a new vm_entry in a locked map.
 *
 *	The map must be held with the exclusive (write) lock.  The new
 *	entry covers [start, end) and is linked after "insp_entry".
 *	"object" is interpreted as a submap when vmk_flags.vmkf_submap is
 *	set.  Returns the newly created (and linked) entry; the map's size
 *	and hints are updated accordingly.
 */
static vm_map_entry_t
vm_map_entry_insert(
	vm_map_t                map,
	vm_map_entry_t          insp_entry,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_map_kernel_flags_t   vmk_flags,
	boolean_t               needs_copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance,
	boolean_t               clear_map_aligned)
{
	vm_map_entry_t  new_entry;
	boolean_t map_aligned = FALSE;

	assert(insp_entry != (vm_map_entry_t)0);
	vm_map_lock_assert_exclusive(map);

	/* (end - start) + offset must not wrap the object offset space */
	__assert_only vm_object_offset_t      end_offset = 0;
	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);

	/* track map alignment only when the map's page size differs from the HW page size */
	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		map_aligned = TRUE;
	}
	if (clear_map_aligned &&
	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
		map_aligned = FALSE;
	}
	if (map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
	} else {
		assert(page_aligned(start));
		assert(page_aligned(end));
	}
	assert(start < end);

	new_entry = vm_map_entry_create(map);

	new_entry->vme_start = start;
	new_entry->vme_end = end;

	if (vmk_flags.vmkf_submap) {
		/* "object" actually carries a vm_map_t when inserting a submap */
		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
	} else {
		VME_OBJECT_SET(new_entry, object, false, 0);
	}
	VME_OFFSET_SET(new_entry, offset);
	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);

	new_entry->map_aligned = map_aligned;
	new_entry->needs_copy = needs_copy;
	new_entry->inheritance = inheritance;
	new_entry->protection = cur_protection;
	new_entry->max_protection = max_protection;
	/*
	 * submap: "use_pmap" means "nested".
	 * default: false.
	 *
	 * object: "use_pmap" means "use pmap accounting" for footprint.
	 * default: true.
	 */
	new_entry->use_pmap = !vmk_flags.vmkf_submap;
	new_entry->no_cache = vmk_flags.vmf_no_cache;
	new_entry->vme_permanent = vmk_flags.vmf_permanent;
	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);

	if (vmk_flags.vmkf_map_jit) {
		/* only the first JIT entry is honored unless the map policy allows multiple */
		if (!(map->jit_entry_exists) ||
		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
			new_entry->used_for_jit = TRUE;
			map->jit_entry_exists = TRUE;
		}
	}

	/*
	 *	Insert the new entry into the list.
	 */

	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
	map->size += end - start;

	/*
	 *	Update the free space hint and the lookup hint.
	 */

	SAVE_HINT_MAP_WRITE(map, new_entry);
	return new_entry;
}
17104 
17105 /*
17106  *	Routine:	vm_map_remap_extract
17107  *
17108  *	Description:	This routine returns a vm_entry list from a map.
17109  */
17110 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)17111 vm_map_remap_extract(
17112 	vm_map_t                map,
17113 	vm_map_offset_t         addr,
17114 	vm_map_size_t           size,
17115 	boolean_t               copy,
17116 	vm_map_copy_t           map_copy,
17117 	vm_prot_t               *cur_protection,   /* IN/OUT */
17118 	vm_prot_t               *max_protection,   /* IN/OUT */
17119 	/* What, no behavior? */
17120 	vm_inherit_t            inheritance,
17121 	vm_map_kernel_flags_t   vmk_flags)
17122 {
17123 	struct vm_map_header   *map_header = &map_copy->cpy_hdr;
17124 	kern_return_t           result;
17125 	vm_map_size_t           mapped_size;
17126 	vm_map_size_t           tmp_size;
17127 	vm_map_entry_t          src_entry;     /* result of last map lookup */
17128 	vm_map_entry_t          new_entry;
17129 	vm_object_offset_t      offset;
17130 	vm_map_offset_t         map_address;
17131 	vm_map_offset_t         src_start;     /* start of entry to map */
17132 	vm_map_offset_t         src_end;       /* end of region to be mapped */
17133 	vm_object_t             object;
17134 	vm_map_version_t        version;
17135 	boolean_t               src_needs_copy;
17136 	boolean_t               new_entry_needs_copy;
17137 	vm_map_entry_t          saved_src_entry;
17138 	boolean_t               src_entry_was_wired;
17139 	vm_prot_t               max_prot_for_prot_copy;
17140 	vm_map_offset_t         effective_page_mask;
17141 	bool                    pageable, same_map;
17142 	boolean_t               vm_remap_legacy;
17143 	vm_prot_t               required_cur_prot, required_max_prot;
17144 	vm_object_t             new_copy_object;     /* vm_object_copy_* result */
17145 	boolean_t               saved_used_for_jit;  /* Saved used_for_jit. */
17146 
17147 	pageable = vmk_flags.vmkf_copy_pageable;
17148 	same_map = vmk_flags.vmkf_copy_same_map;
17149 
17150 	effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17151 
17152 	assert(map != VM_MAP_NULL);
17153 	assert(size != 0);
17154 	assert(size == vm_map_round_page(size, effective_page_mask));
17155 	assert(inheritance == VM_INHERIT_NONE ||
17156 	    inheritance == VM_INHERIT_COPY ||
17157 	    inheritance == VM_INHERIT_SHARE);
17158 	assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17159 	assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17160 	assert((*cur_protection & *max_protection) == *cur_protection);
17161 
17162 	/*
17163 	 *	Compute start and end of region.
17164 	 */
17165 	src_start = vm_map_trunc_page(addr, effective_page_mask);
17166 	src_end = vm_map_round_page(src_start + size, effective_page_mask);
17167 
17168 	/*
17169 	 *	Initialize map_header.
17170 	 */
17171 	map_header->nentries = 0;
17172 	map_header->entries_pageable = pageable;
17173 //	map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17174 	map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17175 	map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17176 	vm_map_store_init(map_header);
17177 
17178 	if (copy && vmk_flags.vmkf_remap_prot_copy) {
17179 		/*
17180 		 * Special case for vm_map_protect(VM_PROT_COPY):
17181 		 * we want to set the new mappings' max protection to the
17182 		 * specified *max_protection...
17183 		 */
17184 		max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17185 		/* ... but we want to use the vm_remap() legacy mode */
17186 		*max_protection = VM_PROT_NONE;
17187 		*cur_protection = VM_PROT_NONE;
17188 	} else {
17189 		max_prot_for_prot_copy = VM_PROT_NONE;
17190 	}
17191 
17192 	if (*cur_protection == VM_PROT_NONE &&
17193 	    *max_protection == VM_PROT_NONE) {
17194 		/*
17195 		 * vm_remap() legacy mode:
17196 		 * Extract all memory regions in the specified range and
17197 		 * collect the strictest set of protections allowed on the
17198 		 * entire range, so the caller knows what they can do with
17199 		 * the remapped range.
17200 		 * We start with VM_PROT_ALL and we'll remove the protections
17201 		 * missing from each memory region.
17202 		 */
17203 		vm_remap_legacy = TRUE;
17204 		*cur_protection = VM_PROT_ALL;
17205 		*max_protection = VM_PROT_ALL;
17206 		required_cur_prot = VM_PROT_NONE;
17207 		required_max_prot = VM_PROT_NONE;
17208 	} else {
17209 		/*
17210 		 * vm_remap_new() mode:
17211 		 * Extract all memory regions in the specified range and
17212 		 * ensure that they have at least the protections specified
17213 		 * by the caller via *cur_protection and *max_protection.
17214 		 * The resulting mapping should have these protections.
17215 		 */
17216 		vm_remap_legacy = FALSE;
17217 		if (copy) {
17218 			required_cur_prot = VM_PROT_NONE;
17219 			required_max_prot = VM_PROT_READ;
17220 		} else {
17221 			required_cur_prot = *cur_protection;
17222 			required_max_prot = *max_protection;
17223 		}
17224 	}
17225 
17226 	map_address = 0;
17227 	mapped_size = 0;
17228 	result = KERN_SUCCESS;
17229 
17230 	/*
17231 	 *	The specified source virtual space might correspond to
17232 	 *	multiple map entries, need to loop on them.
17233 	 */
17234 	vm_map_lock(map);
17235 
17236 	if (map->pmap == kernel_pmap) {
17237 		map_copy->is_kernel_range = true;
17238 		map_copy->orig_range = kmem_addr_get_range(addr, size);
17239 #if CONFIG_MAP_RANGES
17240 	} else if (map->uses_user_ranges) {
17241 		map_copy->is_user_range = true;
17242 		map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17243 #endif /* CONFIG_MAP_RANGES */
17244 	}
17245 
17246 	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17247 		/*
17248 		 * This address space uses sub-pages so the range might
17249 		 * not be re-mappable in an address space with larger
17250 		 * pages. Re-assemble any broken-up VM map entries to
17251 		 * improve our chances of making it work.
17252 		 */
17253 		vm_map_simplify_range(map, src_start, src_end);
17254 	}
17255 	while (mapped_size != size) {
17256 		vm_map_size_t   entry_size;
17257 
17258 		/*
17259 		 *	Find the beginning of the region.
17260 		 */
17261 		if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17262 			result = KERN_INVALID_ADDRESS;
17263 			break;
17264 		}
17265 
17266 		if (src_start < src_entry->vme_start ||
17267 		    (mapped_size && src_start != src_entry->vme_start)) {
17268 			result = KERN_INVALID_ADDRESS;
17269 			break;
17270 		}
17271 
17272 		tmp_size = size - mapped_size;
17273 		if (src_end > src_entry->vme_end) {
17274 			tmp_size -= (src_end - src_entry->vme_end);
17275 		}
17276 
17277 		entry_size = (vm_map_size_t)(src_entry->vme_end -
17278 		    src_entry->vme_start);
17279 
17280 		if (src_entry->is_sub_map &&
17281 		    vmk_flags.vmkf_copy_single_object) {
17282 			vm_map_t submap;
17283 			vm_map_offset_t submap_start;
17284 			vm_map_size_t submap_size;
17285 			boolean_t submap_needs_copy;
17286 
17287 			/*
17288 			 * No check for "required protection" on "src_entry"
17289 			 * because the protections that matter are the ones
17290 			 * on the submap's VM map entry, which will be checked
17291 			 * during the call to vm_map_remap_extract() below.
17292 			 */
17293 			object = VM_OBJECT_NULL;
17294 
17295 			submap_size = src_entry->vme_end - src_start;
17296 			if (submap_size > size) {
17297 				submap_size = size;
17298 			}
17299 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17300 			submap = VME_SUBMAP(src_entry);
17301 			if (copy) {
17302 				/*
17303 				 * The caller wants a copy-on-write re-mapping,
17304 				 * so let's extract from the submap accordingly.
17305 				 */
17306 				submap_needs_copy = TRUE;
17307 			} else if (src_entry->needs_copy) {
17308 				/*
17309 				 * The caller wants a shared re-mapping but the
17310 				 * submap is mapped with "needs_copy", so its
17311 				 * contents can't be shared as is. Extract the
17312 				 * contents of the submap as "copy-on-write".
17313 				 * The re-mapping won't be shared with the
17314 				 * original mapping but this is equivalent to
17315 				 * what happened with the original "remap from
17316 				 * submap" code.
17317 				 * The shared region is mapped "needs_copy", for
17318 				 * example.
17319 				 */
17320 				submap_needs_copy = TRUE;
17321 			} else {
17322 				/*
17323 				 * The caller wants a shared re-mapping and
17324 				 * this mapping can be shared (no "needs_copy"),
17325 				 * so let's extract from the submap accordingly.
17326 				 * Kernel submaps are mapped without
17327 				 * "needs_copy", for example.
17328 				 */
17329 				submap_needs_copy = FALSE;
17330 			}
17331 			vm_map_reference(submap);
17332 			vm_map_unlock(map);
17333 			src_entry = NULL;
17334 			if (vm_remap_legacy) {
17335 				*cur_protection = VM_PROT_NONE;
17336 				*max_protection = VM_PROT_NONE;
17337 			}
17338 
17339 			DTRACE_VM7(remap_submap_recurse,
17340 			    vm_map_t, map,
17341 			    vm_map_offset_t, addr,
17342 			    vm_map_size_t, size,
17343 			    boolean_t, copy,
17344 			    vm_map_offset_t, submap_start,
17345 			    vm_map_size_t, submap_size,
17346 			    boolean_t, submap_needs_copy);
17347 
17348 			result = vm_map_remap_extract(submap,
17349 			    submap_start,
17350 			    submap_size,
17351 			    submap_needs_copy,
17352 			    map_copy,
17353 			    cur_protection,
17354 			    max_protection,
17355 			    inheritance,
17356 			    vmk_flags);
17357 			vm_map_deallocate(submap);
17358 
17359 			if (result == KERN_SUCCESS &&
17360 			    submap_needs_copy &&
17361 			    !copy) {
17362 				/*
17363 				 * We were asked for a "shared"
17364 				 * re-mapping but had to ask for a
17365 				 * "copy-on-write" remapping of the
17366 				 * submap's mapping to honor the
17367 				 * submap's "needs_copy".
17368 				 * We now need to resolve that
17369 				 * pending "copy-on-write" to
17370 				 * get something we can share.
17371 				 */
17372 				vm_map_entry_t copy_entry;
17373 				vm_object_offset_t copy_offset;
17374 				vm_map_size_t copy_size;
17375 				vm_object_t copy_object;
17376 				copy_entry = vm_map_copy_first_entry(map_copy);
17377 				copy_size = copy_entry->vme_end - copy_entry->vme_start;
17378 				copy_object = VME_OBJECT(copy_entry);
17379 				copy_offset = VME_OFFSET(copy_entry);
17380 				if (copy_object == VM_OBJECT_NULL) {
17381 					assert(copy_offset == 0);
17382 					assert(!copy_entry->needs_copy);
17383 					if (copy_entry->max_protection == VM_PROT_NONE) {
17384 						assert(copy_entry->protection == VM_PROT_NONE);
17385 						/* nothing to share */
17386 					} else {
17387 						assert(copy_offset == 0);
17388 						copy_object = vm_object_allocate(copy_size);
17389 						VME_OFFSET_SET(copy_entry, 0);
17390 						VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17391 						assert(copy_entry->use_pmap);
17392 					}
17393 				} else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17394 					/* already shareable */
17395 					assert(!copy_entry->needs_copy);
17396 				} else if (copy_entry->needs_copy ||
17397 				    copy_object->shadowed ||
17398 				    (copy_object->internal &&
17399 				    !copy_object->true_share &&
17400 				    !copy_entry->is_shared &&
17401 				    copy_object->vo_size > copy_size)) {
17402 					VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17403 					assert(copy_entry->use_pmap);
17404 					if (copy_entry->needs_copy) {
17405 						/* already write-protected */
17406 					} else {
17407 						vm_prot_t prot;
17408 						prot = copy_entry->protection & ~VM_PROT_WRITE;
17409 						vm_object_pmap_protect(copy_object,
17410 						    copy_offset,
17411 						    copy_size,
17412 						    PMAP_NULL,
17413 						    PAGE_SIZE,
17414 						    0,
17415 						    prot);
17416 					}
17417 					copy_entry->needs_copy = FALSE;
17418 				}
17419 				copy_object = VME_OBJECT(copy_entry);
17420 				copy_offset = VME_OFFSET(copy_entry);
17421 				if (copy_object &&
17422 				    copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17423 					copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17424 					copy_object->true_share = TRUE;
17425 				}
17426 			}
17427 
17428 			return result;
17429 		}
17430 
17431 		if (src_entry->is_sub_map) {
17432 			/* protections for submap mapping are irrelevant here */
17433 		} else if (((src_entry->protection & required_cur_prot) !=
17434 		    required_cur_prot) ||
17435 		    ((src_entry->max_protection & required_max_prot) !=
17436 		    required_max_prot)) {
17437 			if (vmk_flags.vmkf_copy_single_object &&
17438 			    mapped_size != 0) {
17439 				/*
17440 				 * Single object extraction.
17441 				 * We can't extract more with the required
17442 				 * protection but we've extracted some, so
17443 				 * stop there and declare success.
17444 				 * The caller should check the size of
17445 				 * the copy entry we've extracted.
17446 				 */
17447 				result = KERN_SUCCESS;
17448 			} else {
17449 				/*
17450 				 * VM range extraction.
17451 				 * Required proctection is not available
17452 				 * for this part of the range: fail.
17453 				 */
17454 				result = KERN_PROTECTION_FAILURE;
17455 			}
17456 			break;
17457 		}
17458 
17459 		if (src_entry->is_sub_map) {
17460 			vm_map_t submap;
17461 			vm_map_offset_t submap_start;
17462 			vm_map_size_t submap_size;
17463 			vm_map_copy_t submap_copy;
17464 			vm_prot_t submap_curprot, submap_maxprot;
17465 			boolean_t submap_needs_copy;
17466 
17467 			/*
17468 			 * No check for "required protection" on "src_entry"
17469 			 * because the protections that matter are the ones
17470 			 * on the submap's VM map entry, which will be checked
17471 			 * during the call to vm_map_copy_extract() below.
17472 			 */
17473 			object = VM_OBJECT_NULL;
17474 			submap_copy = VM_MAP_COPY_NULL;
17475 
17476 			/* find equivalent range in the submap */
17477 			submap = VME_SUBMAP(src_entry);
17478 			submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17479 			submap_size = tmp_size;
17480 			if (copy) {
17481 				/*
17482 				 * The caller wants a copy-on-write re-mapping,
17483 				 * so let's extract from the submap accordingly.
17484 				 */
17485 				submap_needs_copy = TRUE;
17486 			} else if (src_entry->needs_copy) {
17487 				/*
17488 				 * The caller wants a shared re-mapping but the
17489 				 * submap is mapped with "needs_copy", so its
17490 				 * contents can't be shared as is. Extract the
17491 				 * contents of the submap as "copy-on-write".
17492 				 * The re-mapping won't be shared with the
17493 				 * original mapping but this is equivalent to
17494 				 * what happened with the original "remap from
17495 				 * submap" code.
17496 				 * The shared region is mapped "needs_copy", for
17497 				 * example.
17498 				 */
17499 				submap_needs_copy = TRUE;
17500 			} else {
17501 				/*
17502 				 * The caller wants a shared re-mapping and
17503 				 * this mapping can be shared (no "needs_copy"),
17504 				 * so let's extract from the submap accordingly.
17505 				 * Kernel submaps are mapped without
17506 				 * "needs_copy", for example.
17507 				 */
17508 				submap_needs_copy = FALSE;
17509 			}
17510 			/* extra ref to keep submap alive */
17511 			vm_map_reference(submap);
17512 
17513 			DTRACE_VM7(remap_submap_recurse,
17514 			    vm_map_t, map,
17515 			    vm_map_offset_t, addr,
17516 			    vm_map_size_t, size,
17517 			    boolean_t, copy,
17518 			    vm_map_offset_t, submap_start,
17519 			    vm_map_size_t, submap_size,
17520 			    boolean_t, submap_needs_copy);
17521 
17522 			/*
17523 			 * The map can be safely unlocked since we
17524 			 * already hold a reference on the submap.
17525 			 *
17526 			 * No timestamp since we don't care if the map
17527 			 * gets modified while we're down in the submap.
17528 			 * We'll resume the extraction at src_start + tmp_size
17529 			 * anyway.
17530 			 */
17531 			vm_map_unlock(map);
17532 			src_entry = NULL; /* not valid once map is unlocked */
17533 
17534 			if (vm_remap_legacy) {
17535 				submap_curprot = VM_PROT_NONE;
17536 				submap_maxprot = VM_PROT_NONE;
17537 				if (max_prot_for_prot_copy) {
17538 					submap_maxprot = max_prot_for_prot_copy;
17539 				}
17540 			} else {
17541 				assert(!max_prot_for_prot_copy);
17542 				submap_curprot = *cur_protection;
17543 				submap_maxprot = *max_protection;
17544 			}
17545 			result = vm_map_copy_extract(submap,
17546 			    submap_start,
17547 			    submap_size,
17548 			    submap_needs_copy,
17549 			    &submap_copy,
17550 			    &submap_curprot,
17551 			    &submap_maxprot,
17552 			    inheritance,
17553 			    vmk_flags);
17554 
17555 			/* release extra ref on submap */
17556 			vm_map_deallocate(submap);
17557 			submap = VM_MAP_NULL;
17558 
17559 			if (result != KERN_SUCCESS) {
17560 				vm_map_lock(map);
17561 				break;
17562 			}
17563 
17564 			/* transfer submap_copy entries to map_header */
17565 			while (vm_map_copy_first_entry(submap_copy) !=
17566 			    vm_map_copy_to_entry(submap_copy)) {
17567 				vm_map_entry_t copy_entry;
17568 				vm_map_size_t copy_entry_size;
17569 
17570 				copy_entry = vm_map_copy_first_entry(submap_copy);
17571 
17572 				/*
17573 				 * Prevent kernel_object from being exposed to
17574 				 * user space.
17575 				 */
17576 				if (__improbable(copy_entry->vme_kernel_object)) {
17577 					printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17578 					    proc_selfpid(),
17579 					    (get_bsdtask_info(current_task())
17580 					    ? proc_name_address(get_bsdtask_info(current_task()))
17581 					    : "?"));
17582 					DTRACE_VM(extract_kernel_only);
17583 					result = KERN_INVALID_RIGHT;
17584 					vm_map_copy_discard(submap_copy);
17585 					submap_copy = VM_MAP_COPY_NULL;
17586 					vm_map_lock(map);
17587 					break;
17588 				}
17589 
17590 #ifdef __arm64e__
17591 				if (vmk_flags.vmkf_tpro_enforcement_override) {
17592 					copy_entry->used_for_tpro = FALSE;
17593 				}
17594 #endif /* __arm64e__ */
17595 
17596 				vm_map_copy_entry_unlink(submap_copy, copy_entry);
17597 				copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17598 				copy_entry->vme_start = map_address;
17599 				copy_entry->vme_end = map_address + copy_entry_size;
17600 				map_address += copy_entry_size;
17601 				mapped_size += copy_entry_size;
17602 				src_start += copy_entry_size;
17603 				assert(src_start <= src_end);
17604 				_vm_map_store_entry_link(map_header,
17605 				    map_header->links.prev,
17606 				    copy_entry);
17607 			}
17608 			/* done with submap_copy */
17609 			vm_map_copy_discard(submap_copy);
17610 
17611 			if (vm_remap_legacy) {
17612 				*cur_protection &= submap_curprot;
17613 				*max_protection &= submap_maxprot;
17614 			}
17615 
17616 			/* re-acquire the map lock and continue to next entry */
17617 			vm_map_lock(map);
17618 			continue;
17619 		} else {
17620 			object = VME_OBJECT(src_entry);
17621 
17622 			/*
17623 			 * Prevent kernel_object from being exposed to
17624 			 * user space.
17625 			 */
17626 			if (__improbable(is_kernel_object(object))) {
17627 				printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17628 				    proc_selfpid(),
17629 				    (get_bsdtask_info(current_task())
17630 				    ? proc_name_address(get_bsdtask_info(current_task()))
17631 				    : "?"));
17632 				DTRACE_VM(extract_kernel_only);
17633 				result = KERN_INVALID_RIGHT;
17634 				break;
17635 			}
17636 
17637 			if (src_entry->iokit_acct) {
17638 				/*
17639 				 * This entry uses "IOKit accounting".
17640 				 */
17641 			} else if (object != VM_OBJECT_NULL &&
17642 			    object->internal &&
17643 			    (object->purgable != VM_PURGABLE_DENY ||
17644 			    object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17645 				/*
17646 				 * Purgeable objects have their own accounting:
17647 				 * no pmap accounting for them.
17648 				 */
17649 				assertf(!src_entry->use_pmap,
17650 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17651 				    map,
17652 				    src_entry,
17653 				    (uint64_t)src_entry->vme_start,
17654 				    (uint64_t)src_entry->vme_end,
17655 				    src_entry->protection,
17656 				    src_entry->max_protection,
17657 				    VME_ALIAS(src_entry));
17658 			} else {
17659 				/*
17660 				 * Not IOKit or purgeable:
17661 				 * must be accounted by pmap stats.
17662 				 */
17663 				assertf(src_entry->use_pmap,
17664 				    "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17665 				    map,
17666 				    src_entry,
17667 				    (uint64_t)src_entry->vme_start,
17668 				    (uint64_t)src_entry->vme_end,
17669 				    src_entry->protection,
17670 				    src_entry->max_protection,
17671 				    VME_ALIAS(src_entry));
17672 			}
17673 
17674 			if (object == VM_OBJECT_NULL) {
17675 				assert(!src_entry->needs_copy);
17676 				if (src_entry->max_protection == VM_PROT_NONE) {
17677 					assert(src_entry->protection == VM_PROT_NONE);
17678 					/*
17679 					 * No VM object and no permissions:
17680 					 * this must be a reserved range with
17681 					 * nothing to share or copy.
17682 					 * There could also be all sorts of
17683 					 * pmap shenanigans within that reserved
17684 					 * range, so let's just copy the map
17685 					 * entry as is to remap a similar
17686 					 * reserved range.
17687 					 */
17688 					offset = 0; /* no object => no offset */
17689 					goto copy_src_entry;
17690 				}
17691 				object = vm_object_allocate(entry_size);
17692 				VME_OFFSET_SET(src_entry, 0);
17693 				VME_OBJECT_SET(src_entry, object, false, 0);
17694 				assert(src_entry->use_pmap);
17695 				assert(!map->mapped_in_other_pmaps);
17696 			} else if (src_entry->wired_count ||
17697 			    object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17698 				/*
17699 				 * A wired memory region should not have
17700 				 * any pending copy-on-write and needs to
17701 				 * keep pointing at the VM object that
17702 				 * contains the wired pages.
17703 				 * If we're sharing this memory (copy=false),
17704 				 * we'll share this VM object.
17705 				 * If we're copying this memory (copy=true),
17706 				 * we'll call vm_object_copy_slowly() below
17707 				 * and use the new VM object for the remapping.
17708 				 *
17709 				 * Or, we are already using an asymmetric
17710 				 * copy, and therefore we already have
17711 				 * the right object.
17712 				 */
17713 				assert(!src_entry->needs_copy);
17714 			} else if (src_entry->needs_copy || object->shadowed ||
17715 			    (object->internal && !object->true_share &&
17716 			    !src_entry->is_shared &&
17717 			    object->vo_size > entry_size)) {
17718 				bool is_writable;
17719 
17720 				VME_OBJECT_SHADOW(src_entry, entry_size,
17721 				    vm_map_always_shadow(map));
17722 				assert(src_entry->use_pmap);
17723 
17724 				is_writable = false;
17725 				if (src_entry->protection & VM_PROT_WRITE) {
17726 					is_writable = true;
17727 #if __arm64e__
17728 				} else if (src_entry->used_for_tpro) {
17729 					is_writable = true;
17730 #endif /* __arm64e__ */
17731 				}
17732 				if (!src_entry->needs_copy && is_writable) {
17733 					vm_prot_t prot;
17734 
17735 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
17736 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17737 						    __FUNCTION__,
17738 						    map, map->pmap,
17739 						    src_entry,
17740 						    (uint64_t)src_entry->vme_start,
17741 						    (uint64_t)src_entry->vme_end,
17742 						    src_entry->protection);
17743 					}
17744 
17745 					prot = src_entry->protection & ~VM_PROT_WRITE;
17746 
17747 					if (override_nx(map,
17748 					    VME_ALIAS(src_entry))
17749 					    && prot) {
17750 						prot |= VM_PROT_EXECUTE;
17751 					}
17752 
17753 					if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
17754 						panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17755 						    __FUNCTION__,
17756 						    map, map->pmap,
17757 						    src_entry,
17758 						    (uint64_t)src_entry->vme_start,
17759 						    (uint64_t)src_entry->vme_end,
17760 						    prot);
17761 					}
17762 
17763 					if (map->mapped_in_other_pmaps) {
17764 						vm_object_pmap_protect(
17765 							VME_OBJECT(src_entry),
17766 							VME_OFFSET(src_entry),
17767 							entry_size,
17768 							PMAP_NULL,
17769 							PAGE_SIZE,
17770 							src_entry->vme_start,
17771 							prot);
17772 #if MACH_ASSERT
17773 					} else if (__improbable(map->pmap == PMAP_NULL)) {
17774 						extern boolean_t vm_tests_in_progress;
17775 						assert(vm_tests_in_progress);
17776 						/*
17777 						 * Some VM tests (in vm_tests.c)
17778 						 * sometimes want to use a VM
17779 						 * map without a pmap.
17780 						 * Otherwise, this should never
17781 						 * happen.
17782 						 */
17783 #endif /* MACH_ASSERT */
17784 					} else {
17785 						pmap_protect(vm_map_pmap(map),
17786 						    src_entry->vme_start,
17787 						    src_entry->vme_end,
17788 						    prot);
17789 					}
17790 				}
17791 
17792 				object = VME_OBJECT(src_entry);
17793 				src_entry->needs_copy = FALSE;
17794 			}
17795 
17796 
17797 			vm_object_lock(object);
17798 			vm_object_reference_locked(object); /* object ref. for new entry */
17799 			assert(!src_entry->needs_copy);
17800 			if (object->copy_strategy ==
17801 			    MEMORY_OBJECT_COPY_SYMMETRIC) {
17802 				/*
17803 				 * If we want to share this object (copy==0),
17804 				 * it needs to be COPY_DELAY.
17805 				 * If we want to copy this object (copy==1),
17806 				 * we can't just set "needs_copy" on our side
17807 				 * and expect the other side to do the same
17808 				 * (symmetrically), so we can't let the object
17809 				 * stay COPY_SYMMETRIC.
17810 				 * So we always switch from COPY_SYMMETRIC to
17811 				 * COPY_DELAY.
17812 				 */
17813 				object->copy_strategy =
17814 				    MEMORY_OBJECT_COPY_DELAY;
17815 				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
17816 			}
17817 			vm_object_unlock(object);
17818 		}
17819 
17820 		offset = (VME_OFFSET(src_entry) +
17821 		    (src_start - src_entry->vme_start));
17822 
17823 copy_src_entry:
17824 		new_entry = _vm_map_entry_create(map_header);
17825 		vm_map_entry_copy(map, new_entry, src_entry);
17826 		if (new_entry->is_sub_map) {
17827 			/* clr address space specifics */
17828 			new_entry->use_pmap = FALSE;
17829 		} else if (copy) {
17830 			/*
17831 			 * We're dealing with a copy-on-write operation,
17832 			 * so the resulting mapping should not inherit the
17833 			 * original mapping's accounting settings.
17834 			 * "use_pmap" should be reset to its default (TRUE)
17835 			 * so that the new mapping gets accounted for in
17836 			 * the task's memory footprint.
17837 			 */
17838 			new_entry->use_pmap = TRUE;
17839 		}
17840 		/* "iokit_acct" was cleared in vm_map_entry_copy() */
17841 		assert(!new_entry->iokit_acct);
17842 
17843 		new_entry->map_aligned = FALSE;
17844 
17845 		new_entry->vme_start = map_address;
17846 		new_entry->vme_end = map_address + tmp_size;
17847 		assert(new_entry->vme_start < new_entry->vme_end);
17848 		if (copy && vmk_flags.vmkf_remap_prot_copy) {
17849 			/* security: keep "permanent" and "csm_associated" */
17850 			new_entry->vme_permanent = src_entry->vme_permanent;
17851 			new_entry->csm_associated = src_entry->csm_associated;
17852 			/*
17853 			 * Remapping for vm_map_protect(VM_PROT_COPY)
17854 			 * to convert a read-only mapping into a
17855 			 * copy-on-write version of itself but
17856 			 * with write access:
17857 			 * keep the original inheritance but let's not
17858 			 * add VM_PROT_WRITE to the max protection yet
17859 			 * since we want to do more security checks against
17860 			 * the target map.
17861 			 */
17862 			new_entry->inheritance = src_entry->inheritance;
17863 			new_entry->protection &= max_prot_for_prot_copy;
17864 		} else {
17865 			new_entry->inheritance = inheritance;
17866 			if (!vm_remap_legacy) {
17867 				new_entry->protection = *cur_protection;
17868 				new_entry->max_protection = *max_protection;
17869 			}
17870 		}
17871 #ifdef __arm64e__
17872 		if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
17873 			new_entry->used_for_tpro = FALSE;
17874 		}
17875 #endif /* __arm64e__ */
17876 		VME_OFFSET_SET(new_entry, offset);
17877 
17878 		/*
17879 		 * The new region has to be copied now if required.
17880 		 */
17881 RestartCopy:
17882 		if (!copy) {
17883 			if (src_entry->used_for_jit == TRUE) {
17884 				if (same_map) {
17885 				} else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17886 					/*
17887 					 * Cannot allow an entry describing a JIT
17888 					 * region to be shared across address spaces.
17889 					 */
17890 					result = KERN_INVALID_ARGUMENT;
17891 					vm_object_deallocate(object);
17892 					vm_map_entry_dispose(new_entry);
17893 					new_entry = VM_MAP_ENTRY_NULL;
17894 					break;
17895 				}
17896 			}
17897 
17898 			if (!src_entry->is_sub_map &&
17899 			    VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
17900 				/* no accessible memory; nothing to share */
17901 				assert(src_entry->protection == VM_PROT_NONE);
17902 				assert(src_entry->max_protection == VM_PROT_NONE);
17903 				src_entry->is_shared = FALSE;
17904 			} else {
17905 				src_entry->is_shared = TRUE;
17906 			}
17907 			if (!new_entry->is_sub_map &&
17908 			    VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
17909 				/* no accessible memory; nothing to share */
17910 				assert(new_entry->protection == VM_PROT_NONE);
17911 				assert(new_entry->max_protection == VM_PROT_NONE);
17912 				new_entry->is_shared = FALSE;
17913 			} else {
17914 				new_entry->is_shared = TRUE;
17915 			}
17916 			if (!(new_entry->is_sub_map)) {
17917 				new_entry->needs_copy = FALSE;
17918 			}
17919 		} else if (src_entry->is_sub_map) {
17920 			/* make this a COW sub_map if not already */
17921 			assert(new_entry->wired_count == 0);
17922 			new_entry->needs_copy = TRUE;
17923 			object = VM_OBJECT_NULL;
17924 		} else if (src_entry->wired_count == 0 &&
17925 		    !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17926 		    vm_object_copy_quickly(VME_OBJECT(new_entry),
17927 		    VME_OFFSET(new_entry),
17928 		    (new_entry->vme_end -
17929 		    new_entry->vme_start),
17930 		    &src_needs_copy,
17931 		    &new_entry_needs_copy)) {
17932 			new_entry->needs_copy = new_entry_needs_copy;
17933 			new_entry->is_shared = FALSE;
17934 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17935 
17936 			/*
17937 			 * Handle copy_on_write semantics.
17938 			 */
17939 			if (src_needs_copy && !src_entry->needs_copy) {
17940 				vm_prot_t prot;
17941 
17942 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
17943 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17944 					    __FUNCTION__,
17945 					    map, map->pmap, src_entry,
17946 					    (uint64_t)src_entry->vme_start,
17947 					    (uint64_t)src_entry->vme_end,
17948 					    src_entry->protection);
17949 				}
17950 
17951 				prot = src_entry->protection & ~VM_PROT_WRITE;
17952 
17953 				if (override_nx(map,
17954 				    VME_ALIAS(src_entry))
17955 				    && prot) {
17956 					prot |= VM_PROT_EXECUTE;
17957 				}
17958 
17959 				if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
17960 					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17961 					    __FUNCTION__,
17962 					    map, map->pmap, src_entry,
17963 					    (uint64_t)src_entry->vme_start,
17964 					    (uint64_t)src_entry->vme_end,
17965 					    prot);
17966 				}
17967 
17968 				vm_object_pmap_protect(object,
17969 				    offset,
17970 				    entry_size,
17971 				    ((src_entry->is_shared
17972 				    || map->mapped_in_other_pmaps) ?
17973 				    PMAP_NULL : map->pmap),
17974 				    VM_MAP_PAGE_SIZE(map),
17975 				    src_entry->vme_start,
17976 				    prot);
17977 
17978 				assert(src_entry->wired_count == 0);
17979 				src_entry->needs_copy = TRUE;
17980 			}
17981 			/*
17982 			 * Throw away the old object reference of the new entry.
17983 			 */
17984 			vm_object_deallocate(object);
17985 		} else {
17986 			new_entry->is_shared = FALSE;
17987 			assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17988 
17989 			src_entry_was_wired = (src_entry->wired_count > 0);
17990 			saved_src_entry = src_entry;
17991 			src_entry = VM_MAP_ENTRY_NULL;
17992 
17993 			/*
17994 			 * The map can be safely unlocked since we
17995 			 * already hold a reference on the object.
17996 			 *
17997 			 * Record the timestamp of the map for later
17998 			 * verification, and unlock the map.
17999 			 */
18000 			version.main_timestamp = map->timestamp;
18001 			vm_map_unlock(map);     /* Increments timestamp once! */
18002 
18003 			/*
18004 			 * Perform the copy.
18005 			 */
18006 			if (src_entry_was_wired > 0 ||
18007 			    (debug4k_no_cow_copyin &&
18008 			    VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18009 				vm_object_lock(object);
18010 				result = vm_object_copy_slowly(
18011 					object,
18012 					offset,
18013 					(new_entry->vme_end -
18014 					new_entry->vme_start),
18015 					THREAD_UNINT,
18016 					&new_copy_object);
18017 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18018 				saved_used_for_jit = new_entry->used_for_jit;
18019 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18020 				new_entry->used_for_jit = saved_used_for_jit;
18021 				VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18022 				new_entry->needs_copy = FALSE;
18023 			} else {
18024 				vm_object_offset_t new_offset;
18025 
18026 				new_offset = VME_OFFSET(new_entry);
18027 				result = vm_object_copy_strategically(
18028 					object,
18029 					offset,
18030 					(new_entry->vme_end -
18031 					new_entry->vme_start),
18032 					false, /* forking */
18033 					&new_copy_object,
18034 					&new_offset,
18035 					&new_entry_needs_copy);
18036 				/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18037 				saved_used_for_jit = new_entry->used_for_jit;
18038 				VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18039 				new_entry->used_for_jit = saved_used_for_jit;
18040 				if (new_offset != VME_OFFSET(new_entry)) {
18041 					VME_OFFSET_SET(new_entry, new_offset);
18042 				}
18043 
18044 				new_entry->needs_copy = new_entry_needs_copy;
18045 			}
18046 
18047 			/*
18048 			 * Throw away the old object reference of the new entry.
18049 			 */
18050 			vm_object_deallocate(object);
18051 
18052 			if (result != KERN_SUCCESS &&
18053 			    result != KERN_MEMORY_RESTART_COPY) {
18054 				vm_map_entry_dispose(new_entry);
18055 				vm_map_lock(map);
18056 				break;
18057 			}
18058 
18059 			/*
18060 			 * Verify that the map has not substantially
18061 			 * changed while the copy was being made.
18062 			 */
18063 
18064 			vm_map_lock(map);
18065 			if (version.main_timestamp + 1 != map->timestamp) {
18066 				/*
18067 				 * Simple version comparison failed.
18068 				 *
18069 				 * Retry the lookup and verify that the
18070 				 * same object/offset are still present.
18071 				 */
18072 				saved_src_entry = VM_MAP_ENTRY_NULL;
18073 				vm_object_deallocate(VME_OBJECT(new_entry));
18074 				vm_map_entry_dispose(new_entry);
18075 				if (result == KERN_MEMORY_RESTART_COPY) {
18076 					result = KERN_SUCCESS;
18077 				}
18078 				continue;
18079 			}
18080 			/* map hasn't changed: src_entry is still valid */
18081 			src_entry = saved_src_entry;
18082 			saved_src_entry = VM_MAP_ENTRY_NULL;
18083 
18084 			if (result == KERN_MEMORY_RESTART_COPY) {
18085 				vm_object_reference(object);
18086 				goto RestartCopy;
18087 			}
18088 		}
18089 
18090 		_vm_map_store_entry_link(map_header,
18091 		    map_header->links.prev, new_entry);
18092 
18093 		/* protections for submap mapping are irrelevant here */
18094 		if (vm_remap_legacy && !src_entry->is_sub_map) {
18095 			*cur_protection &= src_entry->protection;
18096 			*max_protection &= src_entry->max_protection;
18097 		}
18098 
18099 		map_address += tmp_size;
18100 		mapped_size += tmp_size;
18101 		src_start += tmp_size;
18102 
18103 		if (vmk_flags.vmkf_copy_single_object) {
18104 			if (mapped_size != size) {
18105 				DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18106 				    map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18107 				if (src_entry->vme_next != vm_map_to_entry(map) &&
18108 				    src_entry->vme_next->vme_object_value ==
18109 				    src_entry->vme_object_value) {
18110 					/* XXX TODO4K */
18111 					DEBUG4K_ERROR("could have extended copy to next entry...\n");
18112 				}
18113 			}
18114 			break;
18115 		}
18116 	} /* end while */
18117 
18118 	vm_map_unlock(map);
18119 	if (result != KERN_SUCCESS) {
18120 		/*
18121 		 * Free all allocated elements.
18122 		 */
18123 		for (src_entry = map_header->links.next;
18124 		    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18125 		    src_entry = new_entry) {
18126 			new_entry = src_entry->vme_next;
18127 			_vm_map_store_entry_unlink(map_header, src_entry, false);
18128 			if (src_entry->is_sub_map) {
18129 				vm_map_deallocate(VME_SUBMAP(src_entry));
18130 			} else {
18131 				vm_object_deallocate(VME_OBJECT(src_entry));
18132 			}
18133 			vm_map_entry_dispose(src_entry);
18134 		}
18135 	}
18136 	return result;
18137 }
18138 
18139 bool
vm_map_is_exotic(vm_map_t map)18140 vm_map_is_exotic(
18141 	vm_map_t map)
18142 {
18143 	return VM_MAP_IS_EXOTIC(map);
18144 }
18145 
18146 bool
vm_map_is_alien(vm_map_t map)18147 vm_map_is_alien(
18148 	vm_map_t map)
18149 {
18150 	return VM_MAP_IS_ALIEN(map);
18151 }
18152 
18153 #if XNU_TARGET_OS_OSX
18154 void
vm_map_mark_alien(vm_map_t map)18155 vm_map_mark_alien(
18156 	vm_map_t map)
18157 {
18158 	vm_map_lock(map);
18159 	map->is_alien = true;
18160 	vm_map_unlock(map);
18161 }
18162 
18163 void
vm_map_single_jit(vm_map_t map)18164 vm_map_single_jit(
18165 	vm_map_t map)
18166 {
18167 	vm_map_lock(map);
18168 	map->single_jit = true;
18169 	vm_map_unlock(map);
18170 }
18171 #endif /* XNU_TARGET_OS_OSX */
18172 
18173 
18174 
18175 /*
18176  * Callers of this function must call vm_map_copy_require on
18177  * previously created vm_map_copy_t or pass a newly created
18178  * one to ensure that it hasn't been forged.
18179  */
18180 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)18181 vm_map_copy_to_physcopy(
18182 	vm_map_copy_t   copy_map,
18183 	vm_map_t        target_map)
18184 {
18185 	vm_map_size_t           size;
18186 	vm_map_entry_t          entry;
18187 	vm_map_entry_t          new_entry;
18188 	vm_object_t             new_object;
18189 	unsigned int            pmap_flags;
18190 	pmap_t                  new_pmap;
18191 	vm_map_t                new_map;
18192 	vm_map_address_t        src_start, src_end, src_cur;
18193 	vm_map_address_t        dst_start, dst_end, dst_cur;
18194 	kern_return_t           kr;
18195 	void                    *kbuf;
18196 
18197 	/*
18198 	 * Perform the equivalent of vm_allocate() and memcpy().
18199 	 * Replace the mappings in "copy_map" with the newly allocated mapping.
18200 	 */
18201 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18202 
18203 	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18204 
18205 	/* create a new pmap to map "copy_map" */
18206 	pmap_flags = 0;
18207 	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18208 #if PMAP_CREATE_FORCE_4K_PAGES
18209 	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18210 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18211 	pmap_flags |= PMAP_CREATE_64BIT;
18212 	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18213 	if (new_pmap == NULL) {
18214 		return KERN_RESOURCE_SHORTAGE;
18215 	}
18216 
18217 	/* allocate new VM object */
18218 	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18219 	new_object = vm_object_allocate(size);
18220 	assert(new_object);
18221 
18222 	/* allocate new VM map entry */
18223 	new_entry = vm_map_copy_entry_create(copy_map);
18224 	assert(new_entry);
18225 
18226 	/* finish initializing new VM map entry */
18227 	new_entry->protection = VM_PROT_DEFAULT;
18228 	new_entry->max_protection = VM_PROT_DEFAULT;
18229 	new_entry->use_pmap = TRUE;
18230 
18231 	/* make new VM map entry point to new VM object */
18232 	new_entry->vme_start = 0;
18233 	new_entry->vme_end = size;
18234 	VME_OBJECT_SET(new_entry, new_object, false, 0);
18235 	VME_OFFSET_SET(new_entry, 0);
18236 
18237 	/* create a new pageable VM map to map "copy_map" */
18238 	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18239 	    VM_MAP_CREATE_PAGEABLE);
18240 	assert(new_map);
18241 	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18242 
18243 	/* map "copy_map" in the new VM map */
18244 	src_start = 0;
18245 	kr = vm_map_copyout_internal(
18246 		new_map,
18247 		&src_start,
18248 		copy_map,
18249 		copy_map->size,
18250 		FALSE, /* consume_on_success */
18251 		VM_PROT_DEFAULT,
18252 		VM_PROT_DEFAULT,
18253 		VM_INHERIT_DEFAULT);
18254 	assert(kr == KERN_SUCCESS);
18255 	src_end = src_start + copy_map->size;
18256 
18257 	/* map "new_object" in the new VM map */
18258 	vm_object_reference(new_object);
18259 	dst_start = 0;
18260 	kr = vm_map_enter(new_map,
18261 	    &dst_start,
18262 	    size,
18263 	    0,               /* mask */
18264 	    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18265 	    new_object,
18266 	    0,               /* offset */
18267 	    FALSE,               /* needs copy */
18268 	    VM_PROT_DEFAULT,
18269 	    VM_PROT_DEFAULT,
18270 	    VM_INHERIT_DEFAULT);
18271 	assert(kr == KERN_SUCCESS);
18272 	dst_end = dst_start + size;
18273 
18274 	/* get a kernel buffer */
18275 	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18276 
18277 	/* physically copy "copy_map" mappings to new VM object */
18278 	for (src_cur = src_start, dst_cur = dst_start;
18279 	    src_cur < src_end;
18280 	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18281 		vm_size_t bytes;
18282 
18283 		bytes = PAGE_SIZE;
18284 		if (src_cur + PAGE_SIZE > src_end) {
18285 			/* partial copy for last page */
18286 			bytes = src_end - src_cur;
18287 			assert(bytes > 0 && bytes < PAGE_SIZE);
18288 			/* rest of dst page should be zero-filled */
18289 		}
18290 		/* get bytes from src mapping */
18291 		kr = copyinmap(new_map, src_cur, kbuf, bytes);
18292 		if (kr != KERN_SUCCESS) {
18293 			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18294 		}
18295 		/* put bytes in dst mapping */
18296 		assert(dst_cur < dst_end);
18297 		assert(dst_cur + bytes <= dst_end);
18298 		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18299 		if (kr != KERN_SUCCESS) {
18300 			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18301 		}
18302 	}
18303 
18304 	/* free kernel buffer */
18305 	kfree_data(kbuf, PAGE_SIZE);
18306 
18307 	/* destroy new map */
18308 	vm_map_destroy(new_map);
18309 	new_map = VM_MAP_NULL;
18310 
18311 	/* dispose of the old map entries in "copy_map" */
18312 	while (vm_map_copy_first_entry(copy_map) !=
18313 	    vm_map_copy_to_entry(copy_map)) {
18314 		entry = vm_map_copy_first_entry(copy_map);
18315 		vm_map_copy_entry_unlink(copy_map, entry);
18316 		if (entry->is_sub_map) {
18317 			vm_map_deallocate(VME_SUBMAP(entry));
18318 		} else {
18319 			vm_object_deallocate(VME_OBJECT(entry));
18320 		}
18321 		vm_map_copy_entry_dispose(entry);
18322 	}
18323 
18324 	/* change "copy_map"'s page_size to match "target_map" */
18325 	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18326 	copy_map->offset = 0;
18327 	copy_map->size = size;
18328 
18329 	/* insert new map entry in "copy_map" */
18330 	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18331 	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18332 
18333 	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18334 	return KERN_SUCCESS;
18335 }
18336 
18337 void
18338 vm_map_copy_adjust_get_target_copy_map(
18339 	vm_map_copy_t   copy_map,
18340 	vm_map_copy_t   *target_copy_map_p);
18341 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)18342 vm_map_copy_adjust_get_target_copy_map(
18343 	vm_map_copy_t   copy_map,
18344 	vm_map_copy_t   *target_copy_map_p)
18345 {
18346 	vm_map_copy_t   target_copy_map;
18347 	vm_map_entry_t  entry, target_entry;
18348 
18349 	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18350 		/* the caller already has a "target_copy_map": use it */
18351 		return;
18352 	}
18353 
18354 	/* the caller wants us to create a new copy of "copy_map" */
18355 	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18356 	target_copy_map = vm_map_copy_allocate(copy_map->type);
18357 	target_copy_map->offset = copy_map->offset;
18358 	target_copy_map->size = copy_map->size;
18359 	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18360 	for (entry = vm_map_copy_first_entry(copy_map);
18361 	    entry != vm_map_copy_to_entry(copy_map);
18362 	    entry = entry->vme_next) {
18363 		target_entry = vm_map_copy_entry_create(target_copy_map);
18364 		vm_map_entry_copy_full(target_entry, entry);
18365 		if (target_entry->is_sub_map) {
18366 			vm_map_reference(VME_SUBMAP(target_entry));
18367 		} else {
18368 			vm_object_reference(VME_OBJECT(target_entry));
18369 		}
18370 		vm_map_copy_entry_link(
18371 			target_copy_map,
18372 			vm_map_copy_last_entry(target_copy_map),
18373 			target_entry);
18374 	}
18375 	entry = VM_MAP_ENTRY_NULL;
18376 	*target_copy_map_p = target_copy_map;
18377 }
18378 
18379 /*
18380  * Callers of this function must call vm_map_copy_require on
18381  * previously created vm_map_copy_t or pass a newly created
18382  * one to ensure that it hasn't been forged.
18383  */
/*
 * vm_map_copy_trim:
 *	Remove the range [trim_start, trim_end) -- expressed as offsets
 *	relative to the start address of "copy_map"'s first entry -- from
 *	"copy_map"'s entry list, clipping entries as needed and releasing
 *	the reference each discarded entry held on its sub-map or VM
 *	object.  "new_page_shift" is installed temporarily so the
 *	clipping is done at the target map's page granularity; the
 *	original page_shift is restored before returning.
 */
static void
vm_map_copy_trim(
	vm_map_copy_t   copy_map,
	uint16_t        new_page_shift,
	vm_map_offset_t trim_start,
	vm_map_offset_t trim_end)
{
	uint16_t        copy_page_shift;
	vm_map_entry_t  entry, next_entry;

	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	assert(copy_map->cpy_hdr.nentries > 0);

	/* convert the caller's relative offsets into entry addresses */
	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;

	/* use the new page_shift to do the clipping */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_map->cpy_hdr.page_shift = new_page_shift;

	/* entries are sorted by address, so we can stop past trim_end */
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = next_entry) {
		next_entry = entry->vme_next;
		if (entry->vme_end <= trim_start) {
			/* entry fully before trim range: skip */
			continue;
		}
		if (entry->vme_start >= trim_end) {
			/* entry fully after trim range: done */
			break;
		}
		/* clip entry if needed */
		vm_map_copy_clip_start(copy_map, entry, trim_start);
		vm_map_copy_clip_end(copy_map, entry, trim_end);
		/* dispose of entry */
		copy_map->size -= entry->vme_end - entry->vme_start;
		vm_map_copy_entry_unlink(copy_map, entry);
		/* drop the reference the entry held on its backing store */
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
		entry = VM_MAP_ENTRY_NULL;
	}

	/* restore copy_map's original page_shift */
	copy_map->cpy_hdr.page_shift = copy_page_shift;
}
18434 
18435 /*
18436  * Make any necessary adjustments to "copy_map" to allow it to be
18437  * mapped into "target_map".
18438  * If no changes were necessary, "target_copy_map" points to the
18439  * untouched "copy_map".
18440  * If changes are necessary, changes will be made to "target_copy_map".
18441  * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18442  * copy the original "copy_map" to it before applying the changes.
18443  * The caller should discard "target_copy_map" if it's not the same as
18444  * the original "copy_map".
18445  */
18446 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18447 kern_return_t
vm_map_copy_adjust_to_target(vm_map_copy_t src_copy_map,vm_map_offset_ut offset_u,vm_map_size_ut size_u,vm_map_t target_map,boolean_t copy,vm_map_copy_t * target_copy_map_p,vm_map_offset_t * overmap_start_p,vm_map_offset_t * overmap_end_p,vm_map_offset_t * trimmed_start_p)18448 vm_map_copy_adjust_to_target(
18449 	vm_map_copy_t           src_copy_map,
18450 	vm_map_offset_ut        offset_u,
18451 	vm_map_size_ut          size_u,
18452 	vm_map_t                target_map,
18453 	boolean_t               copy,
18454 	vm_map_copy_t           *target_copy_map_p,
18455 	vm_map_offset_t         *overmap_start_p,
18456 	vm_map_offset_t         *overmap_end_p,
18457 	vm_map_offset_t         *trimmed_start_p)
18458 {
18459 	vm_map_copy_t           copy_map, target_copy_map;
18460 	vm_map_size_t           target_size;
18461 	vm_map_size_t           src_copy_map_size;
18462 	vm_map_size_t           overmap_start, overmap_end;
18463 	int                     misalignments;
18464 	vm_map_entry_t          entry, target_entry;
18465 	vm_map_offset_t         addr_adjustment;
18466 	vm_map_offset_t         new_start, new_end;
18467 	int                     copy_page_mask, target_page_mask;
18468 	uint16_t                copy_page_shift, target_page_shift;
18469 	vm_map_offset_t         trimmed_end;
18470 	vm_map_size_t           map_size;
18471 	kern_return_t           kr;
18472 
18473 	/*
18474 	 * Sanitize any input parameters that are addr/size/prot/inherit
18475 	 */
18476 	kr = vm_map_copy_addr_size_sanitize(
18477 		target_map,
18478 		offset_u,
18479 		size_u,
18480 		VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18481 		&new_start,
18482 		&new_end,
18483 		&map_size);
18484 	if (__improbable(kr != KERN_SUCCESS)) {
18485 		return vm_sanitize_get_kr(kr);
18486 	}
18487 
18488 	/*
18489 	 * Assert that the vm_map_copy is coming from the right
18490 	 * zone and hasn't been forged
18491 	 */
18492 	vm_map_copy_require(src_copy_map);
18493 	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18494 
18495 	/*
18496 	 * Start working with "src_copy_map" but we'll switch
18497 	 * to "target_copy_map" as soon as we start making adjustments.
18498 	 */
18499 	copy_map = src_copy_map;
18500 	src_copy_map_size = src_copy_map->size;
18501 
18502 	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18503 	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18504 	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18505 	target_page_mask = VM_MAP_PAGE_MASK(target_map);
18506 
18507 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18508 
18509 	target_copy_map = *target_copy_map_p;
18510 	if (target_copy_map != VM_MAP_COPY_NULL) {
18511 		vm_map_copy_require(target_copy_map);
18512 	}
18513 
18514 	if (new_end > copy_map->size) {
18515 		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18516 		return KERN_INVALID_ARGUMENT;
18517 	}
18518 
18519 	/* trim the end */
18520 	trimmed_end = 0;
18521 	new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18522 	if (new_end < copy_map->size) {
18523 		trimmed_end = src_copy_map_size - new_end;
18524 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18525 		/* get "target_copy_map" if needed and adjust it */
18526 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18527 		    &target_copy_map);
18528 		copy_map = target_copy_map;
18529 		vm_map_copy_trim(target_copy_map, target_page_shift,
18530 		    new_end, copy_map->size);
18531 	}
18532 
18533 	/* trim the start */
18534 	new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18535 	if (new_start != 0) {
18536 		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18537 		/* get "target_copy_map" if needed and adjust it */
18538 		vm_map_copy_adjust_get_target_copy_map(copy_map,
18539 		    &target_copy_map);
18540 		copy_map = target_copy_map;
18541 		vm_map_copy_trim(target_copy_map, target_page_shift,
18542 		    0, new_start);
18543 	}
18544 	*trimmed_start_p = new_start;
18545 
18546 	/* target_size starts with what's left after trimming */
18547 	target_size = copy_map->size;
18548 	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18549 	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18550 	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
18551 	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18552 
18553 	/* check for misalignments but don't adjust yet */
18554 	misalignments = 0;
18555 	overmap_start = 0;
18556 	overmap_end = 0;
18557 	if (copy_page_shift < target_page_shift) {
18558 		/*
18559 		 * Remapping from 4K to 16K: check the VM object alignments
18560 		 * throughout the range.
18561 		 * If the start and end of the range are mis-aligned, we can
18562 		 * over-map to re-align, and adjust the "overmap" start/end
18563 		 * and "target_size" of the range accordingly.
18564 		 * If there is any mis-alignment within the range:
18565 		 *     if "copy":
18566 		 *         we can do immediate-copy instead of copy-on-write,
18567 		 *     else:
18568 		 *         no way to remap and share; fail.
18569 		 */
18570 		for (entry = vm_map_copy_first_entry(copy_map);
18571 		    entry != vm_map_copy_to_entry(copy_map);
18572 		    entry = entry->vme_next) {
18573 			vm_object_offset_t object_offset_start, object_offset_end;
18574 
18575 			object_offset_start = VME_OFFSET(entry);
18576 			object_offset_end = object_offset_start;
18577 			object_offset_end += entry->vme_end - entry->vme_start;
18578 			if (object_offset_start & target_page_mask) {
18579 				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18580 					overmap_start++;
18581 				} else {
18582 					misalignments++;
18583 				}
18584 			}
18585 			if (object_offset_end & target_page_mask) {
18586 				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18587 					overmap_end++;
18588 				} else {
18589 					misalignments++;
18590 				}
18591 			}
18592 		}
18593 	}
18594 	entry = VM_MAP_ENTRY_NULL;
18595 
18596 	/* decide how to deal with misalignments */
18597 	assert(overmap_start <= 1);
18598 	assert(overmap_end <= 1);
18599 	if (!overmap_start && !overmap_end && !misalignments) {
18600 		/* copy_map is properly aligned for target_map ... */
18601 		if (*trimmed_start_p) {
18602 			/* ... but we trimmed it, so still need to adjust */
18603 		} else {
18604 			/* ... and we didn't trim anything: we're done */
18605 			if (target_copy_map == VM_MAP_COPY_NULL) {
18606 				target_copy_map = copy_map;
18607 			}
18608 			*target_copy_map_p = target_copy_map;
18609 			*overmap_start_p = 0;
18610 			*overmap_end_p = 0;
18611 			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18612 			return KERN_SUCCESS;
18613 		}
18614 	} else if (misalignments && !copy) {
18615 		/* can't "share" if misaligned */
18616 		DEBUG4K_ADJUST("unsupported sharing\n");
18617 #if MACH_ASSERT
18618 		if (debug4k_panic_on_misaligned_sharing) {
18619 			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18620 		}
18621 #endif /* MACH_ASSERT */
18622 		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18623 		return KERN_NOT_SUPPORTED;
18624 	} else {
18625 		/* can't virtual-copy if misaligned (but can physical-copy) */
18626 		DEBUG4K_ADJUST("mis-aligned copying\n");
18627 	}
18628 
18629 	/* get a "target_copy_map" if needed and switch to it */
18630 	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18631 	copy_map = target_copy_map;
18632 
18633 	if (misalignments && copy) {
18634 		vm_map_size_t target_copy_map_size;
18635 
18636 		/*
18637 		 * Can't do copy-on-write with misaligned mappings.
18638 		 * Replace the mappings with a physical copy of the original
18639 		 * mappings' contents.
18640 		 */
18641 		target_copy_map_size = target_copy_map->size;
18642 		kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18643 		if (kr != KERN_SUCCESS) {
18644 			return kr;
18645 		}
18646 		*target_copy_map_p = target_copy_map;
18647 		*overmap_start_p = 0;
18648 		*overmap_end_p = target_copy_map->size - target_copy_map_size;
18649 		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18650 		return KERN_SUCCESS;
18651 	}
18652 
18653 	/* apply the adjustments */
18654 	misalignments = 0;
18655 	overmap_start = 0;
18656 	overmap_end = 0;
18657 	/* remove copy_map->offset, so that everything starts at offset 0 */
18658 	addr_adjustment = copy_map->offset;
18659 	/* also remove whatever we trimmed from the start */
18660 	addr_adjustment += *trimmed_start_p;
18661 	for (target_entry = vm_map_copy_first_entry(target_copy_map);
18662 	    target_entry != vm_map_copy_to_entry(target_copy_map);
18663 	    target_entry = target_entry->vme_next) {
18664 		vm_object_offset_t object_offset_start, object_offset_end;
18665 
18666 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18667 		object_offset_start = VME_OFFSET(target_entry);
18668 		if (object_offset_start & target_page_mask) {
18669 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18670 			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18671 				/*
18672 				 * start of 1st entry is mis-aligned:
18673 				 * re-adjust by over-mapping.
18674 				 */
18675 				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18676 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18677 				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18678 			} else {
18679 				misalignments++;
18680 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18681 				assert(copy);
18682 			}
18683 		}
18684 
18685 		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18686 			target_size += overmap_start;
18687 		} else {
18688 			target_entry->vme_start += overmap_start;
18689 		}
18690 		target_entry->vme_end += overmap_start;
18691 
18692 		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18693 		if (object_offset_end & target_page_mask) {
18694 			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18695 			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18696 				/*
18697 				 * end of last entry is mis-aligned: re-adjust by over-mapping.
18698 				 */
18699 				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18700 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18701 				target_entry->vme_end += overmap_end;
18702 				target_size += overmap_end;
18703 			} else {
18704 				misalignments++;
18705 				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18706 				assert(copy);
18707 			}
18708 		}
18709 		target_entry->vme_start -= addr_adjustment;
18710 		target_entry->vme_end -= addr_adjustment;
18711 		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18712 	}
18713 
18714 	target_copy_map->size = target_size;
18715 	target_copy_map->offset += overmap_start;
18716 	target_copy_map->offset -= addr_adjustment;
18717 	target_copy_map->cpy_hdr.page_shift = target_page_shift;
18718 
18719 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
18720 //	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
18721 	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
18722 	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
18723 
18724 	*target_copy_map_p = target_copy_map;
18725 	*overmap_start_p = overmap_start;
18726 	*overmap_end_p = overmap_end;
18727 
18728 	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18729 	return KERN_SUCCESS;
18730 }
18731 
/*
 *	Routine:	vm_map_range_physical_size
 *
 *	Compute how many bytes of native (PAGE_SIZE) pages the range
 *	[start, start+size) of "map" would occupy once page-aligned.
 *	For maps whose page size differs from the kernel's (e.g. a 4K
 *	task on a 16K-page kernel), the range is extracted and adjusted
 *	to the kernel's page geometry, so any over-mapping needed at the
 *	edges is reflected in the result.
 *
 *	On success, *phys_size holds the adjusted size.
 *	On any failure, *phys_size is set to 0 before returning the error.
 */
kern_return_t
vm_map_range_physical_size(
	vm_map_t         map,
	vm_map_address_t start,
	mach_vm_size_t   size,
	mach_vm_size_t * phys_size)
{
	kern_return_t   kr;
	vm_map_copy_t   copy_map, target_copy_map;
	vm_map_offset_t adjusted_start, adjusted_end;
	vm_map_size_t   adjusted_size;
	vm_prot_t       cur_prot, max_prot;
	vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
	vm_map_kernel_flags_t vmk_flags;

	/* An empty range occupies no pages. */
	if (size == 0) {
		DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
		*phys_size = 0;
		return KERN_SUCCESS;
	}

	/* Align the requested range to the map's own page size. */
	adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
	adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
	/*
	 * Reject ranges that wrap around the top of the address space,
	 * either in the raw start+size sum or after page rounding.
	 */
	if (__improbable(os_add_overflow(start, size, &end) ||
	    adjusted_end <= adjusted_start)) {
		/* wraparound */
		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
		*phys_size = 0;
		return KERN_INVALID_ARGUMENT;
	}
	if (__improbable(vm_map_range_overflows(map, start, size))) {
		*phys_size = 0;
		return KERN_INVALID_ADDRESS;
	}
	assert(adjusted_end > adjusted_start);
	adjusted_size = adjusted_end - adjusted_start;
	*phys_size = adjusted_size;
	/*
	 * If the map already uses the native page size, the aligned size
	 * is the physical size: no further adjustment needed.
	 */
	if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
		return KERN_SUCCESS;
	}
	/*
	 * start == 0 is special-cased: just re-align to the native page
	 * size without extracting the mappings.
	 * NOTE(review): presumably because address 0 can't be extracted
	 * as a real mapping — confirm against callers.
	 */
	if (start == 0) {
		adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
		adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
		if (__improbable(adjusted_end <= adjusted_start)) {
			/* wraparound */
			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
			*phys_size = 0;
			return KERN_INVALID_ARGUMENT;
		}
		assert(adjusted_end > adjusted_start);
		adjusted_size = adjusted_end - adjusted_start;
		*phys_size = adjusted_size;
		return KERN_SUCCESS;
	}

	/*
	 * Extract the range (without copying) so it can be re-adjusted
	 * to the kernel map's page geometry.
	 */
	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_copy_pageable = TRUE;
	vmk_flags.vmkf_copy_same_map = TRUE;
	assert(adjusted_size != 0);
	cur_prot = VM_PROT_NONE; /* legacy mode */
	max_prot = VM_PROT_NONE; /* legacy mode */
	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
	    FALSE /* copy */,
	    &copy_map,
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
		return kr;
	}
	assert(copy_map != VM_MAP_COPY_NULL);
	target_copy_map = copy_map;
	DEBUG4K_ADJUST("adjusting...\n");
	/*
	 * Re-align the extracted mappings to the kernel map's page size;
	 * the adjusted copy's size is the physical size of the range.
	 */
	kr = vm_map_copy_adjust_to_target(
		copy_map,
		start - adjusted_start, /* offset */
		size, /* size */
		kernel_map,
		FALSE,                          /* copy */
		&target_copy_map,
		&overmap_start,
		&overmap_end,
		&trimmed_start);
	if (kr == KERN_SUCCESS) {
		if (target_copy_map->size != *phys_size) {
			DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
		}
		*phys_size = target_copy_map->size;
	} else {
		DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
		//assert(0);
		*phys_size = 0;
	}
	/* The extracted copy was only needed for its adjusted size. */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return kr;
}
18832 
18833 static inline kern_return_t
vm_map_remap_sanitize(vm_map_t src_map,vm_map_t target_map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_offset_ut memory_address_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,vm_map_address_t * target_addr,vm_map_address_t * mask,vm_map_offset_t * memory_address,vm_map_offset_t * memory_end,vm_map_size_t * memory_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)18834 vm_map_remap_sanitize(
18835 	vm_map_t                src_map,
18836 	vm_map_t                target_map,
18837 	vm_map_address_ut       address_u,
18838 	vm_map_size_ut          size_u,
18839 	vm_map_offset_ut        mask_u,
18840 	vm_map_offset_ut        memory_address_u,
18841 	vm_prot_ut              cur_protection_u,
18842 	vm_prot_ut              max_protection_u,
18843 	vm_inherit_ut           inheritance_u,
18844 	vm_map_kernel_flags_t   vmk_flags,
18845 	vm_map_address_t       *target_addr,
18846 	vm_map_address_t       *mask,
18847 	vm_map_offset_t        *memory_address,
18848 	vm_map_offset_t        *memory_end,
18849 	vm_map_size_t          *memory_size,
18850 	vm_prot_t              *cur_protection,
18851 	vm_prot_t              *max_protection,
18852 	vm_inherit_t           *inheritance)
18853 {
18854 	kern_return_t           result;
18855 	vm_sanitize_flags_t     vm_sanitize_flags;
18856 
18857 	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
18858 	    inheritance);
18859 	if (__improbable(result != KERN_SUCCESS)) {
18860 		return result;
18861 	}
18862 
18863 	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
18864 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
18865 	    cur_protection, max_protection);
18866 	if (__improbable(result != KERN_SUCCESS)) {
18867 		return result;
18868 	}
18869 
18870 	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
18871 	if (__improbable(result != KERN_SUCCESS)) {
18872 		return result;
18873 	}
18874 
18875 	/*
18876 	 * If the user is requesting that we return the address of the
18877 	 * first byte of the data (rather than the base of the page),
18878 	 * then we use different rounding semantics: specifically,
18879 	 * we assume that (memory_address, size) describes a region
18880 	 * all of whose pages we must cover, rather than a base to be truncated
18881 	 * down and a size to be added to that base.  So we figure out
18882 	 * the highest page that the requested region includes and make
18883 	 * sure that the size will cover it.
18884 	 *
18885 	 * The key example we're worried about it is of the form:
18886 	 *
18887 	 *              memory_address = 0x1ff0, size = 0x20
18888 	 *
18889 	 * With the old semantics, we round down the memory_address to 0x1000
18890 	 * and round up the size to 0x1000, resulting in our covering *only*
18891 	 * page 0x1000.  With the new semantics, we'd realize that the region covers
18892 	 * 0x1ff0-0x2010, and compute a size of 0x2000.  Thus, we cover both page
18893 	 * 0x1000 and page 0x2000 in the region we remap.
18894 	 *
18895 	 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
18896 	 */
18897 	vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
18898 	if (!vmk_flags.vmf_return_data_addr) {
18899 		vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
18900 	}
18901 
18902 	result = vm_sanitize_addr_size(memory_address_u, size_u,
18903 	    VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
18904 	    vm_sanitize_flags, memory_address, memory_end,
18905 	    memory_size);
18906 	if (__improbable(result != KERN_SUCCESS)) {
18907 		return result;
18908 	}
18909 
18910 	*target_addr = vm_sanitize_addr(target_map, address_u);
18911 	return KERN_SUCCESS;
18912 }
18913 
18914 /*
18915  *	Routine:	vm_remap
18916  *
18917  *			Map portion of a task's address space.
18918  *			Mapped region must not overlap more than
18919  *			one vm memory object. Protections and
18920  *			inheritance attributes remain the same
 *			as in the original task and are out parameters.
18922  *			Source and Target task can be identical
18923  *			Other attributes are identical as for vm_map()
18924  */
kern_return_t
vm_map_remap(
	vm_map_t                target_map,
	vm_map_address_ut      *address_u,
	vm_map_size_ut          size_u,
	vm_map_offset_ut        mask_u,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_t                src_map,
	vm_map_offset_ut        memory_address_u,
	boolean_t               copy,
	vm_prot_ut             *cur_protection_u, /* IN/OUT */
	vm_prot_ut             *max_protection_u, /* IN/OUT */
	vm_inherit_ut           inheritance_u)
{
	vm_map_address_t        target_addr, mask;
	vm_map_size_t           target_size;
	vm_map_offset_t         memory_address, memory_end;
	vm_map_size_t           memory_size;
	vm_prot_t               cur_protection, max_protection;
	vm_inherit_t            inheritance;
	kern_return_t           result;
	vm_map_entry_t          insp_entry = VM_MAP_ENTRY_NULL;
	vm_map_copy_t           copy_map;
	vm_map_offset_t         offset_in_mapping;
	vm_map_size_t           src_page_mask, target_page_mask;
	vm_map_size_t           initial_size;
	VM_MAP_ZAP_DECLARE(zap_list);

	if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}
	src_page_mask    = VM_MAP_PAGE_MASK(src_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	/* Log remaps that cross a page-size boundary (e.g. 4K <-> 16K). */
	if (src_page_mask != target_page_mask) {
		if (copy) {
			DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
		} else {
			DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
		}
	}

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	result = vm_map_remap_sanitize(src_map,
	    target_map,
	    *address_u,
	    size_u,
	    mask_u,
	    memory_address_u,
	    *cur_protection_u,
	    *max_protection_u,
	    inheritance_u,
	    vmk_flags,
	    &target_addr,
	    &mask,
	    &memory_address,
	    &memory_end,
	    &memory_size,
	    &cur_protection,
	    &max_protection,
	    &inheritance);
	if (__improbable(result != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(result);
	}

	if (vmk_flags.vmf_return_data_addr) {
		/*
		 * This is safe to unwrap now that the quantities
		 * have been validated and rounded up normally.
		 */
		/* byte offset of the data within its page, re-applied on return */
		offset_in_mapping = vm_sanitize_offset_in_page(src_map,
		    memory_address_u);
		initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
	} else {
		/*
		 * IMPORTANT:
		 * This legacy code path is broken: for the range mentioned
		 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
		 * two 4k pages, it yields [ memory_address = 0x1000,
		 * size = 0x1000 ], which covers only the first 4k page.
		 * BUT some code unfortunately depends on this bug, so we
		 * can't fix it without breaking something.
		 * New code should get automatically opted in the new
		 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
		 */
		offset_in_mapping = 0;
		initial_size = memory_size;
	}

	if (vmk_flags.vmf_resilient_media) {
		/* must be copy-on-write to be "media resilient" */
		if (!copy) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
	vmk_flags.vmkf_copy_same_map = (src_map == target_map);

	/* Extract the source mappings (shared or copied) into "copy_map". */
	assert(memory_size != 0);
	result = vm_map_copy_extract(src_map,
	    memory_address,
	    memory_size,
	    copy, &copy_map,
	    &cur_protection, /* IN/OUT */
	    &max_protection, /* IN/OUT */
	    inheritance,
	    vmk_flags);
	if (result != KERN_SUCCESS) {
		return result;
	}
	assert(copy_map != VM_MAP_COPY_NULL);

	/*
	 * Handle the policy for vm map ranges
	 *
	 * If the maps differ, the target_map policy applies like for vm_map()
	 * For same mapping remaps, we preserve the range.
	 */
	if (vmk_flags.vmkf_copy_same_map) {
		vmk_flags.vmkf_range_id = copy_map->orig_range;
	} else {
		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
	}

	target_size = memory_size;
	if (src_page_mask != target_page_mask) {
		/*
		 * Source and target page sizes differ: re-align the
		 * extracted mappings to the target's page geometry, which
		 * may trim the range or over-map at either end.
		 */
		vm_map_copy_t   target_copy_map;
		vm_map_offset_t overmap_start = 0;
		vm_map_offset_t overmap_end   = 0;
		vm_map_offset_t trimmed_start = 0;

		target_copy_map = copy_map; /* can modify "copy_map" itself */
		DEBUG4K_ADJUST("adjusting...\n");
		result = vm_map_copy_adjust_to_target(
			copy_map,
			offset_in_mapping, /* offset */
			initial_size,
			target_map,
			copy,
			&target_copy_map,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (result != KERN_SUCCESS) {
			DEBUG4K_COPY("failed to adjust 0x%x\n", result);
			vm_map_copy_discard(copy_map);
			return result;
		}
		/* Re-derive offset_in_mapping after any start trimming. */
		if (trimmed_start == 0) {
			/* nothing trimmed: no adjustment needed */
		} else if (trimmed_start >= offset_in_mapping) {
			/* trimmed more than offset_in_mapping: nothing left */
			assert(overmap_start == 0);
			assert(overmap_end == 0);
			offset_in_mapping = 0;
		} else {
			/* trimmed some of offset_in_mapping: adjust */
			assert(overmap_start == 0);
			assert(overmap_end == 0);
			offset_in_mapping -= trimmed_start;
		}
		offset_in_mapping += overmap_start;
		target_size = target_copy_map->size;
	}

	/*
	 * Allocate/check a range of free virtual address
	 * space for the target
	 */
	target_size = vm_map_round_page(target_size, target_page_mask);

	if (target_size == 0) {
		vm_map_copy_discard(copy_map);
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock(target_map);

	if (!vmk_flags.vmf_fixed) {
		result = vm_map_locate_space_anywhere(target_map, target_size,
		    mask, vmk_flags, &target_addr, &insp_entry);
	} else {
		/*
		 * vm_map_locate_space_fixed will reject overflowing
		 * target_addr + target_size values
		 */
		result = vm_map_locate_space_fixed(target_map, target_addr,
		    target_size, mask, vmk_flags, &insp_entry, &zap_list);

		if (result == KERN_MEMORY_PRESENT) {
			assert(!vmk_flags.vmkf_already);
			insp_entry = VM_MAP_ENTRY_NULL;
			result = KERN_NO_SPACE;
		}
	}

	if (result == KERN_SUCCESS) {
		/*
		 * Move each entry out of the copy and link it into the
		 * target map after "insp_entry", relocating it from its
		 * zero-based copy offset to the chosen target address.
		 */
		while (vm_map_copy_first_entry(copy_map) !=
		    vm_map_copy_to_entry(copy_map)) {
			vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);

			vm_map_copy_entry_unlink(copy_map, entry);

			if (vmk_flags.vmkf_remap_prot_copy) {
				/*
				 * This vm_map_remap() is for a
				 * vm_protect(VM_PROT_COPY), so the caller
				 * expects to be allowed to add write access
				 * to this new mapping.  This is done by
				 * adding VM_PROT_WRITE to each entry's
				 * max_protection... unless some security
				 * settings disallow it.
				 */
				bool allow_write = false;
				if (entry->vme_permanent) {
					/* immutable mapping... */
					if ((entry->max_protection & VM_PROT_EXECUTE) &&
					    developer_mode_state()) {
						/*
						 * ... but executable and
						 * possibly being debugged,
						 * so let's allow it to become
						 * writable, for breakpoints
						 * and dtrace probes, for
						 * example.
						 */
						allow_write = true;
					} else {
						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
						    proc_selfpid(),
						    (get_bsdtask_info(current_task())
						    ? proc_name_address(get_bsdtask_info(current_task()))
						    : "?"),
						    (uint64_t)memory_address,
						    (uint64_t)memory_size,
						    entry->protection,
						    entry->max_protection,
						    developer_mode_state());
						DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
						    vm_map_entry_t, entry,
						    vm_map_offset_t, entry->vme_start,
						    vm_map_offset_t, entry->vme_end,
						    vm_prot_t, entry->protection,
						    vm_prot_t, entry->max_protection,
						    int, VME_ALIAS(entry));
					}
				} else {
					allow_write = true;
				}

				/*
				 * VM_PROT_COPY: allow this mapping to become
				 * writable, unless it was "permanent".
				 */
				if (allow_write) {
					entry->max_protection |= VM_PROT_WRITE;
				}
			}
			if (vmk_flags.vmf_resilient_codesign) {
				/* no codesigning -> read-only access */
				entry->max_protection = VM_PROT_READ;
				entry->protection = VM_PROT_READ;
				entry->vme_resilient_codesign = TRUE;
			}
			/* relocate the entry to its place in the target map */
			entry->vme_start += target_addr;
			entry->vme_end += target_addr;
			assert(!entry->map_aligned);
			if (vmk_flags.vmf_resilient_media &&
			    !entry->is_sub_map &&
			    (VME_OBJECT(entry) == VM_OBJECT_NULL ||
			    VME_OBJECT(entry)->internal)) {
				entry->vme_resilient_media = TRUE;
			}
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
			assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
			vm_map_store_entry_link(target_map, insp_entry, entry,
			    vmk_flags);
			insp_entry = entry;
		}
	}

	/* resilient-codesign mappings are read-only, report that back */
	if (vmk_flags.vmf_resilient_codesign) {
		cur_protection = VM_PROT_READ;
		max_protection = VM_PROT_READ;
	}

	if (result == KERN_SUCCESS) {
		target_map->size += target_size;
		SAVE_HINT_MAP_WRITE(target_map, insp_entry);
	}
	vm_map_unlock(target_map);

	/* drop anything displaced by a fixed-address placement */
	vm_map_zap_dispose(&zap_list);

	if (result == KERN_SUCCESS && target_map->wiring_required) {
		result = vm_map_wire_nested(target_map, target_addr,
		    target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
		    TRUE, PMAP_NULL, 0, NULL);
	}

	if (result == KERN_SUCCESS) {
#if KASAN
		if (target_map->pmap == kernel_pmap) {
			kasan_notify_address(target_addr, target_size);
		}
#endif
		/*
		 * If requested, return the address of the data pointed to by the
		 * request, rather than the base of the resulting page.
		 */
		if (vmk_flags.vmf_return_data_addr) {
			target_addr += offset_in_mapping;
		}

		/*
		 * Update OUT parameters.
		 */
		*address_u = vm_sanitize_wrap_addr(target_addr);

		*cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
		*max_protection_u = vm_sanitize_wrap_prot(max_protection);
	}

	if (src_page_mask != target_page_mask) {
		DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx  result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
	}
	/* the (possibly emptied) copy object is no longer needed */
	vm_map_copy_discard(copy_map);
	copy_map = VM_MAP_COPY_NULL;

	return result;
}
19260 
19261 /*
19262  *	vm_map_switch:
19263  *
19264  *	Set the address map for the current thread to the specified map
19265  */
19266 
19267 vm_map_t
vm_map_switch(vm_map_t map)19268 vm_map_switch(
19269 	vm_map_t        map)
19270 {
19271 	thread_t        thread = current_thread();
19272 	vm_map_t        oldmap = thread->map;
19273 
19274 
19275 	/*
19276 	 *	Deactivate the current map and activate the requested map
19277 	 */
19278 	mp_disable_preemption();
19279 	PMAP_SWITCH_USER(thread, map, cpu_number());
19280 	mp_enable_preemption();
19281 	return oldmap;
19282 }
19283 
19284 static inline kern_return_t
vm_map_rw_user_sanitize(vm_map_t map,vm_map_address_ut addr_u,vm_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_address_t * addr,vm_map_address_t * end,vm_map_size_t * size)19285 vm_map_rw_user_sanitize(
19286 	vm_map_t                map,
19287 	vm_map_address_ut       addr_u,
19288 	vm_size_ut              size_u,
19289 	vm_sanitize_caller_t    vm_sanitize_caller,
19290 	vm_map_address_t       *addr,
19291 	vm_map_address_t       *end,
19292 	vm_map_size_t          *size)
19293 {
19294 	return vm_sanitize_addr_size(addr_u, size_u,
19295 	           vm_sanitize_caller, map,
19296 	           VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
19297 	           addr, end, size);
19298 }
19299 
19300 /*
19301  *	Routine:	vm_map_write_user
19302  *
19303  *	Description:
19304  *		Copy out data from a kernel space into space in the
19305  *		destination map. The space must already exist in the
19306  *		destination map.
19307  *		NOTE:  This routine should only be called by threads
19308  *		which can block on a page fault. i.e. kernel mode user
19309  *		threads.
19310  *
19311  */
19312 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_ut dst_addr_u,vm_size_ut size_u)19313 vm_map_write_user(
19314 	vm_map_t                map,
19315 	void                   *src_p,
19316 	vm_map_address_ut       dst_addr_u,
19317 	vm_size_ut              size_u)
19318 {
19319 	kern_return_t    kr;
19320 	vm_map_address_t dst_addr, dst_end;
19321 	vm_map_size_t    size;
19322 
19323 	/*
19324 	 * src_p isn't validated: [src_p, src_p + size_u)
19325 	 * is trusted kernel input.
19326 	 *
19327 	 * dst_addr_u and size_u are untrusted and need to be sanitized.
19328 	 */
19329 	kr = vm_map_rw_user_sanitize(map,
19330 	    dst_addr_u,
19331 	    size_u,
19332 	    VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19333 	    &dst_addr,
19334 	    &dst_end,
19335 	    &size);
19336 	if (__improbable(kr != KERN_SUCCESS)) {
19337 		return vm_sanitize_get_kr(kr);
19338 	}
19339 
19340 	if (current_map() == map) {
19341 		if (copyout(src_p, dst_addr, size)) {
19342 			kr = KERN_INVALID_ADDRESS;
19343 		}
19344 	} else {
19345 		vm_map_t        oldmap;
19346 
19347 		/* take on the identity of the target map while doing */
19348 		/* the transfer */
19349 
19350 		vm_map_reference(map);
19351 		oldmap = vm_map_switch(map);
19352 		if (copyout(src_p, dst_addr, size)) {
19353 			kr = KERN_INVALID_ADDRESS;
19354 		}
19355 		vm_map_switch(oldmap);
19356 		vm_map_deallocate(map);
19357 	}
19358 	return kr;
19359 }
19360 
19361 /*
19362  *	Routine:	vm_map_read_user
19363  *
19364  *	Description:
19365  *		Copy in data from a user space source map into the
19366  *		kernel map. The space must already exist in the
19367  *		kernel map.
19368  *		NOTE:  This routine should only be called by threads
19369  *		which can block on a page fault. i.e. kernel mode user
19370  *		threads.
19371  *
19372  */
19373 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_ut src_addr_u,void * dst_p,vm_size_ut size_u)19374 vm_map_read_user(
19375 	vm_map_t                map,
19376 	vm_map_address_ut       src_addr_u,
19377 	void                   *dst_p,
19378 	vm_size_ut              size_u)
19379 {
19380 	kern_return_t    kr;
19381 	vm_map_address_t src_addr, src_end;
19382 	vm_map_size_t    size;
19383 
19384 	/*
19385 	 * dst_p isn't validated: [dst_p, dst_p + size_u)
19386 	 * is trusted kernel input.
19387 	 *
19388 	 * src_addr_u and size_u are untrusted and need to be sanitized.
19389 	 */
19390 	kr = vm_map_rw_user_sanitize(map,
19391 	    src_addr_u,
19392 	    size_u,
19393 	    VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19394 	    &src_addr,
19395 	    &src_end,
19396 	    &size);
19397 	if (__improbable(kr != KERN_SUCCESS)) {
19398 		return vm_sanitize_get_kr(kr);
19399 	}
19400 
19401 	if (current_map() == map) {
19402 		if (copyin(src_addr, dst_p, size)) {
19403 			kr = KERN_INVALID_ADDRESS;
19404 		}
19405 	} else {
19406 		vm_map_t        oldmap;
19407 
19408 		/* take on the identity of the target map while doing */
19409 		/* the transfer */
19410 
19411 		vm_map_reference(map);
19412 		oldmap = vm_map_switch(map);
19413 		if (copyin(src_addr, dst_p, size)) {
19414 			kr = KERN_INVALID_ADDRESS;
19415 		}
19416 		vm_map_switch(oldmap);
19417 		vm_map_deallocate(map);
19418 	}
19419 	return kr;
19420 }
19421 
19422 
19423 /*
19424  *	vm_map_check_protection:
19425  *
19426  *	Assert that the target map allows the specified
19427  *	privilege on the entire address region given.
19428  *	The entire region must be allocated.
19429  */
19430 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)19431 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19432     vm_map_offset_t end, vm_prot_t protection)
19433 {
19434 	vm_map_entry_t entry;
19435 	vm_map_entry_t tmp_entry;
19436 
19437 	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19438 		return FALSE;
19439 	}
19440 
19441 	vm_map_lock(map);
19442 
19443 	if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19444 		vm_map_unlock(map);
19445 		return FALSE;
19446 	}
19447 
19448 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19449 		vm_map_unlock(map);
19450 		return FALSE;
19451 	}
19452 
19453 	entry = tmp_entry;
19454 
19455 	while (start < end) {
19456 		if (entry == vm_map_to_entry(map)) {
19457 			vm_map_unlock(map);
19458 			return FALSE;
19459 		}
19460 
19461 		/*
19462 		 *	No holes allowed!
19463 		 */
19464 
19465 		if (start < entry->vme_start) {
19466 			vm_map_unlock(map);
19467 			return FALSE;
19468 		}
19469 
19470 		/*
19471 		 * Check protection associated with entry.
19472 		 */
19473 
19474 		if ((entry->protection & protection) != protection) {
19475 			vm_map_unlock(map);
19476 			return FALSE;
19477 		}
19478 
19479 		/* go to next entry */
19480 
19481 		start = entry->vme_end;
19482 		entry = entry->vme_next;
19483 	}
19484 	vm_map_unlock(map);
19485 	return TRUE;
19486 }
19487 
/*
 *	vm_map_purgable_control:
 *
 *	Get or set the purgeable state of the VM object backing the map
 *	entry covering "address", or purge all volatile purgeable objects
 *	(VM_PURGABLE_PURGE_ALL).  "state" is input for the SET controls
 *	and output for GET (passed through to vm_object_purgable_control()).
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_t         address,
	vm_purgable_t           control,
	int                     *state)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/* only the four supported control operations are accepted */
	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	/* PURGE_ALL is global: no address lookup needed */
	if (control == VM_PURGABLE_PURGE_ALL) {
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/* for SET operations, reject state values with bits outside the masks */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Take the object lock before dropping the map lock, so the
	 * object can't change state underneath us in between.
	 */
	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/* "entry" must not be used past this point: map lock dropped */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	/*
	 * Record (DEBUG builds only) that the kernel itself made this
	 * object volatile, for later diagnosis.
	 */
	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19592 
/*
 *	vm_map_footprint_query_page_info:
 *
 *	Compute the "footprint" disposition (VM_PAGE_QUERY_PAGE_* bits)
 *	of the page at "curr_s_offset" within "map_entry", combining the
 *	entry/object ownership rules with what the pmap reports.
 *	The result is returned via "disposition_p" (0 means the page does
 *	not contribute to the footprint).
 *
 *	Caller must hold the map lock; "map" must not be using corpse
 *	footprint data (see assertions below).
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	/* use the smaller of the system and map page sizes for page math */
	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
//		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    !object->internal &&
	    object->vo_ledger_tag &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		/* owned external object: wired pages count in footprint */
		assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		/* is this page's index within the object below the wired count? */
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * External object owned by this task: report the first
			 * "#wired" pages as "resident" (to show that they
			 * contribute to the footprint) but not "dirty"
			 * (to avoid double-counting with the fake "owned"
			 * region we'll report at the end of the address space
			 * to account for all (mapped or not) owned memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    object->internal &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		/* owned internal non-volatile (or tagged non-purgeable) object */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		/* page index below "#resident + #compressed"? */
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    object->internal &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		/*
		 * NOTE(review): this branch already requires
		 * object->internal, so the inner re-check below looks
		 * redundant — presumably kept for symmetry; verify.
		 */
		if (object->internal) {
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page;  the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		disposition = 0;
	} else {
		/* default: trust what the pmap reported */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
19777 
19778 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19779 vm_map_page_query_internal(
19780 	vm_map_t        target_map,
19781 	vm_map_offset_t offset,
19782 	int             *disposition,
19783 	int             *ref_count)
19784 {
19785 	kern_return_t                   kr;
19786 	vm_page_info_basic_data_t       info;
19787 	mach_msg_type_number_t          count;
19788 
19789 	count = VM_PAGE_INFO_BASIC_COUNT;
19790 	kr = vm_map_page_info(target_map,
19791 	    offset,
19792 	    VM_PAGE_INFO_BASIC,
19793 	    (vm_page_info_t) &info,
19794 	    &count);
19795 	if (kr == KERN_SUCCESS) {
19796 		*disposition = info.disposition;
19797 		*ref_count = info.ref_count;
19798 	} else {
19799 		*disposition = 0;
19800 		*ref_count = 0;
19801 	}
19802 
19803 	return kr;
19804 }
19805 
19806 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19807 vm_map_page_info(
19808 	vm_map_t                map,
19809 	vm_map_offset_t         offset,
19810 	vm_page_info_flavor_t   flavor,
19811 	vm_page_info_t          info,
19812 	mach_msg_type_number_t  *count)
19813 {
19814 	return vm_map_page_range_info_internal(map,
19815 	           offset, /* start of range */
19816 	           (offset + 1), /* this will get rounded in the call to the page boundary */
19817 	           (int)-1, /* effective_page_shift: unspecified */
19818 	           flavor,
19819 	           info,
19820 	           count);
19821 }
19822 
19823 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_t start_offset,vm_map_offset_t end_offset,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)19824 vm_map_page_range_info_internal(
19825 	vm_map_t                map,
19826 	vm_map_offset_t         start_offset,
19827 	vm_map_offset_t         end_offset,
19828 	int                     effective_page_shift,
19829 	vm_page_info_flavor_t   flavor,
19830 	vm_page_info_t          info,
19831 	mach_msg_type_number_t  *count)
19832 {
19833 	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
19834 	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
19835 	vm_page_t               m = VM_PAGE_NULL;
19836 	kern_return_t           retval = KERN_SUCCESS;
19837 	int                     disposition = 0;
19838 	int                     ref_count = 0;
19839 	int                     depth = 0, info_idx = 0;
19840 	vm_page_info_basic_t    basic_info = 0;
19841 	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
19842 	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
19843 	boolean_t               do_region_footprint;
19844 	ledger_amount_t         ledger_resident, ledger_compressed;
19845 	int                     effective_page_size;
19846 	vm_map_offset_t         effective_page_mask;
19847 
19848 	switch (flavor) {
19849 	case VM_PAGE_INFO_BASIC:
19850 		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
19851 			/*
19852 			 * The "vm_page_info_basic_data" structure was not
19853 			 * properly padded, so allow the size to be off by
19854 			 * one to maintain backwards binary compatibility...
19855 			 */
19856 			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
19857 				return KERN_INVALID_ARGUMENT;
19858 			}
19859 		}
19860 		break;
19861 	default:
19862 		return KERN_INVALID_ARGUMENT;
19863 	}
19864 
19865 	if (effective_page_shift == -1) {
19866 		effective_page_shift = vm_self_region_page_shift_safely(map);
19867 		if (effective_page_shift == -1) {
19868 			return KERN_INVALID_ARGUMENT;
19869 		}
19870 	}
19871 	effective_page_size = (1 << effective_page_shift);
19872 	effective_page_mask = effective_page_size - 1;
19873 
19874 	do_region_footprint = task_self_region_footprint();
19875 	disposition = 0;
19876 	ref_count = 0;
19877 	depth = 0;
19878 	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
19879 	retval = KERN_SUCCESS;
19880 
19881 	if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
19882 		return KERN_INVALID_ADDRESS;
19883 	}
19884 
19885 	offset_in_page = start_offset & effective_page_mask;
19886 	start = vm_map_trunc_page(start_offset, effective_page_mask);
19887 	end = vm_map_round_page(end_offset, effective_page_mask);
19888 
19889 	if (end < start) {
19890 		return KERN_INVALID_ARGUMENT;
19891 	}
19892 
19893 	assert((end - start) <= MAX_PAGE_RANGE_QUERY);
19894 
19895 	vm_map_lock_read(map);
19896 
19897 	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
19898 
19899 	for (curr_s_offset = start; curr_s_offset < end;) {
19900 		/*
19901 		 * New lookup needs reset of these variables.
19902 		 */
19903 		curr_object = object = VM_OBJECT_NULL;
19904 		offset_in_object = 0;
19905 		ref_count = 0;
19906 		depth = 0;
19907 
19908 		if (do_region_footprint &&
19909 		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
19910 			/*
19911 			 * Request for "footprint" info about a page beyond
19912 			 * the end of address space: this must be for
19913 			 * the fake region vm_map_region_recurse_64()
19914 			 * reported to account for non-volatile purgeable
19915 			 * memory owned by this task.
19916 			 */
19917 			disposition = 0;
19918 
19919 			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
19920 			    (unsigned) ledger_compressed) {
19921 				/*
19922 				 * We haven't reported all the "non-volatile
19923 				 * compressed" pages yet, so report this fake
19924 				 * page as "compressed".
19925 				 */
19926 				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19927 			} else {
19928 				/*
19929 				 * We've reported all the non-volatile
19930 				 * compressed page but not all the non-volatile
19931 				 * pages , so report this fake page as
19932 				 * "resident dirty".
19933 				 */
19934 				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19935 				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19936 				disposition |= VM_PAGE_QUERY_PAGE_REF;
19937 			}
19938 			switch (flavor) {
19939 			case VM_PAGE_INFO_BASIC:
19940 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19941 				basic_info->disposition = disposition;
19942 				basic_info->ref_count = 1;
19943 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
19944 				basic_info->offset = 0;
19945 				basic_info->depth = 0;
19946 
19947 				info_idx++;
19948 				break;
19949 			}
19950 			curr_s_offset += effective_page_size;
19951 			continue;
19952 		}
19953 
19954 		/*
19955 		 * First, find the map entry covering "curr_s_offset", going down
19956 		 * submaps if necessary.
19957 		 */
19958 		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
19959 			/* no entry -> no object -> no page */
19960 
19961 			if (curr_s_offset < vm_map_min(map)) {
19962 				/*
19963 				 * Illegal address that falls below map min.
19964 				 */
19965 				curr_e_offset = MIN(end, vm_map_min(map));
19966 			} else if (curr_s_offset >= vm_map_max(map)) {
19967 				/*
19968 				 * Illegal address that falls on/after map max.
19969 				 */
19970 				curr_e_offset = end;
19971 			} else if (map_entry == vm_map_to_entry(map)) {
19972 				/*
19973 				 * Hit a hole.
19974 				 */
19975 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19976 					/*
19977 					 * Empty map.
19978 					 */
19979 					curr_e_offset = MIN(map->max_offset, end);
19980 				} else {
19981 					/*
19982 					 * Hole at start of the map.
19983 					 */
19984 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19985 				}
19986 			} else {
19987 				if (map_entry->vme_next == vm_map_to_entry(map)) {
19988 					/*
19989 					 * Hole at the end of the map.
19990 					 */
19991 					curr_e_offset = MIN(map->max_offset, end);
19992 				} else {
19993 					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
19994 				}
19995 			}
19996 
19997 			assert(curr_e_offset >= curr_s_offset);
19998 
19999 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20000 
20001 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20002 
20003 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20004 
20005 			curr_s_offset = curr_e_offset;
20006 
20007 			info_idx += num_pages;
20008 
20009 			continue;
20010 		}
20011 
20012 		/* compute offset from this map entry's start */
20013 		offset_in_object = curr_s_offset - map_entry->vme_start;
20014 
20015 		/* compute offset into this map entry's object (or submap) */
20016 		offset_in_object += VME_OFFSET(map_entry);
20017 
20018 		if (map_entry->is_sub_map) {
20019 			vm_map_t sub_map = VM_MAP_NULL;
20020 			vm_page_info_t submap_info = 0;
20021 			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20022 
20023 			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20024 
20025 			submap_s_offset = offset_in_object;
20026 			submap_e_offset = submap_s_offset + range_len;
20027 
20028 			sub_map = VME_SUBMAP(map_entry);
20029 
20030 			vm_map_reference(sub_map);
20031 			vm_map_unlock_read(map);
20032 
20033 			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20034 
20035 			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20036 			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20037 
20038 			retval = vm_map_page_range_info_internal(sub_map,
20039 			    submap_s_offset,
20040 			    submap_e_offset,
20041 			    effective_page_shift,
20042 			    VM_PAGE_INFO_BASIC,
20043 			    (vm_page_info_t) submap_info,
20044 			    count);
20045 
20046 			assert(retval == KERN_SUCCESS);
20047 
20048 			vm_map_lock_read(map);
20049 			vm_map_deallocate(sub_map);
20050 
20051 			/* Move the "info" index by the number of pages we inspected.*/
20052 			info_idx += range_len >> effective_page_shift;
20053 
20054 			/* Move our current offset by the size of the range we inspected.*/
20055 			curr_s_offset += range_len;
20056 
20057 			continue;
20058 		}
20059 
20060 		object = VME_OBJECT(map_entry);
20061 
20062 		if (object == VM_OBJECT_NULL) {
20063 			/*
20064 			 * We don't have an object here and, hence,
20065 			 * no pages to inspect. We'll fill up the
20066 			 * info structure appropriately.
20067 			 */
20068 
20069 			curr_e_offset = MIN(map_entry->vme_end, end);
20070 
20071 			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20072 
20073 			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20074 
20075 			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20076 
20077 			curr_s_offset = curr_e_offset;
20078 
20079 			info_idx += num_pages;
20080 
20081 			continue;
20082 		}
20083 
20084 		if (do_region_footprint) {
20085 			disposition = 0;
20086 			if (map->has_corpse_footprint) {
20087 				/*
20088 				 * Query the page info data we saved
20089 				 * while forking the corpse.
20090 				 */
20091 				vm_map_corpse_footprint_query_page_info(
20092 					map,
20093 					curr_s_offset,
20094 					&disposition);
20095 			} else {
20096 				/*
20097 				 * Query the live pmap for footprint info
20098 				 * about this page.
20099 				 */
20100 				vm_map_footprint_query_page_info(
20101 					map,
20102 					map_entry,
20103 					curr_s_offset,
20104 					&disposition);
20105 			}
20106 			switch (flavor) {
20107 			case VM_PAGE_INFO_BASIC:
20108 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20109 				basic_info->disposition = disposition;
20110 				basic_info->ref_count = 1;
20111 				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20112 				basic_info->offset = 0;
20113 				basic_info->depth = 0;
20114 
20115 				info_idx++;
20116 				break;
20117 			}
20118 			curr_s_offset += effective_page_size;
20119 			continue;
20120 		}
20121 
20122 		vm_object_reference(object);
20123 		/*
20124 		 * Shared mode -- so we can allow other readers
20125 		 * to grab the lock too.
20126 		 */
20127 		vm_object_lock_shared(object);
20128 
20129 		curr_e_offset = MIN(map_entry->vme_end, end);
20130 
20131 		vm_map_unlock_read(map);
20132 
20133 		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20134 
20135 		curr_object = object;
20136 
20137 		for (; curr_s_offset < curr_e_offset;) {
20138 			if (object == curr_object) {
20139 				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
20140 			} else {
20141 				ref_count = curr_object->ref_count;
20142 			}
20143 
20144 			curr_offset_in_object = offset_in_object;
20145 
20146 			for (;;) {
20147 				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20148 
20149 				if (m != VM_PAGE_NULL) {
20150 					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20151 					break;
20152 				} else {
20153 					if (curr_object->internal &&
20154 					    curr_object->alive &&
20155 					    !curr_object->terminating &&
20156 					    curr_object->pager_ready) {
20157 						if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20158 						    == VM_EXTERNAL_STATE_EXISTS) {
20159 							/* the pager has that page */
20160 							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20161 							break;
20162 						}
20163 					}
20164 
20165 					/*
20166 					 * Go down the VM object shadow chain until we find the page
20167 					 * we're looking for.
20168 					 */
20169 
20170 					if (curr_object->shadow != VM_OBJECT_NULL) {
20171 						vm_object_t shadow = VM_OBJECT_NULL;
20172 
20173 						curr_offset_in_object += curr_object->vo_shadow_offset;
20174 						shadow = curr_object->shadow;
20175 
20176 						vm_object_lock_shared(shadow);
20177 						vm_object_unlock(curr_object);
20178 
20179 						curr_object = shadow;
20180 						depth++;
20181 						continue;
20182 					} else {
20183 						break;
20184 					}
20185 				}
20186 			}
20187 
20188 			/* The ref_count is not strictly accurate, it measures the number   */
20189 			/* of entities holding a ref on the object, they may not be mapping */
20190 			/* the object or may not be mapping the section holding the         */
20191 			/* target page but its still a ball park number and though an over- */
20192 			/* count, it picks up the copy-on-write cases                       */
20193 
20194 			/* We could also get a picture of page sharing from pmap_attributes */
20195 			/* but this would under count as only faulted-in mappings would     */
20196 			/* show up.							    */
20197 
20198 			if ((curr_object == object) && curr_object->shadow) {
20199 				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20200 			}
20201 
20202 			if (!curr_object->internal) {
20203 				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20204 			}
20205 
20206 			if (m != VM_PAGE_NULL) {
20207 				if (m->vmp_fictitious) {
20208 					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20209 				} else {
20210 					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20211 						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20212 					}
20213 
20214 					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20215 						disposition |= VM_PAGE_QUERY_PAGE_REF;
20216 					}
20217 
20218 					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20219 						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20220 					}
20221 
20222 					/*
20223 					 * XXX TODO4K:
20224 					 * when this routine deals with 4k
20225 					 * pages, check the appropriate CS bit
20226 					 * here.
20227 					 */
20228 					if (m->vmp_cs_validated) {
20229 						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20230 					}
20231 					if (m->vmp_cs_tainted) {
20232 						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20233 					}
20234 					if (m->vmp_cs_nx) {
20235 						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20236 					}
20237 					if (m->vmp_reusable || curr_object->all_reusable) {
20238 						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20239 					}
20240 				}
20241 			}
20242 
20243 			switch (flavor) {
20244 			case VM_PAGE_INFO_BASIC:
20245 				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20246 				basic_info->disposition = disposition;
20247 				basic_info->ref_count = ref_count;
20248 				basic_info->object_id = (vm_object_id_t) (uintptr_t)
20249 				    VM_KERNEL_ADDRHASH(curr_object);
20250 				basic_info->offset =
20251 				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20252 				basic_info->depth = depth;
20253 
20254 				info_idx++;
20255 				break;
20256 			}
20257 
20258 			disposition = 0;
20259 			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20260 
20261 			/*
20262 			 * Move to next offset in the range and in our object.
20263 			 */
20264 			curr_s_offset += effective_page_size;
20265 			offset_in_object += effective_page_size;
20266 			curr_offset_in_object = offset_in_object;
20267 
20268 			if (curr_object != object) {
20269 				vm_object_unlock(curr_object);
20270 
20271 				curr_object = object;
20272 
20273 				vm_object_lock_shared(curr_object);
20274 			} else {
20275 				vm_object_lock_yield_shared(curr_object);
20276 			}
20277 		}
20278 
20279 		vm_object_unlock(curr_object);
20280 		vm_object_deallocate(curr_object);
20281 
20282 		vm_map_lock_read(map);
20283 	}
20284 
20285 	vm_map_unlock_read(map);
20286 	return retval;
20287 }
20288 
20289 /*
20290  *	vm_map_msync
20291  *
20292  *	Synchronises the memory range specified with its backing store
20293  *	image by either flushing or cleaning the contents to the appropriate
20294  *	memory manager engaging in a memory object synchronize dialog with
20295  *	the manager.  The client doesn't return until the manager issues
20296  *	m_o_s_completed message.  MIG Magically converts user task parameter
20297  *	to the task's address map.
20298  *
20299  *	interpretation of sync_flags
20300  *	VM_SYNC_INVALIDATE	- discard pages, only return precious
20301  *				  pages to manager.
20302  *
20303  *	VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20304  *				- discard pages, write dirty or precious
20305  *				  pages back to memory manager.
20306  *
20307  *	VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20308  *				- write dirty or precious pages back to
20309  *				  the memory manager.
20310  *
20311  *	VM_SYNC_CONTIGUOUS	- does everything normally, but if there
20312  *				  is a hole in the region, and we would
20313  *				  have returned KERN_SUCCESS, return
20314  *				  KERN_INVALID_ADDRESS instead.
20315  *
20316  *	NOTE
20317  *	The memory object attributes have not yet been implemented, this
20318  *	function will have to deal with the invalidate attribute
20319  *
20320  *	RETURNS
20321  *	KERN_INVALID_TASK		Bad task parameter
20322  *	KERN_INVALID_ARGUMENT		both sync and async were specified.
20323  *	KERN_SUCCESS			The usual.
20324  *	KERN_INVALID_ADDRESS		There was a hole in the region.
20325  */
20326 
20327 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)20328 vm_map_msync(
20329 	vm_map_t                map,
20330 	vm_map_address_t        address,
20331 	vm_map_size_t           size,
20332 	vm_sync_t               sync_flags)
20333 {
20334 	vm_map_entry_t          entry;
20335 	vm_map_size_t           amount_left;
20336 	vm_object_offset_t      offset;
20337 	vm_object_offset_t      start_offset, end_offset;
20338 	boolean_t               do_sync_req;
20339 	boolean_t               had_hole = FALSE;
20340 	vm_map_offset_t         pmap_offset;
20341 
20342 	if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20343 	    (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20344 		return KERN_INVALID_ARGUMENT;
20345 	}
20346 
20347 	if (__improbable(vm_map_range_overflows(map, address, size))) {
20348 		return KERN_INVALID_ADDRESS;
20349 	}
20350 
20351 	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20352 		DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20353 	}
20354 
20355 	/*
20356 	 * align address and size on page boundaries
20357 	 */
20358 	size = (vm_map_round_page(address + size,
20359 	    VM_MAP_PAGE_MASK(map)) -
20360 	    vm_map_trunc_page(address,
20361 	    VM_MAP_PAGE_MASK(map)));
20362 	address = vm_map_trunc_page(address,
20363 	    VM_MAP_PAGE_MASK(map));
20364 
20365 	if (map == VM_MAP_NULL) {
20366 		return KERN_INVALID_TASK;
20367 	}
20368 
20369 	if (size == 0) {
20370 		return KERN_SUCCESS;
20371 	}
20372 
20373 	amount_left = size;
20374 
20375 	while (amount_left > 0) {
20376 		vm_object_size_t        flush_size;
20377 		vm_object_t             object;
20378 
20379 		vm_map_lock(map);
20380 		if (!vm_map_lookup_entry(map,
20381 		    address,
20382 		    &entry)) {
20383 			vm_map_size_t   skip;
20384 
20385 			/*
20386 			 * hole in the address map.
20387 			 */
20388 			had_hole = TRUE;
20389 
20390 			if (sync_flags & VM_SYNC_KILLPAGES) {
20391 				/*
20392 				 * For VM_SYNC_KILLPAGES, there should be
20393 				 * no holes in the range, since we couldn't
20394 				 * prevent someone else from allocating in
20395 				 * that hole and we wouldn't want to "kill"
20396 				 * their pages.
20397 				 */
20398 				vm_map_unlock(map);
20399 				break;
20400 			}
20401 
20402 			/*
20403 			 * Check for empty map.
20404 			 */
20405 			if (entry == vm_map_to_entry(map) &&
20406 			    entry->vme_next == entry) {
20407 				vm_map_unlock(map);
20408 				break;
20409 			}
20410 			/*
20411 			 * Check that we don't wrap and that
20412 			 * we have at least one real map entry.
20413 			 */
20414 			if ((map->hdr.nentries == 0) ||
20415 			    (entry->vme_next->vme_start < address)) {
20416 				vm_map_unlock(map);
20417 				break;
20418 			}
20419 			/*
20420 			 * Move up to the next entry if needed
20421 			 */
20422 			skip = (entry->vme_next->vme_start - address);
20423 			if (skip >= amount_left) {
20424 				amount_left = 0;
20425 			} else {
20426 				amount_left -= skip;
20427 			}
20428 			address = entry->vme_next->vme_start;
20429 			vm_map_unlock(map);
20430 			continue;
20431 		}
20432 
20433 		offset = address - entry->vme_start;
20434 		pmap_offset = address;
20435 
20436 		/*
20437 		 * do we have more to flush than is contained in this
20438 		 * entry ?
20439 		 */
20440 		if (amount_left + entry->vme_start + offset > entry->vme_end) {
20441 			flush_size = entry->vme_end -
20442 			    (entry->vme_start + offset);
20443 		} else {
20444 			flush_size = amount_left;
20445 		}
20446 		amount_left -= flush_size;
20447 		address += flush_size;
20448 
20449 		if (entry->is_sub_map == TRUE) {
20450 			vm_map_t        local_map;
20451 			vm_map_offset_t local_offset;
20452 
20453 			local_map = VME_SUBMAP(entry);
20454 			local_offset = VME_OFFSET(entry);
20455 			vm_map_reference(local_map);
20456 			vm_map_unlock(map);
20457 			if (vm_map_msync(
20458 				    local_map,
20459 				    local_offset,
20460 				    flush_size,
20461 				    sync_flags) == KERN_INVALID_ADDRESS) {
20462 				had_hole = TRUE;
20463 			}
20464 			vm_map_deallocate(local_map);
20465 			continue;
20466 		}
20467 		object = VME_OBJECT(entry);
20468 
20469 		/*
20470 		 * We can't sync this object if the object has not been
20471 		 * created yet
20472 		 */
20473 		if (object == VM_OBJECT_NULL) {
20474 			vm_map_unlock(map);
20475 			continue;
20476 		}
20477 		offset += VME_OFFSET(entry);
20478 
20479 		vm_object_lock(object);
20480 
20481 		if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20482 			int kill_pages = 0;
20483 
20484 			if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20485 				/*
20486 				 * This is a destructive operation and so we
20487 				 * err on the side of limiting the range of
20488 				 * the operation.
20489 				 */
20490 				start_offset = vm_object_round_page(offset);
20491 				end_offset = vm_object_trunc_page(offset + flush_size);
20492 
20493 				if (end_offset <= start_offset) {
20494 					vm_object_unlock(object);
20495 					vm_map_unlock(map);
20496 					continue;
20497 				}
20498 
20499 				pmap_offset += start_offset - offset;
20500 			} else {
20501 				start_offset = offset;
20502 				end_offset = offset + flush_size;
20503 			}
20504 
20505 			if (sync_flags & VM_SYNC_KILLPAGES) {
20506 				if (((object->ref_count == 1) ||
20507 				    ((object->copy_strategy !=
20508 				    MEMORY_OBJECT_COPY_SYMMETRIC) &&
20509 				    (object->vo_copy == VM_OBJECT_NULL))) &&
20510 				    (object->shadow == VM_OBJECT_NULL)) {
20511 					if (object->ref_count != 1) {
20512 						vm_page_stats_reusable.free_shared++;
20513 					}
20514 					kill_pages = 1;
20515 				} else {
20516 					kill_pages = -1;
20517 				}
20518 			}
20519 			if (kill_pages != -1) {
20520 				vm_object_deactivate_pages(
20521 					object,
20522 					start_offset,
20523 					(vm_object_size_t) (end_offset - start_offset),
20524 					kill_pages,
20525 					FALSE, /* reusable_pages */
20526 					FALSE, /* reusable_no_write */
20527 					map->pmap,
20528 					pmap_offset);
20529 			}
20530 			vm_object_unlock(object);
20531 			vm_map_unlock(map);
20532 			continue;
20533 		}
20534 		/*
20535 		 * We can't sync this object if there isn't a pager.
20536 		 * Don't bother to sync internal objects, since there can't
20537 		 * be any "permanent" storage for these objects anyway.
20538 		 */
20539 		if ((object->pager == MEMORY_OBJECT_NULL) ||
20540 		    (object->internal) || (object->private)) {
20541 			vm_object_unlock(object);
20542 			vm_map_unlock(map);
20543 			continue;
20544 		}
20545 		/*
20546 		 * keep reference on the object until syncing is done
20547 		 */
20548 		vm_object_reference_locked(object);
20549 		vm_object_unlock(object);
20550 
20551 		vm_map_unlock(map);
20552 
20553 		if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20554 			start_offset = vm_object_trunc_page(offset);
20555 			end_offset = vm_object_round_page(offset + flush_size);
20556 		} else {
20557 			start_offset = offset;
20558 			end_offset = offset + flush_size;
20559 		}
20560 
20561 		do_sync_req = vm_object_sync(object,
20562 		    start_offset,
20563 		    (end_offset - start_offset),
20564 		    sync_flags & VM_SYNC_INVALIDATE,
20565 		    ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20566 		    (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20567 		    sync_flags & VM_SYNC_SYNCHRONOUS);
20568 
20569 		if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20570 			/*
20571 			 * clear out the clustering and read-ahead hints
20572 			 */
20573 			vm_object_lock(object);
20574 
20575 			object->pages_created = 0;
20576 			object->pages_used = 0;
20577 			object->sequential = 0;
20578 			object->last_alloc = 0;
20579 
20580 			vm_object_unlock(object);
20581 		}
20582 		vm_object_deallocate(object);
20583 	} /* while */
20584 
20585 	/* for proper msync() behaviour */
20586 	if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20587 		return KERN_INVALID_ADDRESS;
20588 	}
20589 
20590 	return KERN_SUCCESS;
20591 }/* vm_msync */
20592 
20593 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)20594 vm_named_entry_associate_vm_object(
20595 	vm_named_entry_t        named_entry,
20596 	vm_object_t             object,
20597 	vm_object_offset_t      offset,
20598 	vm_object_size_t        size,
20599 	vm_prot_t               prot)
20600 {
20601 	vm_map_copy_t copy;
20602 	vm_map_entry_t copy_entry;
20603 
20604 	assert(!named_entry->is_sub_map);
20605 	assert(!named_entry->is_copy);
20606 	assert(!named_entry->is_object);
20607 	assert(!named_entry->internal);
20608 	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20609 
20610 	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20611 	copy->offset = offset;
20612 	copy->size = size;
20613 	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20614 
20615 	copy_entry = vm_map_copy_entry_create(copy);
20616 	copy_entry->protection = prot;
20617 	copy_entry->max_protection = prot;
20618 	copy_entry->use_pmap = TRUE;
20619 	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20620 	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20621 	VME_OBJECT_SET(copy_entry, object, false, 0);
20622 	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20623 	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20624 
20625 	named_entry->backing.copy = copy;
20626 	named_entry->is_object = TRUE;
20627 	if (object->internal) {
20628 		named_entry->internal = TRUE;
20629 	}
20630 
20631 	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20632 	    named_entry, copy, object, offset, size, prot);
20633 }
20634 
20635 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)20636 vm_named_entry_to_vm_object(
20637 	vm_named_entry_t named_entry)
20638 {
20639 	vm_map_copy_t   copy;
20640 	vm_map_entry_t  copy_entry;
20641 	vm_object_t     object;
20642 
20643 	assert(!named_entry->is_sub_map);
20644 	assert(!named_entry->is_copy);
20645 	assert(named_entry->is_object);
20646 	copy = named_entry->backing.copy;
20647 	assert(copy != VM_MAP_COPY_NULL);
20648 	/*
20649 	 * Assert that the vm_map_copy is coming from the right
20650 	 * zone and hasn't been forged
20651 	 */
20652 	vm_map_copy_require(copy);
20653 	assert(copy->cpy_hdr.nentries == 1);
20654 	copy_entry = vm_map_copy_first_entry(copy);
20655 	object = VME_OBJECT(copy_entry);
20656 
20657 	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20658 
20659 	return object;
20660 }
20661 
20662 /*
20663  *	Routine:	convert_port_entry_to_map
20664  *	Purpose:
20665  *		Convert from a port specifying an entry or a task
20666  *		to a map. Doesn't consume the port ref; produces a map ref,
20667  *		which may be null.  Unlike convert_port_to_map, the
20668  *		port may be task or a named entry backed.
20669  *	Conditions:
20670  *		Nothing locked.
20671  */
20672 
20673 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20674 convert_port_entry_to_map(
20675 	ipc_port_t      port)
20676 {
20677 	vm_map_t map = VM_MAP_NULL;
20678 	vm_named_entry_t named_entry;
20679 
20680 	if (!IP_VALID(port)) {
20681 		return VM_MAP_NULL;
20682 	}
20683 
20684 	if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20685 		return convert_port_to_map(port);
20686 	}
20687 
20688 	named_entry = mach_memory_entry_from_port(port);
20689 
20690 	if ((named_entry->is_sub_map) &&
20691 	    (named_entry->protection & VM_PROT_WRITE)) {
20692 		map = named_entry->backing.map;
20693 		if (map->pmap != PMAP_NULL) {
20694 			if (map->pmap == kernel_pmap) {
20695 				panic("userspace has access "
20696 				    "to a kernel map %p", map);
20697 			}
20698 			pmap_require(map->pmap);
20699 		}
20700 		vm_map_reference(map);
20701 	}
20702 
20703 	return map;
20704 }
20705 
20706 /*
20707  * Export routines to other components for the things we access locally through
20708  * macros.
20709  */
20710 #undef current_map
vm_map_t
current_map(void)
{
	/* Out-of-line version of the current_map_fast() macro (see #undef above). */
	return current_map_fast();
}
20716 
20717 /*
20718  *	vm_map_reference:
20719  *
20720  *	Takes a reference on the specified map.
20721  */
20722 void
vm_map_reference(vm_map_t map)20723 vm_map_reference(
20724 	vm_map_t        map)
20725 {
20726 	if (__probable(map != VM_MAP_NULL)) {
20727 		vm_map_require(map);
20728 		os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20729 	}
20730 }
20731 
20732 /*
20733  *	vm_map_deallocate:
20734  *
20735  *	Removes a reference from the specified map,
20736  *	destroying it if no references remain.
20737  *	The map should not be locked.
20738  */
20739 void
vm_map_deallocate(vm_map_t map)20740 vm_map_deallocate(
20741 	vm_map_t        map)
20742 {
20743 	if (__probable(map != VM_MAP_NULL)) {
20744 		vm_map_require(map);
20745 		if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20746 			vm_map_destroy(map);
20747 		}
20748 	}
20749 }
20750 
void
vm_map_inspect_deallocate(
	vm_map_inspect_t      map)
{
	/* An inspect right is a vm_map_t underneath; drop the same reference. */
	vm_map_deallocate((vm_map_t)map);
}
20757 
void
vm_map_read_deallocate(
	vm_map_read_t      map)
{
	/* A read right is a vm_map_t underneath; drop the same reference. */
	vm_map_deallocate((vm_map_t)map);
}
20764 
20765 
20766 void
vm_map_disable_NX(vm_map_t map)20767 vm_map_disable_NX(vm_map_t map)
20768 {
20769 	if (map == NULL) {
20770 		return;
20771 	}
20772 	if (map->pmap == NULL) {
20773 		return;
20774 	}
20775 
20776 	pmap_disable_NX(map->pmap);
20777 }
20778 
20779 void
vm_map_disallow_data_exec(vm_map_t map)20780 vm_map_disallow_data_exec(vm_map_t map)
20781 {
20782 	if (map == NULL) {
20783 		return;
20784 	}
20785 
20786 	map->map_disallow_data_exec = TRUE;
20787 }
20788 
20789 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20790  * more descriptive.
20791  */
void
vm_map_set_32bit(vm_map_t map)
{
	/* Cap the map's addressable range at the 32-bit user maximum. */
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20801 
20802 
void
vm_map_set_64bit(vm_map_t map)
{
	/* Cap the map's addressable range at the 64-bit user maximum. */
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
20812 
20813 /*
20814  * Expand the maximum size of an existing map to 64GB.
20815  */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* ~0 requests the largest limit; vm_map_set_max_addr() clamps it
	 * to what pmap_max_offset() supports. */
	vm_map_set_max_addr(map, ~0, false);
#else /* arm64 */
	(void) map;
#endif
}
20825 
20826 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
20827 /*
20828  * Expand the maximum size of an existing map to the maximum supported.
20829  */
void
vm_map_set_extra_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/* Like vm_map_set_jumbo(), but requests the extended ("extra jumbo")
	 * VA limit; clamped by vm_map_set_max_addr(). */
	vm_map_set_max_addr(map, ~0, true);
#else /* arm64 */
	(void) map;
#endif
}
20839 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
20840 
20841 /*
20842  * This map has a JIT entitlement
20843  */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	/* Delegate the JIT entitlement to the pmap layer; no-op elsewhere. */
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;
#endif
}
20853 
20854 /*
20855  * Get status of this maps TPRO flag
20856  */
boolean_t
vm_map_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state lives in the pmap. */
	return pmap_get_tpro(map->pmap);
#else /* arm64e */
	(void) map;
	/* TPRO is only available on arm64e. */
	return FALSE;
#endif
}
20867 
20868 /*
20869  * This map has TPRO enabled
20870  */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state lives in the pmap; no-op on other architectures. */
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	(void) map;
#endif
}
20880 
20881 /*
20882  * Does this map have TPRO enforcement enabled
20883  */
boolean_t
vm_map_tpro_enforcement(vm_map_t map)
{
	/* Read-only accessor for the map's TPRO-enforcement flag. */
	return map->tpro_enforcement;
}
20889 
20890 /*
20891  * Set TPRO enforcement for this map
20892  */
void
vm_map_set_tpro_enforcement(vm_map_t map)
{
	/* Enforcement only makes sense if the map has TPRO at all;
	 * otherwise this is a no-op. */
	if (vm_map_tpro(map)) {
		vm_map_lock(map);
		map->tpro_enforcement = TRUE;
		vm_map_unlock(map);
	}
}
20902 
20903 /*
20904  * Enable TPRO on the requested region
20905  *
20906  * Note:
20907  *     This routine is primarily intended to be called during/soon after map
20908  *     creation before the associated task has been released to run. It is only
20909  *     currently safe when we have no resident pages.
20910  */
boolean_t
vm_map_set_tpro_range(
	__unused vm_map_t map,
	__unused vm_map_address_t start,
	__unused vm_map_address_t end)
{
	/* Stub: always reports success on this configuration. */
	return TRUE;
}
20919 
20920 /*
20921  * Expand the maximum size of an existing map.
20922  */
void
vm_map_set_max_addr(
	vm_map_t map,
	vm_map_offset_t new_max_offset,
	__unused bool extra_jumbo)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;
	unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	if (extra_jumbo) {
		option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	/* Ask the pmap layer how far this map may legally extend. */
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	/* Clamp the request to what the pmap supports. */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	/*
	 * Disable the following chunk of code that extends the "holes" list
	 * to accomodate a larger VM map.
	 * In `vm_map_create_options()`, we now set the end of the "holes" list to
	 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
	 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
	 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
	 * The "holes" list does not need to be adjusted.
	 */
#if 0
	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}
#endif

	vm_map_unlock(map);
#else
	(void)map;
	(void)new_max_offset;
#endif
}
20998 
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
	/* Highest user VA for a map of the given width on this platform. */
#if defined(__arm64__)
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
21008 
void
vm_map_get_max_aslr_slide_section(
	vm_map_t                map __unused,
	int64_t                 *max_sections,
	int64_t                 *section_size)
{
	/* Report how many independently slid sections ASLR may use and
	 * the size of each section. */
#if defined(__arm64__)
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	*max_sections = 1;
	*section_size = 0;
#endif
}
21023 
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
	/* Maximum ASLR slide, expressed in map-sized pages. */
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
21037 
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
	/* Maximum dynamic-loader ASLR slide, expressed in map-sized pages. */
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
21050 
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	/* A map whose limit exceeds the 32-bit user maximum is 64-bit. */
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
21057 
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t        map,
	vm_map_offset_t pagezero_size)
{
	/* TRUE when the map reserves at least "pagezero_size" bytes of
	 * inaccessible address space starting at 0. */
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
21075 
21076 /*
21077  * Raise a VM map's maximun offset.
21078  */
21079 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)21080 vm_map_raise_max_offset(
21081 	vm_map_t        map,
21082 	vm_map_offset_t new_max_offset)
21083 {
21084 	kern_return_t   ret;
21085 
21086 	vm_map_lock(map);
21087 	ret = KERN_INVALID_ADDRESS;
21088 
21089 	if (new_max_offset >= map->max_offset) {
21090 		if (!vm_map_is_64bit(map)) {
21091 			if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21092 				map->max_offset = new_max_offset;
21093 				ret = KERN_SUCCESS;
21094 			}
21095 		} else {
21096 			if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21097 				map->max_offset = new_max_offset;
21098 				ret = KERN_SUCCESS;
21099 			}
21100 		}
21101 	}
21102 
21103 	vm_map_unlock(map);
21104 	return ret;
21105 }
21106 
21107 
21108 /*
21109  * Raise a VM map's minimum offset.
21110  * To strictly enforce "page zero" reservation.
21111  */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t        map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t  first_entry;

	/* Round up so the reserved region ends on a map page boundary. */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimun offset.  It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* Keep the leading hole consistent with the new minimum. */
	if (map->holelistenabled) {
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
21162 
21163 /*
21164  * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21165  * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21166  * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21167  * have to reach over to the BSD data structures.
21168  */
21169 
21170 uint64_t vm_map_set_size_limit_count = 0;
21171 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)21172 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21173 {
21174 	kern_return_t kr;
21175 
21176 	vm_map_lock(map);
21177 	if (new_size_limit < map->size) {
21178 		/* new limit should not be lower than its current size */
21179 		DTRACE_VM2(vm_map_set_size_limit_fail,
21180 		    vm_map_size_t, map->size,
21181 		    uint64_t, new_size_limit);
21182 		kr = KERN_FAILURE;
21183 	} else if (new_size_limit == map->size_limit) {
21184 		/* no change */
21185 		kr = KERN_SUCCESS;
21186 	} else {
21187 		/* set new limit */
21188 		DTRACE_VM2(vm_map_set_size_limit,
21189 		    vm_map_size_t, map->size,
21190 		    uint64_t, new_size_limit);
21191 		if (new_size_limit != RLIM_INFINITY) {
21192 			vm_map_set_size_limit_count++;
21193 		}
21194 		map->size_limit = new_size_limit;
21195 		kr = KERN_SUCCESS;
21196 	}
21197 	vm_map_unlock(map);
21198 	return kr;
21199 }
21200 
21201 uint64_t vm_map_set_data_limit_count = 0;
21202 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)21203 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21204 {
21205 	kern_return_t kr;
21206 
21207 	vm_map_lock(map);
21208 	if (new_data_limit < map->size) {
21209 		/* new limit should not be lower than its current size */
21210 		DTRACE_VM2(vm_map_set_data_limit_fail,
21211 		    vm_map_size_t, map->size,
21212 		    uint64_t, new_data_limit);
21213 		kr = KERN_FAILURE;
21214 	} else if (new_data_limit == map->data_limit) {
21215 		/* no change */
21216 		kr = KERN_SUCCESS;
21217 	} else {
21218 		/* set new limit */
21219 		DTRACE_VM2(vm_map_set_data_limit,
21220 		    vm_map_size_t, map->size,
21221 		    uint64_t, new_data_limit);
21222 		if (new_data_limit != RLIM_INFINITY) {
21223 			vm_map_set_data_limit_count++;
21224 		}
21225 		map->data_limit = new_data_limit;
21226 		kr = KERN_SUCCESS;
21227 	}
21228 	vm_map_unlock(map);
21229 	return kr;
21230 }
21231 
void
vm_map_set_user_wire_limit(vm_map_t     map,
    vm_size_t    limit)
{
	/* Update the per-map user wire limit under the exclusive map lock. */
	vm_map_lock(map);
	map->user_wire_limit = limit;
	vm_map_unlock(map);
}
21240 
21241 
void
vm_map_switch_protect(vm_map_t     map,
    boolean_t    val)
{
	/* Set/clear the map's switch-protect flag under the exclusive map lock. */
	vm_map_lock(map);
	map->switch_protect = val;
	vm_map_unlock(map);
}
21250 
21251 extern int cs_process_enforcement_enable;
21252 boolean_t
vm_map_cs_enforcement(vm_map_t map)21253 vm_map_cs_enforcement(
21254 	vm_map_t map)
21255 {
21256 	if (cs_process_enforcement_enable) {
21257 		return TRUE;
21258 	}
21259 	return map->cs_enforcement;
21260 }
21261 
kern_return_t
vm_map_cs_wx_enable(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	/* Ask the code signing monitor to allow invalid code in this pmap. */
	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
		/* "not supported" means there is no monitor policy to relax */
		return KERN_SUCCESS;
	}
	return ret;
#else
	/* The VM manages WX memory entirely on its own */
	return KERN_SUCCESS;
#endif
}
21277 
kern_return_t
vm_map_csm_allow_jit(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	/* Defer the JIT-region decision to the code signing monitor. */
	return csm_allow_jit_region(vm_map_pmap(map));
#else
	/* No code signing monitor to enforce JIT policy */
	return KERN_SUCCESS;
#endif
}
21289 
void
vm_map_cs_debugged_set(
	vm_map_t map,
	boolean_t val)
{
	/* Record the "being debugged" code-signing state under the map lock. */
	vm_map_lock(map);
	map->cs_debugged = val;
	vm_map_unlock(map);
}
21299 
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_enforcement = val;
	/* keep the pmap's view of CS enforcement in sync with the map's */
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
21310 
21311 /*
21312  * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21313  * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21314  * bump both counters.
21315  */
void
vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
{
	pmap_t pmap = vm_map_pmap(map);

	/* phys_footprint is a composite (iokit + physmem), so credit both */
	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
}
21324 
void
vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
{
	pmap_t pmap = vm_map_pmap(map);

	/* mirror of vm_map_iokit_mapped_region: debit both composite ledgers */
	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
}
21333 
21334 /* Add (generate) code signature for memory range */
#if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * vm_map_sign:
 * Mark every resident page of [start, end) as code-signing validated and
 * disconnect it from the pmap so future modification attempts are noticed.
 *
 * The whole range must fall within a single non-submap entry that already
 * has a VM object; otherwise an error is returned.
 *
 * Returns:
 *   KERN_SUCCESS           all pages in the range were marked validated
 *   KERN_INVALID_ARGUMENT  NULL map, range not covered by one entry,
 *                          or no backing object yet
 *   KERN_INVALID_ADDRESS   range overflows, or lookup failed / submap
 *   KERN_FAILURE           a page was absent or in a special state
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object.  Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/* hold the object lock across the page walk; the map lock can go */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
#endif
21433 
/*
 * vm_map_partial_reap:
 * Delete every map entry whose backing object is anonymous (internal)
 * memory referenced only by this mapping (ref_count == 1), accumulating
 * the resident and compressed page counts that become reclaimable.
 *
 * NOTE(review): *reclaimed_resident and *reclaimed_compressed are only
 * incremented here — assumes the caller zero-initializes them; verify.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t  entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  next_entry;
	kern_return_t   kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor first: vm_map_delete unlinks "entry" */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* free the zapped entries outside the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
21468 
21469 
21470 #if DEVELOPMENT || DEBUG
21471 
/*
 * vm_map_disconnect_page_mappings:
 * Remove all pmap mappings for every (non-contiguous) object-backed entry
 * in the map, optionally unnesting shared submaps first so only this
 * task's pmap is affected.
 *
 * Returns the map's phys_mem ledger balance expressed in map pages,
 * sampled before the mappings are removed.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by this
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* sample resident memory before tearing down the mappings */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip entries with no object and physically-contiguous objects */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			/* after unnesting above, submap entries must not share a pmap */
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
21521 
/*
 * vm_map_inject_error:
 * Test hook (DEVELOPMENT || DEBUG): inject a decompression error for the
 * compressor-pager slot backing "vaddr".
 *
 * Returns KERN_MEMORY_ERROR if no object backs the address,
 * KERN_MEMORY_PRESENT if the page is resident (no compressed copy), or
 * the result of vm_compressor_pager_inject_error() otherwise.
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/*
	 * NOTE(review): the lookup may recurse into submaps and returns the
	 * actual map in "real_map"; assumes real_map is always initialized by
	 * vm_map_lookup_and_lock_object even on failure — verify.
	 */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
21559 
/* Iterate over map entries with the map lock already held. Calls the first
 * argument block once with the number of entries, then the second block for
 * every entry.
 * returns: KERN_SUCCESS if iteration completed ok,
 *      error code if a callback returned an error (iteration stops early)
 *      KERN_FAILURE if there was a race of adding/removing entries during the
 *      iteration and the number of entries iterated differs from the count
 *      reported to the first callback
 */
static kern_return_t
vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
    kern_return_t (^entry_handler)(void* entry))
{
	vm_map_lock_assert_held(map);
	int nentries = map->hdr.nentries;
	kern_return_t error = count_handler(nentries);
	if (error) {
		return error;
	}

	/* iterate until we loop back to the map, see get_vmmap_entries() */
	vm_map_entry_t entry = vm_map_first_entry(map);
	int count = 0;
	while (entry != vm_map_to_entry(map)) {
		error = entry_handler(entry);
		if (error != KERN_SUCCESS) {
			return error;
		}
		entry = entry->vme_next;
		++count;
		if (count > nentries) {
			/* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
			return KERN_FAILURE;
		}
	}
	if (count < nentries) {
		/* fewer entries than announced: same inconsistency, other direction */
		return KERN_FAILURE;
	}
	return KERN_SUCCESS;
}
21597 
21598 kern_return_t
21599 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21600     kern_return_t (^entry_handler)(void* entry))
21601 {
21602 	vm_map_lock_read(map);
21603 	kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
21604 	vm_map_unlock_read(map);
21605 	return error;
21606 }
21607 
21608 /*
21609  * Dump info about the entry into the given buffer.
21610  * return true on success, false if there was not enough space in the give buffer
21611  * argument size in: bytes free in the given buffer, out: bytes written
21612  */
21613 kern_return_t
vm_map_dump_entry_and_compressor_pager(void * pentry,char * buf,size_t * size)21614 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
21615 {
21616 	size_t insize = *size;
21617 	kern_return_t kr;
21618 	size_t offset = 0;
21619 
21620 	*size = 0;
21621 	if (sizeof(struct vm_map_entry_info) > insize) {
21622 		return KERN_NO_SPACE;
21623 	}
21624 
21625 	vm_map_entry_t entry = (vm_map_entry_t)pentry;
21626 	struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
21627 	out_entry->vmei_start = entry->vme_start;
21628 	out_entry->vmei_end = entry->vme_end;
21629 	out_entry->vmei_alias = VME_ALIAS(entry);
21630 	out_entry->vmei_offset = VME_OFFSET(entry);
21631 	out_entry->vmei_is_sub_map = entry->is_sub_map;
21632 	out_entry->vmei_protection = entry->protection;
21633 	offset += sizeof(struct vm_map_entry_info);
21634 
21635 	out_entry->vmei_slot_mapping_count = 0;
21636 	out_entry->vmei_is_compressor_pager = false;
21637 	*size = offset;
21638 	if (out_entry->vmei_is_sub_map) {
21639 		return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
21640 	}
21641 	/* have a vm_object? */
21642 	vm_object_t object = VME_OBJECT(entry);
21643 	if (object == VM_OBJECT_NULL || !object->internal) {
21644 		return KERN_SUCCESS;
21645 	}
21646 	/* objects has a pager? */
21647 	memory_object_t pager = object->pager;
21648 	if (pager != MEMORY_OBJECT_NULL) {
21649 		return KERN_SUCCESS;
21650 	}
21651 	bool is_compressor = false;
21652 	unsigned int slot_mapping_count = 0;
21653 	size_t pager_info_size = insize - offset;
21654 	kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
21655 	if (kr != KERN_SUCCESS) {
21656 		/* didn't have enough space for everything we want to write, caller needs to retry */
21657 		return kr;
21658 	}
21659 	offset += pager_info_size;
21660 	/* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
21661 	 * is just for sanity sake */
21662 	out_entry->vmei_is_compressor_pager = is_compressor;
21663 	out_entry->vmei_slot_mapping_count = slot_mapping_count;
21664 	*size = offset;
21665 	return KERN_SUCCESS;
21666 }
21667 
21668 
21669 #endif
21670 
21671 
21672 #if CONFIG_FREEZE
21673 
21674 
21675 extern struct freezer_context freezer_context_global;
21676 AbsoluteTime c_freezer_last_yield_ts = 0;
21677 
21678 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21679 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21680 
/*
 * vm_map_freeze:
 * Compress ("freeze") the eligible anonymous pages of a task's map into
 * the compressor, up to a budget of "dirty_budget" pages.
 *
 * With a swap-backed freezer, a first "evaluation" pass walks the map to
 * tally private vs. shared dirty pages and rejects the freeze when the
 * task maps too much shared memory (absolute cap, or private:shared ratio
 * below the configured minimum); if the evaluation passes and eval_only is
 * FALSE, the task's purgeable memory is purged and a second pass does the
 * actual pageout.  Without swap, the evaluation pass is skipped entirely.
 *
 * Outputs: *wired_count accumulates wired pages of frozen objects;
 * *shared_count is the shared dirty footprint in MB; *freezer_error_code
 * is set on failure.  *purgeable_count, *clean_count and *dirty_count are
 * zeroed but not otherwise updated here.
 * Returns KERN_SUCCESS, KERN_NO_SPACE (compressor/swap exhausted),
 * KERN_INVALID_ARGUMENT (eval_only without swap) or KERN_FAILURE
 * (shared-memory policy rejection).
 */
kern_return_t
vm_map_freeze(
	task_t       task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int          *freezer_error_code,
	boolean_t    eval_only)
{
	vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t   kr = KERN_SUCCESS;
	boolean_t       evaluation_phase = TRUE;
	vm_object_t     cur_shared_object = NULL;
	int             cur_shared_obj_ref_cnt = 0;
	unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* only anonymous, non-contiguous objects are freezable */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase ||
				    src_object->purgable != VM_PURGABLE_VOLATILE ||
				    src_object->ref_count != 1) {
					continue;
				}
				/* re-check under the object lock before purging */
				vm_object_lock(src_object);
				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
				    src_object->ref_count == 1) {
					purgeable_q_t old_queue;

					/* object should be on a purgeable queue */
					assert(src_object->objq.next != NULL &&
					    src_object->objq.prev != NULL);
					/* move object from its volatile queue to the nonvolatile queue */
					old_queue = vm_purgeable_object_remove(src_object);
					assert(old_queue);
					if (src_object->purgeable_when_ripe) {
						/* remove a token from that volatile queue */
						vm_page_lock_queues();
						vm_purgeable_token_delete_first(old_queue);
						vm_page_unlock_queues();
					}
					/* purge the object */
					vm_object_purge(src_object, 0);
				}
				vm_object_unlock(src_object);
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			if (evaluation_phase == TRUE) {
				/* first pass only measures; no pageout yet */
				continue;
			}
		}

		/* freeze pass: push this object's pages to the compressor */
		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* report the shared dirty footprint in MB */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* evaluation passed: purge, then run the freeze pass for real */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
21919 
21920 #endif
21921 
21922 /*
21923  * vm_map_entry_should_cow_for_true_share:
21924  *
21925  * Determines if the map entry should be clipped and setup for copy-on-write
21926  * to avoid applying "true_share" to a large VM object when only a subset is
21927  * targeted.
21928  *
21929  * For now, we target only the map entries created for the Objective C
21930  * Garbage Collector, which initially have the following properties:
21931  *	- alias == VM_MEMORY_MALLOC
21932  *      - wired_count == 0
21933  *      - !needs_copy
21934  * and a VM object with:
21935  *      - internal
21936  *      - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21937  *      - !true_share
21938  *      - vo_size == ANON_CHUNK_SIZE
21939  *
21940  * Only non-kernel map entries.
21941  */
21942 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)21943 vm_map_entry_should_cow_for_true_share(
21944 	vm_map_entry_t  entry)
21945 {
21946 	vm_object_t     object;
21947 
21948 	if (entry->is_sub_map) {
21949 		/* entry does not point at a VM object */
21950 		return FALSE;
21951 	}
21952 
21953 	if (entry->needs_copy) {
21954 		/* already set for copy_on_write: done! */
21955 		return FALSE;
21956 	}
21957 
21958 	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
21959 	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
21960 		/* not a malloc heap or Obj-C Garbage Collector heap */
21961 		return FALSE;
21962 	}
21963 
21964 	if (entry->wired_count) {
21965 		/* wired: can't change the map entry... */
21966 		vm_counters.should_cow_but_wired++;
21967 		return FALSE;
21968 	}
21969 
21970 	object = VME_OBJECT(entry);
21971 
21972 	if (object == VM_OBJECT_NULL) {
21973 		/* no object yet... */
21974 		return FALSE;
21975 	}
21976 
21977 	if (!object->internal) {
21978 		/* not an internal object */
21979 		return FALSE;
21980 	}
21981 
21982 	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
21983 		/* not the default copy strategy */
21984 		return FALSE;
21985 	}
21986 
21987 	if (object->true_share) {
21988 		/* already true_share: too late to avoid it */
21989 		return FALSE;
21990 	}
21991 
21992 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
21993 	    object->vo_size != ANON_CHUNK_SIZE) {
21994 		/* ... not an object created for the ObjC Garbage Collector */
21995 		return FALSE;
21996 	}
21997 
21998 	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
21999 	    object->vo_size != 2048 * 4096) {
22000 		/* ... not a "MALLOC_SMALL" heap */
22001 		return FALSE;
22002 	}
22003 
22004 	/*
22005 	 * All the criteria match: we have a large object being targeted for "true_share".
22006 	 * To limit the adverse side-effects linked with "true_share", tell the caller to
22007 	 * try and avoid setting up the entire object for "true_share" by clipping the
22008 	 * targeted range and setting it up for copy-on-write.
22009 	 */
22010 	return TRUE;
22011 }
22012 
uint64_t vm_map_range_overflows_count = 0;
TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
/*
 * vm_map_range_overflows:
 * Returns true when [addr, addr + size) is not a valid range for "map":
 * either addr + size overflows, or the page-rounded range collapses or
 * wraps (end <= start).  A zero size is never an overflow.  Overflows are
 * counted, optionally logged, and traced via DTrace.
 */
bool
vm_map_range_overflows(
	vm_map_t map,
	vm_map_offset_t addr,
	vm_map_size_t size)
{
	vm_map_offset_t start, end, sum;
	vm_map_offset_t pgmask;

	if (size == 0) {
		/* empty range -> no overflow */
		return false;
	}
	pgmask = vm_map_page_mask(map);
	start = vm_map_trunc_page_mask(addr, pgmask);
	end = vm_map_round_page_mask(addr + size, pgmask);
	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
		vm_map_range_overflows_count++;
		if (vm_map_range_overflows_log) {
			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
			    proc_selfpid(),
			    proc_best_name(current_proc()),
			    (uint64_t)addr,
			    (uint64_t)size,
			    (uint64_t)pgmask);
		}
		DTRACE_VM4(vm_map_range_overflows,
		    vm_map_t, map,
		    uint32_t, pgmask,
		    uint64_t, (uint64_t)addr,
		    uint64_t, (uint64_t)size);
		return true;
	}
	return false;
}
22050 
vm_map_offset_t
vm_map_round_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	/* Round "offset" up to the next boundary defined by "mask". */
	return VM_MAP_ROUND_PAGE(offset, mask);
}
22058 
vm_map_offset_t
vm_map_trunc_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	/* Truncate "offset" down to the boundary defined by "mask". */
	return VM_MAP_TRUNC_PAGE(offset, mask);
}
22066 
22067 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)22068 vm_map_page_aligned(
22069 	vm_map_offset_t offset,
22070 	vm_map_offset_t mask)
22071 {
22072 	return ((offset) & mask) == 0;
22073 }
22074 
int
vm_map_page_shift(
	vm_map_t map)
{
	/* Accessor: the map's page shift (log2 of its page size). */
	return VM_MAP_PAGE_SHIFT(map);
}
22081 
int
vm_map_page_size(
	vm_map_t map)
{
	/* Accessor: the map's page size in bytes. */
	return VM_MAP_PAGE_SIZE(map);
}
22088 
vm_map_offset_t
vm_map_page_mask(
	vm_map_t map)
{
	/* Accessor: the map's page mask (page size - 1). */
	return VM_MAP_PAGE_MASK(map);
}
22095 
/*
 * vm_map_set_page_shift:
 * Change the map's page shift.  Only legal on an empty map.
 * NOTE(review): "pageshift" itself is not range-checked here — assumes
 * callers pass a sane shift; verify at call sites.
 */
kern_return_t
vm_map_set_page_shift(
	vm_map_t        map,
	int             pageshift)
{
	if (map->hdr.nentries != 0) {
		/* too late to change page size */
		return KERN_FAILURE;
	}

	map->hdr.page_shift = (uint16_t)pageshift;

	return KERN_SUCCESS;
}
22110 
/*
 * vm_map_query_volatile:
 *	Walk the entries of "map" and accumulate sizes for the writable
 *	mappings of volatile (or empty) purgeable VM objects:
 *	virtual size, resident size, compressed size, and the pmap's view
 *	of resident/compressed sizes.  All results are in bytes.
 *
 *	The map must be locked by the caller and stays locked on return.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t        map,
	mach_vm_size_t  *volatile_virtual_size_p,
	mach_vm_size_t  *volatile_resident_size_p,
	mach_vm_size_t  *volatile_compressed_size_p,
	mach_vm_size_t  *volatile_pmap_size_p,
	mach_vm_size_t  *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t  volatile_virtual_size;
	mach_vm_size_t  volatile_resident_count;
	mach_vm_size_t  volatile_compressed_count;
	mach_vm_size_t  volatile_pmap_count;
	mach_vm_size_t  volatile_compressed_pmap_count;
	mach_vm_size_t  resident_count;
	vm_map_entry_t  entry;
	vm_object_t     object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t  pmap_resident_bytes, pmap_compressed_bytes;

		/* only consider writable mappings of purgeable objects */
		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once.  We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		resident_count = object->resident_page_count;
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 at this point
		 * (non-zero offsets were skipped just above), so this
		 * adjustment is effectively a no-op kept as defensive code.
		 */
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		/* ask the pmap how much of this range is resident/compressed */
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
22200 
22201 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)22202 vm_map_sizes(vm_map_t map,
22203     vm_map_size_t * psize,
22204     vm_map_size_t * pfree,
22205     vm_map_size_t * plargest_free)
22206 {
22207 	vm_map_entry_t  entry;
22208 	vm_map_offset_t prev;
22209 	vm_map_size_t   free, total_free, largest_free;
22210 	boolean_t       end;
22211 
22212 	if (!map) {
22213 		*psize = *pfree = *plargest_free = 0;
22214 		return;
22215 	}
22216 	total_free = largest_free = 0;
22217 
22218 	vm_map_lock_read(map);
22219 	if (psize) {
22220 		*psize = map->max_offset - map->min_offset;
22221 	}
22222 
22223 	prev = map->min_offset;
22224 	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22225 		end = (entry == vm_map_to_entry(map));
22226 
22227 		if (end) {
22228 			free = entry->vme_end   - prev;
22229 		} else {
22230 			free = entry->vme_start - prev;
22231 		}
22232 
22233 		total_free += free;
22234 		if (free > largest_free) {
22235 			largest_free = free;
22236 		}
22237 
22238 		if (end) {
22239 			break;
22240 		}
22241 		prev = entry->vme_end;
22242 	}
22243 	vm_map_unlock_read(map);
22244 	if (pfree) {
22245 		*pfree = total_free;
22246 	}
22247 	if (plargest_free) {
22248 		*plargest_free = largest_free;
22249 	}
22250 }
22251 
22252 #if VM_SCAN_FOR_SHADOW_CHAIN
/*
 * vm_map_shadow_max:
 *	Return the length of the longest VM object shadow chain found
 *	among the entries of "map" (0 for a NULL map).  Diagnostic
 *	helper, only compiled when VM_SCAN_FOR_SHADOW_CHAIN is set.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int             shadows, shadows_max;
	vm_map_entry_t  entry;
	vm_object_t     object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		/*
		 * Walk down the shadow chain hand-over-hand: lock the next
		 * object before releasing the current one, so the chain
		 * cannot change underneath us.
		 */
		vm_object_lock_shared(object);
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
22297 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22298 
/*
 * vm_commit_pagezero_status:
 *	Advise the pmap layer of "lmap"'s minimum mappable offset
 *	(see pmap_advise_pagezero_range()).
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
22304 
22305 #if __x86_64__
/*
 * vm_map_set_high_start:
 *	Record "high_start" as the map's preferred minimum address for
 *	future allocations (x86_64 only).
 */
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
22313 #endif /* __x86_64__ */
22314 
22315 #if CODE_SIGNING_MONITOR
22316 
/*
 * vm_map_entry_cs_associate:
 *	Associate code-signing information for "entry" in "map" with the
 *	code-signing monitor: as a "debug" region (remap-for-debugging),
 *	a JIT region, or by locating the code-signed vnode backing the
 *	entry's VM object and attaching its cs_blobs to the mapping.
 *
 *	On success, marks the entry csm_associated (and possibly
 *	vme_permanent); on a real failure, strips execute permissions
 *	unless an immutable mapping is being overwritten.  The caller
 *	must hold the map lock exclusively.
 */
kern_return_t
vm_map_entry_cs_associate(
	vm_map_t                map,
	vm_map_entry_t          entry,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	/* nothing to do without a pmap, for submaps, exempt address
	 * spaces, or entries with no VM object */
	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	/*
	 * Check for a debug association mapping before we check for used_for_jit. This
	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
	 * since they are mapped with RW or RX permissions, which the page table monitor
	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
	 * violation when those USER_EXEC pages are mapped as RW.
	 *
	 * Since these pages switch between RW and RX through mprotect, they mimic what
	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
	 * on macOS systems, this works in our favor here and allows us to continue to
	 * support these legacy-programmed applications without sacrificing security on
	 * the page table or the code signing monitor. We don't need to explicitly check
	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
	 * created with RX, then the application must map it as RW in order to first write
	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
	 * Similarly, if the mapping was created as RW, and then switched to RX,
	 * vm_map_protect will again mark the entry as a copy, and both these cases
	 * lead to this if-statement being entered.
	 *
	 * For more information: rdar://115313336.
	 */
	if (vmk_flags.vmkf_remap_prot_copy) {
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);

		/*
		 * csm_associate_debug_region returns not supported when the code signing
		 * monitor is disabled. This is intentional, since cs_ret is checked towards
		 * the end of the function, and if it is not supported, then we still want the
		 * VM to perform code-signing enforcement on this entry. That said, if we don't
		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
		 * cases, which will cause a violation when attempted to be mapped as writable).
		 */
		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ]  vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	if (entry->used_for_jit) {
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/* go to the bottom of cs_object's shadow chain */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			cs_offset += cs_object->vo_shadow_offset;
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			/* anonymous or pagerless object: nothing to associate */
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 *      vnode_pager
		 *	apple_protect_pager
		 *      shared_region_pager
		 *	fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}
22551 
22552 #endif /* CODE_SIGNING_MONITOR */
22553 
22554 inline bool
vm_map_is_corpse_source(vm_map_t map)22555 vm_map_is_corpse_source(vm_map_t map)
22556 {
22557 	bool status = false;
22558 	if (map) {
22559 		vm_map_lock_read(map);
22560 		status = map->corpse_source;
22561 		vm_map_unlock_read(map);
22562 	}
22563 	return status;
22564 }
22565 
22566 inline void
vm_map_set_corpse_source(vm_map_t map)22567 vm_map_set_corpse_source(vm_map_t map)
22568 {
22569 	if (map) {
22570 		vm_map_lock(map);
22571 		map->corpse_source = true;
22572 		vm_map_unlock(map);
22573 	}
22574 }
22575 
22576 inline void
vm_map_unset_corpse_source(vm_map_t map)22577 vm_map_unset_corpse_source(vm_map_t map)
22578 {
22579 	if (map) {
22580 		vm_map_lock(map);
22581 		map->corpse_source = false;
22582 		vm_map_unlock(map);
22583 	}
22584 }
22585 /*
22586  * FORKED CORPSE FOOTPRINT
22587  *
22588  * A forked corpse gets a copy of the original VM map but its pmap is mostly
22589  * empty since it never ran and never got to fault in any pages.
22590  * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22591  * a forked corpse would therefore return very little information.
22592  *
22593  * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22594  * to vm_map_fork() to collect footprint information from the original VM map
22595  * and its pmap, and store it in the forked corpse's VM map.  That information
22596  * is stored in place of the VM map's "hole list" since we'll never need to
22597  * lookup for holes in the corpse's map.
22598  *
22599  * The corpse's footprint info looks like this:
22600  *
22601  * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22602  * as follows:
22603  *                     +---------------------------------------+
22604  *            header-> | cf_size                               |
22605  *                     +-------------------+-------------------+
22606  *                     | cf_last_region    | cf_last_zeroes    |
22607  *                     +-------------------+-------------------+
22608  *           region1-> | cfr_vaddr                             |
22609  *                     +-------------------+-------------------+
22610  *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
22611  *                     +---------------------------------------+
22612  *                     | d4 | d5 | ...                         |
22613  *                     +---------------------------------------+
22614  *                     | ...                                   |
22615  *                     +-------------------+-------------------+
22616  *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
22617  *                     +-------------------+-------------------+
22618  *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
22619  *                     +---------------------------------------+
22620  *                     | d0 | d1 ...                           |
22621  *                     +---------------------------------------+
22622  *                       ...
22623  *                     +---------------------------------------+
22624  *       last region-> | cfr_vaddr                             |
22625  *                     +---------------------------------------+
22626  *                     + cfr_num_pages     | d0 | d1 | d2 | d3 |
22627  *                     +---------------------------------------+
22628  *                       ...
22629  *                     +---------------------------------------+
22630  *                     | dx | dy | dz | na | na | na | na | na |
22631  *                     +---------------------------------------+
22632  *
22633  * where:
22634  *      cf_size:	total size of the buffer (rounded to page size)
22635  *      cf_last_region:	offset in the buffer of the last "region" sub-header
22636  *	cf_last_zeroes: number of trailing "zero" dispositions at the end
22637  *			of last region
22638  *	cfr_vaddr:	virtual address of the start of the covered "region"
22639  *	cfr_num_pages:	number of pages in the covered "region"
22640  *	d*:		disposition of the page at that virtual address
22641  * Regions in the buffer are word-aligned.
22642  *
22643  * We estimate the size of the buffer based on the number of memory regions
22644  * and the virtual size of the address space.  While copying each memory region
22645  * during vm_map_fork(), we also collect the footprint info for that region
22646  * and store it in the buffer, packing it as much as possible (coalescing
22647  * contiguous memory regions to avoid having too many region headers and
22648  * avoiding long streaks of "zero" page dispositions by splitting footprint
 * "regions"), so the number of regions in the footprint buffer might not match
22650  * the number of memory regions in the address space.
22651  *
22652  * We also have to copy the original task's "nonvolatile" ledgers since that's
22653  * part of the footprint and will need to be reported to any tool asking for
22654  * the footprint information of the forked corpse.
22655  */
22656 
/*
 * Telemetry counters for corpse footprint collection.
 * NOTE(review): only vm_map_corpse_footprint_no_buf is updated in this
 * part of the file (incremented on buffer allocation failure); the
 * others are presumably maintained by the collection/teardown paths.
 */
uint64_t vm_map_corpse_footprint_count = 0;
uint64_t vm_map_corpse_footprint_size_avg = 0;
uint64_t vm_map_corpse_footprint_size_max = 0;
uint64_t vm_map_corpse_footprint_full = 0;
uint64_t vm_map_corpse_footprint_no_buf = 0;
22662 
/*
 * Header at the start of a corpse footprint buffer; see the
 * "FORKED CORPSE FOOTPRINT" layout description above.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t       cf_size;        /* allocated buffer size */
	uint32_t        cf_last_region; /* offset of last region in buffer */
	/* third word is reused depending on the buffer's life phase */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* one page's disposition, narrowed to 8 bits
 * (see vm_page_disposition_to_cf_disp()) */
typedef uint8_t cf_disp_t;
/*
 * A "region" of contiguous page dispositions within the footprint
 * buffer: a small header followed by cfr_num_pages cf_disp_t entries
 * (declared as a zero-length trailing array).
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr;      /* region start virtual address */
	uint32_t        cfr_num_pages;  /* number of pages in this "region" */
	cf_disp_t   cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
22682 
22683 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)22684 vm_page_disposition_to_cf_disp(
22685 	int disposition)
22686 {
22687 	assert(sizeof(cf_disp_t) == 1);
22688 	/* relocate bits that don't fit in a "uint8_t" */
22689 	if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22690 		disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22691 	}
22692 	/* cast gets rid of extra bits */
22693 	return (cf_disp_t) disposition;
22694 }
22695 
22696 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)22697 vm_page_cf_disp_to_disposition(
22698 	cf_disp_t cf_disp)
22699 {
22700 	int disposition;
22701 
22702 	assert(sizeof(cf_disp_t) == 1);
22703 	disposition = (int) cf_disp;
22704 	/* move relocated bits back in place */
22705 	if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22706 		disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22707 		disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22708 	}
22709 	return disposition;
22710 }
22711 
22712 /*
22713  * vm_map_corpse_footprint_new_region:
22714  *      closes the current footprint "region" and creates a new one
22715  *
22716  * Returns NULL if there's not enough space in the buffer for a new region.
22717  */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t       footprint_edge;   /* first byte past the buffer */
	uint32_t        new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* locate the current last region via its offset in the header */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region: past the last region's
	 * dispositions, rounded up to keep regions word-aligned */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
22774 
22775 /*
22776  * vm_map_corpse_footprint_collect:
22777  *	collect footprint information for "old_entry" in "old_map" and
22778  *	stores it in "new_map"'s vmmap_footprint_info.
22779  */
22780 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)22781 vm_map_corpse_footprint_collect(
22782 	vm_map_t        old_map,
22783 	vm_map_entry_t  old_entry,
22784 	vm_map_t        new_map)
22785 {
22786 	vm_map_offset_t va;
22787 	kern_return_t   kr;
22788 	struct vm_map_corpse_footprint_header *footprint_header;
22789 	struct vm_map_corpse_footprint_region *footprint_region;
22790 	struct vm_map_corpse_footprint_region *new_footprint_region;
22791 	cf_disp_t       *next_disp_p;
22792 	uintptr_t       footprint_edge;
22793 	uint32_t        num_pages_tmp;
22794 	int             effective_page_size;
22795 
22796 	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22797 
22798 	va = old_entry->vme_start;
22799 
22800 	vm_map_lock_assert_exclusive(old_map);
22801 	vm_map_lock_assert_exclusive(new_map);
22802 
22803 	assert(new_map->has_corpse_footprint);
22804 	assert(!old_map->has_corpse_footprint);
22805 	if (!new_map->has_corpse_footprint ||
22806 	    old_map->has_corpse_footprint) {
22807 		/*
22808 		 * This can only transfer footprint info from a
22809 		 * map with a live pmap to a map with a corpse footprint.
22810 		 */
22811 		return KERN_NOT_SUPPORTED;
22812 	}
22813 
22814 	if (new_map->vmmap_corpse_footprint == NULL) {
22815 		vm_offset_t     buf;
22816 		vm_size_t       buf_size;
22817 
22818 		buf = 0;
22819 		buf_size = (sizeof(*footprint_header) +
22820 		    (old_map->hdr.nentries
22821 		    *
22822 		    (sizeof(*footprint_region) +
22823 		    +3))            /* potential alignment for each region */
22824 		    +
22825 		    ((old_map->size / effective_page_size)
22826 		    *
22827 		    sizeof(cf_disp_t)));      /* disposition for each page */
22828 //		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
22829 		buf_size = round_page(buf_size);
22830 
22831 		/* limit buffer to 1 page to validate overflow detection */
22832 //		buf_size = PAGE_SIZE;
22833 
22834 		/* limit size to a somewhat sane amount */
22835 #if XNU_TARGET_OS_OSX
22836 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
22837 #else /* XNU_TARGET_OS_OSX */
22838 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
22839 #endif /* XNU_TARGET_OS_OSX */
22840 		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22841 			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22842 		}
22843 
22844 		/*
22845 		 * Allocate the pageable buffer (with a trailing guard page).
22846 		 * It will be zero-filled on demand.
22847 		 */
22848 		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
22849 		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
22850 		    VM_KERN_MEMORY_DIAG);
22851 		if (kr != KERN_SUCCESS) {
22852 			vm_map_corpse_footprint_no_buf++;
22853 			return kr;
22854 		}
22855 
22856 		/* initialize header and 1st region */
22857 		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22858 		new_map->vmmap_corpse_footprint = footprint_header;
22859 
22860 		footprint_header->cf_size = buf_size;
22861 		footprint_header->cf_last_region =
22862 		    sizeof(*footprint_header);
22863 		footprint_header->cf_last_zeroes = 0;
22864 
22865 		footprint_region = (struct vm_map_corpse_footprint_region *)
22866 		    ((char *)footprint_header +
22867 		    footprint_header->cf_last_region);
22868 		footprint_region->cfr_vaddr = 0;
22869 		footprint_region->cfr_num_pages = 0;
22870 	} else {
22871 		/* retrieve header and last region */
22872 		footprint_header = (struct vm_map_corpse_footprint_header *)
22873 		    new_map->vmmap_corpse_footprint;
22874 		footprint_region = (struct vm_map_corpse_footprint_region *)
22875 		    ((char *)footprint_header +
22876 		    footprint_header->cf_last_region);
22877 	}
22878 	footprint_edge = ((uintptr_t)footprint_header +
22879 	    footprint_header->cf_size);
22880 
22881 	if ((footprint_region->cfr_vaddr +
22882 	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22883 	    effective_page_size))
22884 	    != old_entry->vme_start) {
22885 		uint64_t num_pages_delta, num_pages_delta_size;
22886 		uint32_t region_offset_delta_size;
22887 
22888 		/*
22889 		 * Not the next contiguous virtual address:
22890 		 * start a new region or store "zero" dispositions for
22891 		 * the missing pages?
22892 		 */
22893 		/* size of gap in actual page dispositions */
22894 		num_pages_delta = ((old_entry->vme_start -
22895 		    footprint_region->cfr_vaddr) / effective_page_size)
22896 		    - footprint_region->cfr_num_pages;
22897 		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22898 		/* size of gap as a new footprint region header */
22899 		region_offset_delta_size =
22900 		    (sizeof(*footprint_region) +
22901 		    roundup(((footprint_region->cfr_num_pages -
22902 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22903 		    sizeof(int)) -
22904 		    ((footprint_region->cfr_num_pages -
22905 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22906 //		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22907 		if (region_offset_delta_size < num_pages_delta_size ||
22908 		    os_add3_overflow(footprint_region->cfr_num_pages,
22909 		    (uint32_t) num_pages_delta,
22910 		    1,
22911 		    &num_pages_tmp)) {
22912 			/*
22913 			 * Storing data for this gap would take more space
22914 			 * than inserting a new footprint region header:
22915 			 * let's start a new region and save space. If it's a
22916 			 * tie, let's avoid using a new region, since that
22917 			 * would require more region hops to find the right
22918 			 * range during lookups.
22919 			 *
22920 			 * If the current region's cfr_num_pages would overflow
22921 			 * if we added "zero" page dispositions for the gap,
22922 			 * no choice but to start a new region.
22923 			 */
22924 //			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
22925 			new_footprint_region =
22926 			    vm_map_corpse_footprint_new_region(footprint_header);
22927 			/* check that we're not going over the edge */
22928 			if (new_footprint_region == NULL) {
22929 				goto over_the_edge;
22930 			}
22931 			footprint_region = new_footprint_region;
22932 			/* initialize new region as empty */
22933 			footprint_region->cfr_vaddr = old_entry->vme_start;
22934 			footprint_region->cfr_num_pages = 0;
22935 		} else {
22936 			/*
22937 			 * Store "zero" page dispositions for the missing
22938 			 * pages.
22939 			 */
22940 //			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
22941 			for (; num_pages_delta > 0; num_pages_delta--) {
22942 				next_disp_p = (cf_disp_t *)
22943 				    ((uintptr_t) footprint_region +
22944 				    sizeof(*footprint_region));
22945 				next_disp_p += footprint_region->cfr_num_pages;
22946 				/* check that we're not going over the edge */
22947 				if ((uintptr_t)next_disp_p >= footprint_edge) {
22948 					goto over_the_edge;
22949 				}
22950 				/* store "zero" disposition for this gap page */
22951 				footprint_region->cfr_num_pages++;
22952 				*next_disp_p = (cf_disp_t) 0;
22953 				footprint_header->cf_last_zeroes++;
22954 			}
22955 		}
22956 	}
22957 
22958 	for (va = old_entry->vme_start;
22959 	    va < old_entry->vme_end;
22960 	    va += effective_page_size) {
22961 		int             disposition;
22962 		cf_disp_t       cf_disp;
22963 
22964 		vm_map_footprint_query_page_info(old_map,
22965 		    old_entry,
22966 		    va,
22967 		    &disposition);
22968 		cf_disp = vm_page_disposition_to_cf_disp(disposition);
22969 
22970 //		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
22971 
22972 		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
22973 			/*
22974 			 * Ignore "zero" dispositions at start of
22975 			 * region: just move start of region.
22976 			 */
22977 			footprint_region->cfr_vaddr += effective_page_size;
22978 			continue;
22979 		}
22980 
22981 		/* would region's cfr_num_pages overflow? */
22982 		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
22983 		    &num_pages_tmp)) {
22984 			/* overflow: create a new region */
22985 			new_footprint_region =
22986 			    vm_map_corpse_footprint_new_region(
22987 				footprint_header);
22988 			if (new_footprint_region == NULL) {
22989 				goto over_the_edge;
22990 			}
22991 			footprint_region = new_footprint_region;
22992 			footprint_region->cfr_vaddr = va;
22993 			footprint_region->cfr_num_pages = 0;
22994 		}
22995 
22996 		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
22997 		    sizeof(*footprint_region));
22998 		next_disp_p += footprint_region->cfr_num_pages;
22999 		/* check that we're not going over the edge */
23000 		if ((uintptr_t)next_disp_p >= footprint_edge) {
23001 			goto over_the_edge;
23002 		}
23003 		/* store this dispostion */
23004 		*next_disp_p = cf_disp;
23005 		footprint_region->cfr_num_pages++;
23006 
23007 		if (cf_disp != 0) {
23008 			/* non-zero disp: break the current zero streak */
23009 			footprint_header->cf_last_zeroes = 0;
23010 			/* done */
23011 			continue;
23012 		}
23013 
23014 		/* zero disp: add to the current streak of zeroes */
23015 		footprint_header->cf_last_zeroes++;
23016 		if ((footprint_header->cf_last_zeroes +
23017 		    roundup(((footprint_region->cfr_num_pages -
23018 		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23019 		    (sizeof(int) - 1),
23020 		    sizeof(int))) <
23021 		    (sizeof(*footprint_header))) {
23022 			/*
23023 			 * There are not enough trailing "zero" dispositions
23024 			 * (+ the extra padding we would need for the previous
23025 			 * region); creating a new region would not save space
23026 			 * at this point, so let's keep this "zero" disposition
23027 			 * in this region and reconsider later.
23028 			 */
23029 			continue;
23030 		}
23031 		/*
23032 		 * Create a new region to avoid having too many consecutive
23033 		 * "zero" dispositions.
23034 		 */
23035 		new_footprint_region =
23036 		    vm_map_corpse_footprint_new_region(footprint_header);
23037 		if (new_footprint_region == NULL) {
23038 			goto over_the_edge;
23039 		}
23040 		footprint_region = new_footprint_region;
23041 		/* initialize the new region as empty ... */
23042 		footprint_region->cfr_num_pages = 0;
23043 		/* ... and skip this "zero" disp */
23044 		footprint_region->cfr_vaddr = va + effective_page_size;
23045 	}
23046 
23047 	return KERN_SUCCESS;
23048 
23049 over_the_edge:
23050 //	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23051 	vm_map_corpse_footprint_full++;
23052 	return KERN_RESOURCE_SHORTAGE;
23053 }
23054 
23055 /*
23056  * vm_map_corpse_footprint_collect_done:
23057  *	completes the footprint collection by getting rid of any remaining
23058  *	trailing "zero" dispositions and trimming the unused part of the
23059  *	kernel buffer
23060  */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t        new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t       buf_size, actual_size;
	kern_return_t   kr;

	/* nothing to finalize if no footprint buffer was attached to the map */
	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	/* locate the last region ("cf_last_region" is a byte offset) */
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* bytes actually used: last region's offset + its header + its dispositions */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

//	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update global footprint-size statistics (running average, count, max) */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/*
		 * Trim the unused tail of the buffer: release everything past
		 * (actual_size + PAGE_SIZE), keeping the page right after
		 * "actual_size" around to serve as the new trailing guard page.
		 */
		kr = vm_deallocate(kernel_map,
		    vm_sanitize_wrap_addr((vm_address_t)footprint_header +
		    actual_size + PAGE_SIZE), /* trailing guard page */
		    vm_sanitize_wrap_size(buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		/* make the new guard page inaccessible */
		kr = vm_protect(kernel_map,
		    (vm_address_t)footprint_header + actual_size,
		    PAGE_SIZE,
		    FALSE,             /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	footprint_header->cf_size = actual_size;
}
23131 
23132 /*
23133  * vm_map_corpse_footprint_query_page_info:
23134  *	retrieves the disposition of the page at virtual address "vaddr"
23135  *	in the forked corpse's VM map
23136  *
23137  * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23138  */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t        map,
	vm_map_offset_t va,
	int             *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t        footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int             disp_idx;
	kern_return_t   kr;
	int             effective_page_size;
	cf_disp_t       cf_disp;

	/* only maps that carry a corpse footprint can be queried */
	if (!map->has_corpse_footprint) {
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	/* page size the dispositions were collected at */
	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset > footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* scan forward, region by region, until one covers "va" */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			/* already at the last region: no match */
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not covered by any region: report "zero" disposition */
		*disposition_p = 0;
//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	/* convert the compact disposition back to a full page disposition */
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
//	if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
23245 
/*
 * vm_map_corpse_footprint_destroy:
 *	releases the corpse footprint buffer attached to "map"
 *	(including its trailing guard page), if any.
 */
void
vm_map_corpse_footprint_destroy(
	vm_map_t        map)
{
	if (map->has_corpse_footprint &&
	    map->vmmap_corpse_footprint != 0) {
		struct vm_map_corpse_footprint_header *footprint_header;
		vm_size_t buf_size;
		kern_return_t kr;

		footprint_header = map->vmmap_corpse_footprint;
		/* "cf_size" was trimmed to the used size by collect_done() */
		buf_size = footprint_header->cf_size;
		kr = vm_deallocate(kernel_map,
		    vm_sanitize_wrap_addr((vm_offset_t) map->vmmap_corpse_footprint),
		    vm_sanitize_wrap_size(buf_size + PAGE_SIZE)); /* trailing guard page */
		assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
		/* mark the map as no longer owning a footprint buffer */
		map->vmmap_corpse_footprint = 0;
		map->has_corpse_footprint = FALSE;
	}
}
23266 
23267 /*
23268  * vm_map_copy_footprint_ledgers:
23269  *	copies any ledger that's relevant to the memory footprint of "old_task"
23270  *	into the forked corpse's task ("new_task")
23271  */
23272 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)23273 vm_map_copy_footprint_ledgers(
23274 	task_t  old_task,
23275 	task_t  new_task)
23276 {
23277 	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23278 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23279 	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23280 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23281 	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23282 	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23283 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23284 	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23285 	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23286 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23287 	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23288 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23289 	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23290 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23291 	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23292 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23293 	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23294 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23295 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23296 	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23297 	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23298 }
23299 
23300 /*
23301  * vm_map_copy_ledger:
23302  *	copy a single ledger from "old_task" to "new_task"
23303  */
23304 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)23305 vm_map_copy_ledger(
23306 	task_t  old_task,
23307 	task_t  new_task,
23308 	int     ledger_entry)
23309 {
23310 	ledger_amount_t old_balance, new_balance, delta;
23311 
23312 	assert(new_task->map->has_corpse_footprint);
23313 	if (!new_task->map->has_corpse_footprint) {
23314 		return;
23315 	}
23316 
23317 	/* turn off sanity checks for the ledger we're about to mess with */
23318 	ledger_disable_panic_on_negative(new_task->ledger,
23319 	    ledger_entry);
23320 
23321 	/* adjust "new_task" to match "old_task" */
23322 	ledger_get_balance(old_task->ledger,
23323 	    ledger_entry,
23324 	    &old_balance);
23325 	ledger_get_balance(new_task->ledger,
23326 	    ledger_entry,
23327 	    &new_balance);
23328 	if (new_balance == old_balance) {
23329 		/* new == old: done */
23330 	} else if (new_balance > old_balance) {
23331 		/* new > old ==> new -= new - old */
23332 		delta = new_balance - old_balance;
23333 		ledger_debit(new_task->ledger,
23334 		    ledger_entry,
23335 		    delta);
23336 	} else {
23337 		/* new < old ==> new += old - new */
23338 		delta = old_balance - new_balance;
23339 		ledger_credit(new_task->ledger,
23340 		    ledger_entry,
23341 		    delta);
23342 	}
23343 }
23344 
23345 /*
23346  * vm_map_get_pmap:
23347  * returns the pmap associated with the vm_map
23348  */
23349 pmap_t
vm_map_get_pmap(vm_map_t map)23350 vm_map_get_pmap(vm_map_t map)
23351 {
23352 	return vm_map_pmap(map);
23353 }
23354 
/*
 * vm_map_get_phys_page:
 *	returns the physical page number backing "addr" in "map",
 *	or 0 if no physical page could be located.
 */
ppnum_t
vm_map_get_phys_page(
	vm_map_t                map,
	vm_offset_t             addr)
{
	vm_object_offset_t      offset;
	vm_object_t             object;
	vm_map_offset_t         map_offset;
	vm_map_entry_t          entry;
	ppnum_t                 phys_page = 0;

	map_offset = vm_map_trunc_page(addr, PAGE_MASK);

	vm_map_lock(map);
	while (vm_map_lookup_entry(map, map_offset, &entry)) {
		if (entry->is_sub_map) {
			/* descend into the submap (hand-over-hand locking) */
			vm_map_t        old_map;
			vm_map_lock(VME_SUBMAP(entry));
			old_map = map;
			map = VME_SUBMAP(entry);
			map_offset = (VME_OFFSET(entry) +
			    (map_offset - entry->vme_start));
			vm_map_unlock(old_map);
			continue;
		}
		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			/* no backing object: no physical page */
			vm_map_unlock(map);
			return (ppnum_t) 0;
		}
		if (VME_OBJECT(entry)->phys_contiguous) {
			/* These are not standard pageable memory mappings */
			/* If they are not present in the object they will  */
			/* have to be picked up from the pager through the  */
			/* fault mechanism.  */
			if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
				/* need to call vm_fault */
				vm_map_unlock(map);
				vm_fault(map, map_offset, VM_PROT_NONE,
				    FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
				    THREAD_UNINT, NULL, 0);
				vm_map_lock(map);
				continue;
			}
			/* phys-contiguous object: compute the page arithmetically */
			offset = (VME_OFFSET(entry) +
			    (map_offset - entry->vme_start));
			phys_page = (ppnum_t)
			    ((VME_OBJECT(entry)->vo_shadow_offset
			    + offset) >> PAGE_SHIFT);
			break;
		}
		offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
		object = VME_OBJECT(entry);
		vm_object_lock(object);
		/* walk the shadow chain until the page is found (or runs out) */
		while (TRUE) {
			vm_page_t dst_page = vm_page_lookup(object, offset);
			if (dst_page == VM_PAGE_NULL) {
				if (object->shadow) {
					/* not resident here: look in the shadow object */
					vm_object_t old_object;
					vm_object_lock(object->shadow);
					old_object = object;
					offset = offset + object->vo_shadow_offset;
					object = object->shadow;
					vm_object_unlock(old_object);
				} else {
					/* end of shadow chain: page not resident */
					vm_object_unlock(object);
					break;
				}
			} else {
				phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
				vm_object_unlock(object);
				break;
			}
		}
		break;
	}

	vm_map_unlock(map);
	return phys_page;
}
23434 
23435 #if CONFIG_MAP_RANGES
/* VM tags (VM_MEMORY_*) whose allocations are placed in the "heap"/data range */
static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
/* VM tags allowed to use the large-file range (when configured) */
static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];

/* user memory range IDs must line up with the Mach VM range numbering */
static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23441 
23442 /*
23443  * vm_map_range_map_init:
23444  *  initializes the VM range ID map to enable index lookup
23445  *  of user VM ranges based on VM tag from userspace.
23446  */
static void
vm_map_range_map_init(void)
{
	/*
	 * Populate the tag bitmaps consulted when deciding which user VM
	 * range an allocation belongs to, based on its VM_MEMORY_* tag.
	 *
	 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
	 * - the former is malloc metadata which should be kept separate
	 * - the latter has its own ranges
	 */
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
	bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
	/* tags additionally eligible for the large-file range */
	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
	bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
}
23474 
23475 static struct mach_vm_range
vm_map_range_random_uniform(vm_map_size_t req_size,vm_map_offset_t min_addr,vm_map_offset_t max_addr,vm_map_offset_t offmask)23476 vm_map_range_random_uniform(
23477 	vm_map_size_t           req_size,
23478 	vm_map_offset_t         min_addr,
23479 	vm_map_offset_t         max_addr,
23480 	vm_map_offset_t         offmask)
23481 {
23482 	vm_map_offset_t random_addr;
23483 	struct mach_vm_range alloc;
23484 
23485 	req_size = (req_size + offmask) & ~offmask;
23486 	min_addr = (min_addr + offmask) & ~offmask;
23487 	max_addr = max_addr & ~offmask;
23488 
23489 	read_random(&random_addr, sizeof(random_addr));
23490 	random_addr %= (max_addr - req_size - min_addr);
23491 	random_addr &= ~offmask;
23492 
23493 	alloc.min_address = min_addr + random_addr;
23494 	alloc.max_address = min_addr + random_addr + req_size;
23495 	return alloc;
23496 }
23497 
23498 static vm_map_offset_t
vm_map_range_offmask(void)23499 vm_map_range_offmask(void)
23500 {
23501 	uint32_t pte_depth;
23502 
23503 	/*
23504 	 * PTE optimizations
23505 	 *
23506 	 *
23507 	 * 16k pages systems
23508 	 * ~~~~~~~~~~~~~~~~~
23509 	 *
23510 	 * A single L1 (sub-)page covers the address space.
23511 	 * - L2 pages cover 64G,
23512 	 * - L3 pages cover 32M.
23513 	 *
23514 	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
23515 	 * As a result, we really only need to align the ranges to 32M to avoid
23516 	 * partial L3 pages.
23517 	 *
23518 	 * On macOS, the usage of L2 pages will increase, so as a result we will
23519 	 * want to align ranges to 64G in order to utilize them fully.
23520 	 *
23521 	 *
23522 	 * 4k pages systems
23523 	 * ~~~~~~~~~~~~~~~~
23524 	 *
23525 	 * A single L0 (sub-)page covers the address space.
23526 	 * - L1 pages cover 512G,
23527 	 * - L2 pages cover 1G,
23528 	 * - L3 pages cover 2M.
23529 	 *
23530 	 * The long tail of processes on a system will tend to have a VA usage
23531 	 * (ignoring the shared regions) in the 100s of MB order of magnitnude.
23532 	 * This is achievable with a single L1 and a few L2s without
23533 	 * randomization.
23534 	 *
23535 	 * However once randomization is introduced, the system will immediately
23536 	 * need several L1s and many more L2s. As a result:
23537 	 *
23538 	 * - on embedded devices, the cost of these extra pages isn't
23539 	 *   sustainable, and we just disable the feature entirely,
23540 	 *
23541 	 * - on macOS we align ranges to a 512G boundary so that the extra L1
23542 	 *   pages can be used to their full potential.
23543 	 */
23544 
23545 	/*
23546 	 * note, this function assumes _non exotic mappings_
23547 	 * which is why it uses the native kernel's PAGE_SHIFT.
23548 	 */
23549 #if XNU_PLATFORM_MacOSX
23550 	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
23551 #else /* !XNU_PLATFORM_MacOSX */
23552 	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
23553 #endif /* !XNU_PLATFORM_MacOSX */
23554 
23555 	if (pte_depth == 0) {
23556 		return 0;
23557 	}
23558 
23559 	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
23560 }
23561 
23562 /*
23563  * vm_map_range_configure:
23564  *	configures the user vm_map ranges by increasing the maximum VA range of
23565  *  the map and carving out a range at the end of VA space (searching backwards
23566  *  in the newly expanded map).
23567  */
kern_return_t
vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
{
	const vm_map_offset_t offmask = vm_map_range_offmask();
	struct mach_vm_range data_range;
	vm_map_offset_t default_end;
	kern_return_t kr;

	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
		/*
		 * No point doing vm ranges in a 32bit address space.
		 */
		return KERN_NOT_SUPPORTED;
	}

	/* Should not be applying ranges to kernel map or kernel map submaps */
	assert(vm_map_pmap(map) != kernel_pmap);

#if XNU_PLATFORM_MacOSX

	/*
	 * on macOS, the address space is a massive 47 bits (128T),
	 * with several carve outs that processes can't use:
	 * - the shared region
	 * - the commpage region
	 * - the GPU carve out (if applicable)
	 *
	 * and when nano-malloc is in use it desires memory at the 96T mark.
	 *
	 * However, their location is architecture dependent:
	 * - On intel, the shared region and commpage are
	 *   at the very end of the usable address space (above +127T),
	 *   and there is no GPU carve out, and pthread wants to place
	 *   threads at the 112T mark (0x70T).
	 *
	 * - On arm64, these are in the same spot as on embedded devices:
	 *   o shared region:   [ 6G,  10G)  [ will likely grow over time ]
	 *   o commpage region: [63G,  64G)
	 *   o GPU carve out:   [64G, 448G)
	 *
	 * This is convenient because the mappings at the end of the address
	 * space (when they exist) are made by the kernel.
	 *
	 * The policy is to allocate a random 1T for the data heap
	 * in the end of the address-space in the:
	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
	 */

	/* see NANOZONE_SIGNATURE in libmalloc */
#if __x86_64__
	default_end = 0x71ull << 40;
#else
	default_end = 0x61ull << 40;
#endif
	data_range  = vm_map_range_random_uniform(1ull << 40,
	        default_end, 0x7full << 40, offmask);

#else /* !XNU_PLATFORM_MacOSX */

	/*
	 * Embedded devices:
	 *
	 *   The default VA Size scales with the device physical memory.
	 *
	 *   Out of that:
	 *   - the "zero" page typically uses 4G + some slide
	 *   - the shared region uses SHARED_REGION_SIZE bytes (4G)
	 *
	 *   Without the use of jumbo or any adjustment to the address space,
	 *   a default VM map typically looks like this:
	 *
	 *       0G -->╒════════════╕
	 *             │  pagezero  │
	 *             │  + slide   │
	 *      ~4G -->╞════════════╡<-- vm_map_min(map)
	 *             │            │
	 *       6G -->├────────────┤
	 *             │   shared   │
	 *             │   region   │
	 *      10G -->├────────────┤
	 *             │            │
	 *   max_va -->├────────────┤<-- vm_map_max(map)
	 *             │            │
	 *             ╎   jumbo    ╎
	 *             ╎            ╎
	 *             │            │
	 *      63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
	 *             │  commpage  │
	 *      64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
	 *             │            │
	 *             ╎    GPU     ╎
	 *             ╎  carveout  ╎
	 *             │            │
	 *     448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
	 *             │            │
	 *             ╎            ╎
	 *             ╎            ╎
	 *             │            │
	 *     512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
	 *
	 *   When this drawing was made, "max_va" was smaller than
	 *   ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
	 *   12G of address space for the zero-page, slide, files,
	 *   binaries, heap ...
	 *
	 *   We will want to make a "heap/data" carve out inside
	 *   the jumbo range of half of that usable space, assuming
	 *   that this is less than a fourth of the jumbo range.
	 *
	 *   The assert below intends to catch when max_va grows
	 *   too large for this heuristic.
	 */

	vm_map_lock_read(map);
	default_end = vm_map_max(map);
	vm_map_unlock_read(map);

	/*
	 * Check that we're not already jumbo'd,
	 * or our address space was somehow modified.
	 *
	 * If so we cannot guarantee that we can set up the ranges
	 * safely without interfering with the existing map.
	 */
	if (default_end > vm_compute_max_offset(true)) {
		return KERN_NO_SPACE;
	}

	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
		/*
		 * an override boot-arg was set, disable user-ranges
		 *
		 * XXX: this is problematic because it means these boot-args
		 *      no longer test the behavior changing the value
		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
		 */
		return KERN_NOT_SUPPORTED;
	}

	/* expand the default VM space to 64GB */
	vm_map_set_jumbo(map);

	/* the jumbo space must be large enough for the heuristic above */
	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
	data_range = vm_map_range_random_uniform(GiB(10),
	    default_end + PAGE_SIZE, vm_map_max(map), offmask);

#endif /* !XNU_PLATFORM_MacOSX */

	/*
	 * Poke holes so that ASAN or people listing regions
	 * do not think this space is free.
	 */

	if (default_end != data_range.min_address) {
		/* permanent VM_PROT_NONE hole between default range and data range */
		kr = vm_map_enter(map, &default_end,
		    data_range.min_address - default_end,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

	if (data_range.max_address != vm_map_max(map)) {
		vm_map_entry_t entry;
		vm_size_t size;

		/*
		 * Extend the end of the hole to the next VM entry or the end of the map,
		 * whichever comes first.
		 */
		vm_map_lock_read(map);
		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
		if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
			size = vm_map_max(map) - data_range.max_address;
		} else {
			size = entry->vme_start - data_range.max_address;
		}
		vm_map_unlock_read(map);

		kr = vm_map_enter(map, &data_range.max_address, size,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	if (needs_extra_jumbo_va) {
		/* This will grow the address space to MACH_VM_MAX_ADDRESS */
		vm_map_set_extra_jumbo(map);
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */

	/* publish the computed ranges under the map lock */
	vm_map_lock(map);
	map->default_range.min_address = vm_map_min(map);
	map->default_range.max_address = default_end;
	map->data_range = data_range;
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	/* If process has "extra jumbo" entitlement, enable large file range */
	if (needs_extra_jumbo_va) {
		map->large_file_range = vm_map_range_random_uniform(TiB(1),
		    MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	map->uses_user_ranges = true;
	vm_map_unlock(map);

	return KERN_SUCCESS;
}
23776 
23777 /*
23778  * vm_map_range_fork:
23779  *	clones the array of ranges from old_map to new_map in support
23780  *  of a VM map fork.
23781  */
23782 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)23783 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
23784 {
23785 	if (!old_map->uses_user_ranges) {
23786 		/* nothing to do */
23787 		return;
23788 	}
23789 
23790 	new_map->default_range = old_map->default_range;
23791 	new_map->data_range = old_map->data_range;
23792 
23793 	if (old_map->extra_ranges_count) {
23794 		vm_map_user_range_t otable, ntable;
23795 		uint16_t count;
23796 
23797 		otable = old_map->extra_ranges;
23798 		count  = old_map->extra_ranges_count;
23799 		ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
23800 		    Z_WAITOK | Z_ZERO | Z_NOFAIL);
23801 		memcpy(ntable, otable,
23802 		    count * sizeof(struct vm_map_user_range));
23803 
23804 		new_map->extra_ranges_count = count;
23805 		new_map->extra_ranges = ntable;
23806 	}
23807 
23808 	new_map->uses_user_ranges = true;
23809 }
23810 
23811 /*
23812  * vm_map_get_user_range:
23813  *	copy the VM user range for the given VM map and range ID.
23814  */
23815 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)23816 vm_map_get_user_range(
23817 	vm_map_t                map,
23818 	vm_map_range_id_t       range_id,
23819 	mach_vm_range_t         range)
23820 {
23821 	if (map == NULL || !map->uses_user_ranges || range == NULL) {
23822 		return KERN_INVALID_ARGUMENT;
23823 	}
23824 
23825 	switch (range_id) {
23826 	case UMEM_RANGE_ID_DEFAULT:
23827 		*range = map->default_range;
23828 		return KERN_SUCCESS;
23829 
23830 	case UMEM_RANGE_ID_HEAP:
23831 		*range = map->data_range;
23832 		return KERN_SUCCESS;
23833 
23834 	case UMEM_RANGE_ID_LARGE_FILE:
23835 		/*
23836 		 * Because this function tells a user-space process about the user
23837 		 * ranges in its VM map, this case communicates whether the large file
23838 		 * range is in use. Note that this is different from how the large file
23839 		 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
23840 		 * VA policy and return either the large file range or data range,
23841 		 * depending on whether the large file range is enabled.
23842 		 */
23843 		if (map->large_file_range.min_address != map->large_file_range.max_address) {
23844 			/* large file range is configured and should be used */
23845 			*range = map->large_file_range;
23846 		} else {
23847 			return KERN_INVALID_ARGUMENT;
23848 		}
23849 		return KERN_SUCCESS;
23850 
23851 	default:
23852 		return KERN_INVALID_ARGUMENT;
23853 	}
23854 }
23855 
23856 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)23857 vm_map_user_range_resolve(
23858 	vm_map_t                map,
23859 	mach_vm_address_t       addr,
23860 	mach_vm_size_t          size,
23861 	mach_vm_range_t         range)
23862 {
23863 	struct mach_vm_range tmp;
23864 
23865 	vm_map_lock_assert_held(map);
23866 
23867 	static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23868 	static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23869 
23870 	if (mach_vm_range_contains(&map->default_range, addr, size)) {
23871 		if (range) {
23872 			*range = map->default_range;
23873 		}
23874 		return UMEM_RANGE_ID_DEFAULT;
23875 	}
23876 
23877 	if (mach_vm_range_contains(&map->data_range, addr, size)) {
23878 		if (range) {
23879 			*range = map->data_range;
23880 		}
23881 		return UMEM_RANGE_ID_HEAP;
23882 	}
23883 
23884 	if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
23885 		if (range) {
23886 			*range = map->large_file_range;
23887 		}
23888 		return UMEM_RANGE_ID_LARGE_FILE;
23889 	}
23890 
23891 	for (size_t i = 0; i < map->extra_ranges_count; i++) {
23892 		vm_map_user_range_t r = &map->extra_ranges[i];
23893 
23894 		tmp.min_address = r->vmur_min_address;
23895 		tmp.max_address = r->vmur_max_address;
23896 
23897 		if (mach_vm_range_contains(&tmp, addr, size)) {
23898 			if (range) {
23899 				*range = tmp;
23900 			}
23901 			return r->vmur_range_id;
23902 		}
23903 	}
23904 
23905 	if (range) {
23906 		range->min_address = range->max_address = 0;
23907 	}
23908 	return UMEM_RANGE_ID_DEFAULT;
23909 }
23910 #endif /* CONFIG_MAP_RANGES */
23911 
/*
 * vm_map_kernel_flags_update_range_id:
 *	choose a placement range ID for an upcoming mapping when the caller
 *	left it unset.
 *
 *	For kernel_map, an unset range ID defaults to the kernel data range.
 *	For user maps (CONFIG_MAP_RANGES), mappings whose VM tag is in one of
 *	the steering bitmaps — or whose size meets VM_LARGE_FILE_THRESHOLD —
 *	are directed to the large-file or heap ranges.
 */
void
vm_map_kernel_flags_update_range_id(
	vm_map_kernel_flags_t *vmkf,
	vm_map_t map,
	__unused vm_map_size_t size)
{
	if (map == kernel_map) {
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
		/* tag is flagged for large-file placement, or the mapping is large */
		if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
		    || size >= VM_LARGE_FILE_THRESHOLD) {
			/*
			 * if the map doesn't have the large file range configured,
			 * the range will get resolved to the heap range in `vm_map_get_range`
			 */
			vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
		} else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
			vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
		}
#endif /* CONFIG_MAP_RANGES */
	}
}
23938 
23939 /*
23940  * vm_map_entry_has_device_pager:
23941  * Check if the vm map entry specified by the virtual address has a device pager.
23942  * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
23943  */
23944 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)23945 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
23946 {
23947 	vm_map_entry_t entry;
23948 	vm_object_t object;
23949 	boolean_t result;
23950 
23951 	if (map == NULL) {
23952 		return FALSE;
23953 	}
23954 
23955 	vm_map_lock(map);
23956 	while (TRUE) {
23957 		if (!vm_map_lookup_entry(map, vaddr, &entry)) {
23958 			result = FALSE;
23959 			break;
23960 		}
23961 		if (entry->is_sub_map) {
23962 			// Check the submap
23963 			vm_map_t submap = VME_SUBMAP(entry);
23964 			assert(submap != NULL);
23965 			vm_map_lock(submap);
23966 			vm_map_unlock(map);
23967 			map = submap;
23968 			continue;
23969 		}
23970 		object = VME_OBJECT(entry);
23971 		if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
23972 			result = TRUE;
23973 			break;
23974 		}
23975 		result = FALSE;
23976 		break;
23977 	}
23978 
23979 	vm_map_unlock(map);
23980 	return result;
23981 }
23982 
23983 
23984 #if MACH_ASSERT
23985 
/* boot-time knobs controlling whether ledger imbalances panic (see pmap layer) */
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * Per-ledger drift counters: number of times the named ledger was found
 * over / under zero at check time, the cumulative amount of drift in each
 * direction, and the worst single imbalance observed.
 */
#define LEDGER_DRIFT(__LEDGER)                    \
	int             __LEDGER##_over;          \
	ledger_amount_t __LEDGER##_over_total;    \
	ledger_amount_t __LEDGER##_over_max;      \
	int             __LEDGER##_under;         \
	ledger_amount_t __LEDGER##_under_total;   \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator of ledger drift recorded by
 * vm_map_pmap_check_ledgers(), one LEDGER_DRIFT record per tracked
 * task ledger (MACH_ASSERT builds only).
 */
struct {
	uint64_t        num_pmaps_checked; /* pmaps audited so far */

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint_total);
} pmap_ledgers_drift;
24035 
/*
 * vm_map_pmap_check_ledgers:
 *	audit every pmap-related task ledger for the given pmap (called on
 *	teardown, per the panic message).  Any non-zero balance is logged and
 *	folded into the global pmap_ledgers_drift statistics; depending on the
 *	per-ledger panic_on_negative flag and the pmap_ledgers_panic /
 *	pmap_ledgers_panic_leeway boot-args, an imbalance may panic.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t          pmap,
	ledger_t        ledger,
	int             pid,
	char            *procname)
{
	ledger_amount_t bal;
	boolean_t       do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * For one ledger entry: read its balance, and if non-zero, decide whether
 * this warrants a panic (flagged per-ledger, or drift beyond the configured
 * leeway in pages), log it, and update the drift accounting for __LEDGER.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	                   task_ledgers.__LEDGER,                       \
	                   &bal);                                       \
	ledger_get_panic_on_negative(ledger,                            \
	                             task_ledgers.__LEDGER,             \
	                             &panic_on_negative);               \
	if (bal != 0) {                                                 \
	        if (panic_on_negative ||                                \
	            (pmap_ledgers_panic &&                              \
	             pmap_ledgers_panic_leeway > 0 &&                   \
	             (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
	              bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE;                                \
	        }                                                       \
	        printf("LEDGER BALANCE proc %d (%s) "                   \
	               "\"%s\" = %lld\n",                               \
	               pid, procname, #__LEDGER, bal);                  \
	        if (bal > 0) {                                          \
	                pmap_ledgers_drift.__LEDGER##_over++;           \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                }                                               \
	        } else if (bal < 0) {                                   \
	                pmap_ledgers_drift.__LEDGER##_under++;          \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                }                                               \
	        }                                                       \
	}                                                               \
MACRO_END

	/* audit each tracked ledger; must mirror the LEDGER_DRIFT struct fields */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint_total);

	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
24131 
24132 void
vm_map_pmap_set_process(vm_map_t map,int pid,char * procname)24133 vm_map_pmap_set_process(
24134 	vm_map_t map,
24135 	int pid,
24136 	char *procname)
24137 {
24138 	pmap_set_process(vm_map_pmap(map), pid, procname);
24139 }
24140 
24141 #endif /* MACH_ASSERT */
24142